[med-svn] [gadgetron] 01/13: Imported Upstream version 2.5.0

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Feb 20 17:16:21 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository gadgetron.

commit 96f6025c890b05b733e43f8a1870ced4e4ea90f2
Author: Ghislain Vaillant <ghisvail at gmail.com>
Date:   Tue Apr 22 18:21:58 2014 +0100

    Imported Upstream version 2.5.0
---
 .gitignore                                         |    22 +
 CMakeLists.txt                                     |   204 +
 LICENSE                                            |    29 +
 README                                             |    19 +
 apps/CMakeLists.txt                                |     7 +
 apps/clients/CMakeLists.txt                        |     7 +
 apps/clients/mriclient/BlobFileWriter.h            |    91 +
 apps/clients/mriclient/CMakeLists.txt              |    52 +
 apps/clients/mriclient/HDF5ImageWriter.h           |    76 +
 apps/clients/mriclient/ImageWriter.h               |   113 +
 apps/clients/mriclient/gt_alive.cpp                |    61 +
 apps/clients/mriclient/isalive.xml                 |    55 +
 apps/clients/mriclient/main.cpp                    |   230 +
 apps/gadgetron/CMakeLists.txt                      |    60 +
 apps/gadgetron/EndGadget.h                         |    57 +
 apps/gadgetron/Gadget.h                            |   382 +
 apps/gadgetron/GadgetContainerMessage.h            |   118 +
 apps/gadgetron/GadgetMessageInterface.h            |   235 +
 apps/gadgetron/GadgetServerAcceptor.cpp            |    58 +
 apps/gadgetron/GadgetServerAcceptor.h              |    26 +
 apps/gadgetron/GadgetStreamController.cpp          |   459 +
 apps/gadgetron/GadgetStreamController.h            |    79 +
 apps/gadgetron/Gadgetron.h                         |    31 +
 apps/gadgetron/GadgetronExport.h                   |    38 +
 apps/gadgetron/gadgetron.xml.example               |     9 +
 apps/gadgetron/gadgetron_start.pl                  |    31 +
 apps/gadgetron/main.cpp                            |    94 +
 apps/gadgetron/schema/gadgetron.xsd                |    53 +
 .../templates/CMakeLists_GadgetLibraryExample.txt  |    55 +
 .../templates/gadgetronEXAMPLELIB_export.h         |    21 +
 apps/gadgetron/upstart/gadgetron.conf              |    36 +
 apps/gadgetron/webapp/gadgetron_web.conf           |    15 +
 apps/gadgetron/webapp/gadgetron_web_app.cfg        |     8 +
 apps/gadgetron/webapp/gadgetron_web_app.py         |   176 +
 apps/matlab/mexGT.h                                |   580 ++
 apps/standalone/CMakeLists.txt                     |     7 +
 apps/standalone/cpu/CMakeLists.txt                 |    35 +
 apps/standalone/cpu/denoising/2d/CMakeLists.txt    |    25 +
 apps/standalone/cpu/denoising/2d/denoise_TV.cpp    |   117 +
 apps/standalone/cpu/denoising/CMakeLists.txt       |     1 +
 apps/standalone/cpu/gtplus/CMakeLists.txt          |    56 +
 .../cpu/gtplus/Matlab_compute_coil_map_2D.cpp      |   136 +
 .../cpu/gtplus/Matlab_compute_coil_map_3D.cpp      |   137 +
 apps/standalone/cpu/registration/2d/CMakeLists.txt |    40 +
 .../cpu/registration/2d/Matlab_register_CK_2d.cpp  |   197 +
 .../cpu/registration/2d/register_CK_2d.cpp         |   121 +
 .../cpu/registration/2d/register_HS_2d.cpp         |   110 +
 apps/standalone/cpu/registration/3d/CMakeLists.txt |    11 +
 .../cpu/registration/3d/register_CK_3d.cpp         |   115 +
 apps/standalone/cpu/registration/CMakeLists.txt    |     9 +
 apps/standalone/gpu/CMakeLists.txt                 |    19 +
 apps/standalone/gpu/deblurring/2d/CMakeLists.txt   |    14 +
 apps/standalone/gpu/deblurring/2d/blur_2d.cpp      |   111 +
 apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp |   109 +
 apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp |   129 +
 apps/standalone/gpu/deblurring/3d/CMakeLists.txt   |    13 +
 apps/standalone/gpu/deblurring/3d/blur_3d.cpp      |   113 +
 apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp |   114 +
 apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp |   135 +
 apps/standalone/gpu/deblurring/CMakeLists.txt      |     2 +
 apps/standalone/gpu/denoising/2d/CMakeLists.txt    |     9 +
 apps/standalone/gpu/denoising/2d/denoise_TV.cpp    |   122 +
 apps/standalone/gpu/denoising/CMakeLists.txt       |     1 +
 apps/standalone/gpu/mri/CMakeLists.txt             |     2 +
 apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt     |    17 +
 apps/standalone/gpu/mri/nfft/2d/main_cg.cpp        |   138 +
 apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp      |   148 +
 apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp     |   145 +
 apps/standalone/gpu/mri/nfft/2d/main_sb.cpp        |   170 +
 apps/standalone/gpu/mri/nfft/CMakeLists.txt        |     2 +
 apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt   |     7 +
 apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp    |   148 +
 apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp   |   173 +
 apps/standalone/gpu/mri/sense/CMakeLists.txt       |     2 +
 .../gpu/mri/sense/noncartesian/CMakeLists.txt      |     5 +
 .../noncartesian/radial/2d_golden_ratio/.gitignore |     1 +
 .../radial/2d_golden_ratio/CMakeLists.txt          |    11 +
 .../radial/2d_golden_ratio/main_cg.cpp             |   289 +
 .../radial/2d_golden_ratio/main_gpbb.cpp           |   284 +
 .../radial/2d_golden_ratio/main_sbc.cpp            |   332 +
 .../radial/2d_golden_ratio_gui/CMakeLists.txt      |    31 +
 .../radial/2d_golden_ratio_gui/GLReconWidget.cpp   |   222 +
 .../radial/2d_golden_ratio_gui/GLReconWidget.h     |    56 +
 .../radial/2d_golden_ratio_gui/UIconstants.h       |    10 +
 .../radial/2d_golden_ratio_gui/main.cpp            |    19 +
 .../radialSenseAppBaseMainWidget.ui                |   572 ++
 .../radialSenseAppMainWidget.cpp                   |   690 ++
 .../2d_golden_ratio_gui/radialSenseAppMainWidget.h |   134 +
 .../radial/2d_golden_ratio_gui/reconBaseWidget.ui  |   303 +
 .../radial/2d_golden_ratio_gui/reconWidget.cpp     |     7 +
 .../radial/2d_golden_ratio_gui/reconWidget.h       |    13 +
 .../radial/2d_golden_ratio_kt/CMakeLists.txt       |     5 +
 .../radial/2d_golden_ratio_kt/main.cpp             |   314 +
 .../mri/sense/noncartesian/radial/CMakeLists.txt   |    10 +
 apps/standalone/gpu/registration/2d/CMakeLists.txt |    52 +
 .../gpu/registration/2d/register_CGHS_2d.cpp       |   134 +
 .../gpu/registration/2d/register_CK_2d.cpp         |   129 +
 .../gpu/registration/2d/register_HS_2d.cpp         |   122 +
 .../gpu/registration/2d/test_reg_sense_recon.cpp   |   568 +
 apps/standalone/gpu/registration/3d/CMakeLists.txt |    12 +
 .../gpu/registration/3d/register_CK_3d.cpp         |   124 +
 apps/standalone/gpu/registration/CMakeLists.txt    |     7 +
 cmake/CMakeLists.txt                               |    15 +
 cmake/FindACE.cmake                                |    90 +
 cmake/FindCULA.cmake                               |    63 +
 cmake/FindDCMTK.cmake                              |   175 +
 cmake/FindFFTW3.cmake                              |    93 +
 cmake/FindGLEW.cmake                               |    53 +
 cmake/FindGMatlab.cmake                            |   115 +
 cmake/FindGadgetron.cmake                          |    40 +
 cmake/FindIsmrmrd.cmake                            |    29 +
 cmake/FindMKL.cmake                                |    99 +
 cmake/FindNumPy.cmake                              |   102 +
 cmake/FindOctave.cmake                             |    84 +
 cmake/FindTinyXML.cmake                            |    26 +
 cmake/FindXSD.cmake                                |    68 +
 cmake/FindXalanC.cmake                             |    35 +
 cmake/FindXercesC.cmake                            |    37 +
 doc/.gitignore                                     |     7 +
 doc/CMakeLists.txt                                 |    30 +
 doc/doxygen/CMakeLists.txt                         |     8 +
 doc/doxygen/Doxyfile.in                            |  1757 ++++
 doc/manual/CMakeLists.txt                          |    36 +
 doc/manual/figs/Gadgetron.png                      |   Bin 0 -> 156640 bytes
 doc/manual/figs/Gadgetron.svg                      |  1736 ++++
 doc/manual/figs/architecture.png                   |   Bin 0 -> 185001 bytes
 doc/manual/figs/architecture.svg                   |   748 ++
 doc/manual/figs/arrayfileformat.png                |   Bin 0 -> 80429 bytes
 doc/manual/figs/arrayfileformat.svg                |   247 +
 doc/manual/figs/cgsense.png                        |   Bin 0 -> 106261 bytes
 doc/manual/figs/cgsense.svg                        |   671 ++
 doc/manual/figs/examplecgsenseresult.png           |   Bin 0 -> 131868 bytes
 doc/manual/figs/examplegrapparesult.png            |   Bin 0 -> 42909 bytes
 doc/manual/figs/examplelibresult.png               |   Bin 0 -> 18132 bytes
 doc/manual/figs/gadget.png                         |   Bin 0 -> 85585 bytes
 doc/manual/figs/gadget.svg                         |   573 ++
 doc/manual/figs/grappa.png                         |   Bin 0 -> 193964 bytes
 doc/manual/figs/grappa.svg                         |   594 ++
 doc/manual/figs/hdfview_image_view.png             |   Bin 0 -> 64541 bytes
 doc/manual/figs/hdfview_image_view_setting.png     |   Bin 0 -> 47890 bytes
 doc/manual/figs/hdfview_mri_testdata.png           |   Bin 0 -> 88694 bytes
 doc/manual/figs/math/HOWTO.txt                     |     5 +
 doc/manual/figs/math/lls.jpg                       |   Bin 0 -> 3155 bytes
 doc/manual/figs/math/lls.tex                       |    11 +
 doc/manual/figs/math/lls_form.jpg                  |   Bin 0 -> 12749 bytes
 doc/manual/figs/math/lls_form.tex                  |    11 +
 doc/manual/figs/math/sb.jpg                        |   Bin 0 -> 4591 bytes
 doc/manual/figs/math/sb.tex                        |    14 +
 doc/manual/figs/python.png                         |   Bin 0 -> 86776 bytes
 doc/manual/figs/python.svg                         |   635 ++
 doc/manual/figs/sense_cg.png                       |   Bin 0 -> 24654 bytes
 doc/manual/figs/sense_sbc.png                      |   Bin 0 -> 23004 bytes
 doc/manual/figs/shepp.png                          |   Bin 0 -> 1243 bytes
 doc/manual/figs/shepp_blurred.png                  |   Bin 0 -> 10894 bytes
 doc/manual/figs/shepp_deblurred_cg.png             |   Bin 0 -> 21952 bytes
 doc/manual/figs/shepp_deblurred_sb.png             |   Bin 0 -> 9121 bytes
 doc/manual/figs/shepp_denoised.png                 |   Bin 0 -> 5646 bytes
 doc/manual/figs/shepp_iteration.png                |   Bin 0 -> 50249 bytes
 doc/manual/figs/shepp_noisy.png                    |   Bin 0 -> 40779 bytes
 doc/manual/figs/simple2dft.png                     |   Bin 0 -> 56594 bytes
 doc/manual/figs/simple2dft.svg                     |   355 +
 doc/manual/gadgetron_manual.xml                    |  6472 ++++++++++++
 doc/website/Gadgetron.png                          |   Bin 0 -> 34116 bytes
 doc/website/index.html                             |   146 +
 .../GadgetronWindowsInstallation.ps1               |   Bin 0 -> 18112 bytes
 gadgets/.gitignore                                 |     1 +
 gadgets/CMakeLists.txt                             |    89 +
 gadgets/cartesian/CMakeLists.txt                   |    23 +
 gadgets/cartesian/CartesianToGenericGadget.cpp     |    97 +
 gadgets/cartesian/CartesianToGenericGadget.h       |    43 +
 gadgets/cartesian/gadgetron_cartesian_export.h     |    11 +
 gadgets/dicom/CMakeLists.txt                       |    43 +
 gadgets/dicom/DicomFinishGadget.cpp                |   824 ++
 gadgets/dicom/DicomFinishGadget.h                  |    66 +
 gadgets/dicom/DicomImageWriter.cpp                 |   105 +
 gadgets/dicom/DicomImageWriter.h                   |    22 +
 gadgets/dicom/dicom.xml                            |    59 +
 gadgets/dicom/gadgetron_dicom_export.h             |    15 +
 gadgets/grappa/CMakeLists.txt                      |    35 +
 gadgets/grappa/GrappaCalibrationBuffer.cpp         |   140 +
 gadgets/grappa/GrappaCalibrationBuffer.h           |   149 +
 gadgets/grappa/GrappaGadget.cpp                    |   362 +
 gadgets/grappa/GrappaGadget.h                      |    62 +
 gadgets/grappa/GrappaUnmixingGadget.cpp            |    67 +
 gadgets/grappa/GrappaUnmixingGadget.h              |    32 +
 gadgets/grappa/GrappaWeights.cpp                   |   112 +
 gadgets/grappa/GrappaWeights.h                     |    37 +
 gadgets/grappa/GrappaWeightsCalculator.cpp         |   258 +
 gadgets/grappa/GrappaWeightsCalculator.h           |    65 +
 gadgets/grappa/config/CMakeLists.txt               |     5 +
 gadgets/grappa/config/grappa.xml                   |   114 +
 gadgets/grappa/config/grappa_float.xml             |   118 +
 gadgets/grappa/config/grappa_unoptimized.xml       |    95 +
 gadgets/grappa/config/grappa_unoptimized_float.xml |    99 +
 gadgets/grappa/gadgetron_grappa_export.h           |    14 +
 gadgets/gtPlus/CMakeLists.txt                      |    99 +
 gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp  |    11 +
 gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h    |   246 +
 gadgets/gtPlus/GadgetMRIHeaders.cpp                |   262 +
 gadgets/gtPlus/GadgetMRIHeadersExt.cpp             |   428 +
 gadgets/gtPlus/GadgetMRIHeadersExt.h               |   231 +
 gadgets/gtPlus/GadgetronMrReconCommon.h            |    90 +
 gadgets/gtPlus/GtPlusAccumulatorGadget.cpp         |  1168 +++
 gadgets/gtPlus/GtPlusAccumulatorGadget.h           |   198 +
 gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.cpp    |   615 ++
 gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.h      |    42 +
 gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.cpp  |    55 +
 gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.h    |    32 +
 gadgets/gtPlus/GtPlusAccumulatorSLCGadget.cpp      |   403 +
 gadgets/gtPlus/GtPlusAccumulatorSLCGadget.h        |    40 +
 .../GtPlusAccumulatorWorkOrderTriggerGadget.cpp    |  2282 +++++
 .../GtPlusAccumulatorWorkOrderTriggerGadget.h      |   266 +
 gadgets/gtPlus/GtPlusGadgetExport.h                |    16 +
 gadgets/gtPlus/GtPlusGadgetImageArray.cpp          |   664 ++
 gadgets/gtPlus/GtPlusGadgetImageArray.h            |    72 +
 gadgets/gtPlus/GtPlusGadgetOpenMP.cpp              |   103 +
 gadgets/gtPlus/GtPlusGadgetOpenMP.h                |    27 +
 gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h        |   227 +
 gadgets/gtPlus/GtPlusRecon2DTGadget.cpp            |   427 +
 gadgets/gtPlus/GtPlusRecon2DTGadget.h              |    63 +
 gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp       |   506 +
 gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h         |    91 +
 gadgets/gtPlus/GtPlusRecon3DTGadget.cpp            |   436 +
 gadgets/gtPlus/GtPlusRecon3DTGadget.h              |   105 +
 gadgets/gtPlus/GtPlusReconGadget.cpp               |  1478 +++
 gadgets/gtPlus/GtPlusReconGadget.h                 |   271 +
 gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp         |   234 +
 gadgets/gtPlus/GtPlusReconJob2DTGadget.h           |   117 +
 gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp    |   790 ++
 gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h      |   195 +
 gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp         |   262 +
 gadgets/gtPlus/GtPlusReconJob3DTGadget.h           |   117 +
 gadgets/matlab/BaseGadget.m                        |    73 +
 gadgets/matlab/CMakeLists.txt                      |    48 +
 gadgets/matlab/MatlabCommandServer.java            |   129 +
 gadgets/matlab/MatlabGadget.cpp                    |   288 +
 gadgets/matlab/MatlabGadget.h                      |   196 +
 gadgets/matlab/accumulate_and_recon.m              |    88 +
 gadgets/matlab/gadgetron_matlab_export.h           |    23 +
 gadgets/matlab/mask_image.m                        |    27 +
 gadgets/matlab/matlab.xml                          |    72 +
 gadgets/matlab/scale.m                             |    22 +
 gadgets/moco/CMakeLists.txt                        |    76 +
 gadgets/moco/RegistrationAveragingGadget.h         |   328 +
 gadgets/moco/RegistrationScatteringGadget.h        |   375 +
 gadgets/moco/config/CMakeLists.txt                 |    13 +
 gadgets/moco/config/cpureg_cartesian_averaging.xml |   130 +
 gadgets/moco/config/gpureg_cartesian_averaging.xml |   130 +
 gadgets/moco/cpuRegistrationAveragingGadget.cpp    |    44 +
 gadgets/moco/cpuRegistrationAveragingGadget.h      |    28 +
 gadgets/moco/gadgetron_moco_export.h               |    14 +
 gadgets/moco/gpuRegistrationAveragingGadget.cpp    |    50 +
 gadgets/moco/gpuRegistrationAveragingGadget.h      |    27 +
 gadgets/moco/gpuRegistrationScatteringGadget.cpp   |    57 +
 gadgets/moco/gpuRegistrationScatteringGadget.h     |    28 +
 gadgets/mri_core/AccumulatorGadget.cpp             |   184 +
 gadgets/mri_core/AccumulatorGadget.h               |    36 +
 gadgets/mri_core/AcquisitionFinishGadget.cpp       |    27 +
 gadgets/mri_core/AcquisitionFinishGadget.h         |    26 +
 gadgets/mri_core/AcquisitionPassthroughGadget.cpp  |    23 +
 gadgets/mri_core/AcquisitionPassthroughGadget.h    |    24 +
 gadgets/mri_core/AutoScaleGadget.cpp               |    74 +
 gadgets/mri_core/AutoScaleGadget.h                 |    32 +
 gadgets/mri_core/CMakeLists.txt                    |   115 +
 gadgets/mri_core/CoilReductionGadget.cpp           |   123 +
 gadgets/mri_core/CoilReductionGadget.h             |    32 +
 gadgets/mri_core/CplxDumpGadget.cpp                |   139 +
 gadgets/mri_core/CplxDumpGadget.h                  |    33 +
 gadgets/mri_core/CropAndCombineGadget.cpp          |    70 +
 gadgets/mri_core/CropAndCombineGadget.h            |    25 +
 gadgets/mri_core/ExtractGadget.cpp                 |   117 +
 gadgets/mri_core/ExtractGadget.h                   |    62 +
 gadgets/mri_core/FFTGadget.cpp                     |    22 +
 gadgets/mri_core/FFTGadget.h                       |    24 +
 gadgets/mri_core/FloatToUShortGadget.cpp           |    83 +
 gadgets/mri_core/FloatToUShortGadget.h             |    35 +
 gadgets/mri_core/FlowPhaseSubtractionGadget.cpp    |   150 +
 gadgets/mri_core/FlowPhaseSubtractionGadget.h      |    38 +
 gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp        |     6 +
 gadgets/mri_core/GadgetIsmrmrdReadWrite.h          |   202 +
 gadgets/mri_core/GadgetMRIHeaders.h                |   128 +
 gadgets/mri_core/ImageFinishGadget.cpp             |    51 +
 gadgets/mri_core/ImageFinishGadget.h               |    45 +
 gadgets/mri_core/ImageWriterGadget.cpp             |    52 +
 gadgets/mri_core/ImageWriterGadget.h               |    50 +
 gadgets/mri_core/IsmrmrdDumpGadget.cpp             |   134 +
 gadgets/mri_core/IsmrmrdDumpGadget.h               |    36 +
 gadgets/mri_core/MRIImageWriter.cpp                |    89 +
 gadgets/mri_core/MRIImageWriter.h                  |    37 +
 gadgets/mri_core/MaxwellCorrectionGadget.cpp       |   144 +
 gadgets/mri_core/MaxwellCorrectionGadget.h         |    35 +
 gadgets/mri_core/NoiseAdjustGadget.cpp             |   130 +
 gadgets/mri_core/NoiseAdjustGadget.h               |    39 +
 gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp |   220 +
 gadgets/mri_core/NoiseAdjustGadget_unoptimized.h   |    35 +
 gadgets/mri_core/PCACoilGadget.cpp                 |   220 +
 gadgets/mri_core/PCACoilGadget.h                   |    44 +
 gadgets/mri_core/PartialFourierAdjustROGadget.cpp  |   145 +
 gadgets/mri_core/PartialFourierAdjustROGadget.h    |    32 +
 gadgets/mri_core/PhysioInterpolationGadget.cpp     |   259 +
 gadgets/mri_core/PhysioInterpolationGadget.h       |    44 +
 gadgets/mri_core/RemoveROOversamplingGadget.cpp    |    59 +
 gadgets/mri_core/RemoveROOversamplingGadget.h      |    22 +
 gadgets/mri_core/Spline.h                          |   129 +
 gadgets/mri_core/default.xml                       |    54 +
 gadgets/mri_core/default_optimized.xml             |   117 +
 gadgets/mri_core/default_short.xml                 |    70 +
 gadgets/mri_core/gadgetron_mricore_export.h        |    14 +
 gadgets/mri_core/ismrmrd_dump.xml                  |    36 +
 gadgets/octave/CMakeLists.txt                      |    43 +
 .../octave/GadgetronReturnIsmrmrdAcquisition.cpp   |   136 +
 gadgets/octave/GadgetronReturnIsmrmrdImage.cpp     |   108 +
 gadgets/octave/OctaveCommunicator.cpp              |    68 +
 gadgets/octave/OctaveCommunicator.h                |    40 +
 gadgets/octave/OctaveGadget.cpp                    |   232 +
 gadgets/octave/OctaveGadget.h                      |    95 +
 gadgets/octave/XMLGetXPath.cpp                     |    35 +
 gadgets/octave/gadgetron_octave_export.h           |    23 +
 .../octave/gadgetron_octavecommunicator_export.h   |    23 +
 gadgets/octave/octave.xml                          |    84 +
 gadgets/octave/octave/accumulator.m                |    29 +
 gadgets/octave/octave/configure_accumulator.m      |    18 +
 gadgets/octave/octave/configure_downsample_2x.m    |     3 +
 gadgets/octave/octave/downsample_2x.m              |    11 +
 .../octave/octave/gadget_reference_accumulator.m   |     7 +
 .../octave/octave/gadget_reference_downsample_2x.m |     7 +
 .../octave/ismrm_transform_image_to_kspace.m       |    34 +
 .../octave/ismrm_transform_kspace_to_image.m       |    35 +
 gadgets/octave/octave/my_config_function.m         |     3 +
 gadgets/octave/octave/my_gadget_reference.m        |     7 +
 gadgets/octave/octave/my_recon_function.m          |     8 +
 gadgets/octave/pugiconfig.hpp                      |    69 +
 gadgets/octave/pugixml.cpp                         | 10250 +++++++++++++++++++
 gadgets/octave/pugixml.hpp                         |  1265 +++
 gadgets/python/CMakeLists.txt                      |    61 +
 gadgets/python/GadgetReference.cpp                 |    96 +
 gadgets/python/GadgetReference.h                   |    33 +
 gadgets/python/GadgetronPythonMRI.cpp              |   442 +
 gadgets/python/GadgetronXML.py                     |    59 +
 gadgets/python/PythonCommunicator.cpp              |   212 +
 gadgets/python/PythonCommunicator.h                |    50 +
 gadgets/python/PythonGadget.cpp                    |     6 +
 gadgets/python/PythonGadget.h                      |    99 +
 gadgets/python/accumulate_and_recon.py             |    83 +
 gadgets/python/gadgetronpython_export.h            |    14 +
 gadgets/python/image_viewer.py                     |    79 +
 gadgets/python/kspaceandimage.py                   |    20 +
 gadgets/python/python.xml                          |    84 +
 gadgets/python/python_short.xml                    |    98 +
 gadgets/python/remove_2x_oversampling.py           |    31 +
 gadgets/python/rms_coil_combine.py                 |    19 +
 gadgets/radial/CMakeLists.txt                      |    39 +
 gadgets/radial/config/CMakeLists.txt               |    28 +
 .../config/fixed_radial_mode0_gpu_ktsense.xml      |   157 +
 .../config/fixed_radial_mode0_gpusense_cg.xml      |   153 +
 .../fixed_radial_mode0_gpusense_cg_unoptimized.xml |   140 +
 .../config/fixed_radial_mode0_gpusense_sb.xml      |   163 +
 .../fixed_radial_mode0_gpusense_sb_unoptimized.xml |   150 +
 .../radial/config/fixed_radial_mode0_realtime.xml  |   149 +
 .../config/fixed_radial_mode1_gpu_ktsense.xml      |   157 +
 .../config/fixed_radial_mode1_gpusense_cg.xml      |   153 +
 .../fixed_radial_mode1_gpusense_cg_unoptimized.xml |   140 +
 .../config/fixed_radial_mode1_gpusense_sb.xml      |   163 +
 .../fixed_radial_mode1_gpusense_sb_unoptimized.xml |   150 +
 .../radial/config/fixed_radial_mode1_realtime.xml  |   149 +
 .../config/golden_radial_mode2_gpu_ktsense.xml     |   159 +
 .../config/golden_radial_mode2_gpusense_cg.xml     |   155 +
 ...golden_radial_mode2_gpusense_cg_unoptimized.xml |   142 +
 .../config/golden_radial_mode2_gpusense_sb.xml     |   165 +
 ...golden_radial_mode2_gpusense_sb_unoptimized.xml |   152 +
 .../radial/config/golden_radial_mode2_realtime.xml |   150 +
 .../config/golden_radial_mode3_gpusense_sb.xml     |   165 +
 gadgets/radial/gadgetron_radial_export.h           |    14 +
 gadgets/radial/gpuRadialSensePrepGadget.cpp        |   998 ++
 gadgets/radial/gpuRadialSensePrepGadget.h          |   191 +
 gadgets/sense/CMakeLists.txt                       |    43 +
 gadgets/sense/SenseJob.h                           |    27 +
 gadgets/sense/config/CMakeLists.txt                |    15 +
 .../config/generic_gpu_ktsense_singleshot.xml      |   115 +
 gadgets/sense/config/generic_gpusense_cg.xml       |   113 +
 .../config/generic_gpusense_cg_singleshot.xml      |   115 +
 .../config/generic_gpusense_sb_singleshot.xml      |   119 +
 gadgets/sense/gadgetron_gpusense_export.h          |    14 +
 gadgets/sense/gpuCgKtSenseGadget.cpp               |   370 +
 gadgets/sense/gpuCgKtSenseGadget.h                 |    71 +
 gadgets/sense/gpuCgSenseGadget.cpp                 |   321 +
 gadgets/sense/gpuCgSenseGadget.h                   |    71 +
 gadgets/sense/gpuGenericSensePrepGadget.cpp        |   948 ++
 gadgets/sense/gpuGenericSensePrepGadget.h          |   127 +
 gadgets/sense/gpuSbSenseGadget.cpp                 |   426 +
 gadgets/sense/gpuSbSenseGadget.h                   |    85 +
 gadgets/spiral/CMakeLists.txt                      |    39 +
 gadgets/spiral/SpiralToGenericGadget.cpp           |   225 +
 gadgets/spiral/SpiralToGenericGadget.h             |    50 +
 gadgets/spiral/config/CMakeLists.txt               |    16 +
 .../config/spiral_flow_generic_gpusense_cg.xml     |   149 +
 .../config/spiral_flow_generic_gpusense_sb.xml     |   159 +
 gadgets/spiral/config/spiral_flow_gpusense_cg.xml  |   141 +
 .../spiral/config/spiral_flow_gpusense_cg_ecg.xml  |   149 +
 .../config/spiral_flow_gpusense_cg_unoptimized.xml |   123 +
 gadgets/spiral/config/spiral_flow_gpusense_sb.xml  |   150 +
 .../config/spiral_flow_gpusense_sb_unoptimized.xml |   129 +
 gadgets/spiral/config/spiral_interactive.xml       |   124 +
 gadgets/spiral/gadgetron_spiral_export.h           |    11 +
 gadgets/spiral/gpuSpiralSensePrepGadget.cpp        |   700 ++
 gadgets/spiral/gpuSpiralSensePrepGadget.h          |    92 +
 gadgets/spiral/vds.cpp                             |   495 +
 gadgets/spiral/vds.h                               |    15 +
 test/CMakeLists.txt                                |    62 +
 test/cuNDArray_Vector_td_test.cpp                  |    50 +
 test/cuNDArray_blas_test.cpp                       |   156 +
 test/cuNDArray_elemwise_test.cpp                   |   379 +
 test/cuNDArray_operators_test.cpp                  |   243 +
 test/cuNDArray_test.cpp                            |    83 +
 test/cuNDArray_utils_test.cpp                      |   241 +
 test/cuVector_td_test_kernels.cu                   |   237 +
 test/cuVector_td_test_kernels.h                    |    18 +
 test/hoCuGTBLAS_test.cpp                           |    80 +
 test/hoCuNDArray_elemwise_test.cpp                 |   144 +
 test/hoNDArray_blas_test.cpp                       |   144 +
 test/hoNDArray_elemwise_test.cpp                   |   379 +
 test/hoNDArray_operators_test.cpp                  |   250 +
 test/hoNDArray_utils_test.cpp                      |   173 +
 test/tests.cpp                                     |    13 +
 test/vector_td_test.cpp                            |   141 +
 toolboxes/CMakeLists.txt                           |    18 +
 toolboxes/core/CMakeLists.txt                      |    26 +
 toolboxes/core/GadgetronCommon.h                   |    67 +
 toolboxes/core/GadgetronException.h                |    38 +
 toolboxes/core/GadgetronTimer.h                    |   109 +
 toolboxes/core/NDArray.h                           |   657 ++
 toolboxes/core/SerializableObject.h                |    27 +
 toolboxes/core/complext.h                          |   310 +
 toolboxes/core/core_defines.h.in                   |    28 +
 toolboxes/core/cpu/CMakeLists.txt                  |    84 +
 toolboxes/core/cpu/arma_math/CMakeLists.txt        |    48 +
 toolboxes/core/cpu/arma_math/cpucore_math_export.h |    22 +
 toolboxes/core/cpu/arma_math/hoArmadillo.h         |    84 +
 toolboxes/core/cpu/arma_math/hoNDArray_blas.cpp    |   648 ++
 toolboxes/core/cpu/arma_math/hoNDArray_blas.h      |   181 +
 .../core/cpu/arma_math/hoNDArray_elemwise.cpp      |  4810 +++++++++
 toolboxes/core/cpu/arma_math/hoNDArray_elemwise.h  |   400 +
 toolboxes/core/cpu/arma_math/hoNDArray_math.h      |     6 +
 .../core/cpu/arma_math/hoNDArray_operators.cpp     |   457 +
 toolboxes/core/cpu/arma_math/hoNDArray_operators.h |   239 +
 .../core/cpu/arma_math/hoNDArray_reductions.cpp    |    41 +
 .../core/cpu/arma_math/hoNDArray_reductions.h      |    12 +
 toolboxes/core/cpu/cpucore_export.h                |    22 +
 toolboxes/core/cpu/ho2DArray.h                     |    54 +
 toolboxes/core/cpu/ho2DArray.hxx                   |   261 +
 toolboxes/core/cpu/ho3DArray.h                     |    54 +
 toolboxes/core/cpu/ho3DArray.hxx                   |   287 +
 toolboxes/core/cpu/ho4DArray.h                     |    54 +
 toolboxes/core/cpu/ho4DArray.hxx                   |   313 +
 toolboxes/core/cpu/ho5DArray.h                     |    54 +
 toolboxes/core/cpu/ho5DArray.hxx                   |   345 +
 toolboxes/core/cpu/ho6DArray.h                     |    54 +
 toolboxes/core/cpu/ho6DArray.hxx                   |   392 +
 toolboxes/core/cpu/ho7DArray.h                     |    54 +
 toolboxes/core/cpu/ho7DArray.hxx                   |   427 +
 toolboxes/core/cpu/hoMatrix.cpp                    |  1309 +++
 toolboxes/core/cpu/hoMatrix.h                      |    77 +
 toolboxes/core/cpu/hoMatrix.hxx                    |   791 ++
 toolboxes/core/cpu/hoNDArray.h                     |   193 +
 toolboxes/core/cpu/hoNDArray.hxx                   |   980 ++
 toolboxes/core/cpu/hoNDArray_fileio.h              |    65 +
 toolboxes/core/cpu/hoNDArray_utils.h               |   485 +
 toolboxes/core/cpu/hoNDFFT.cpp                     |  1713 ++++
 toolboxes/core/cpu/hoNDFFT.h                       |   222 +
 toolboxes/core/cpu/hostutils/CMakeLists.txt        |    18 +
 toolboxes/core/cpu/hostutils/FileInfo.h            |    54 +
 toolboxes/core/cpu/hostutils/hostutils_export.h    |    22 +
 toolboxes/core/cpu/hostutils/parameterparser.cpp   |   330 +
 toolboxes/core/cpu/hostutils/parameterparser.h     |    81 +
 toolboxes/core/cpu/hostutils/url_encode.h          |    47 +
 toolboxes/core/gpu/CMakeLists.txt                  |    86 +
 toolboxes/core/gpu/CUBLASContextProvider.cpp       |   113 +
 toolboxes/core/gpu/CUBLASContextProvider.h         |    35 +
 toolboxes/core/gpu/GPUTimer.h                      |    75 +
 toolboxes/core/gpu/GadgetronCuException.h          |    15 +
 toolboxes/core/gpu/check_CUDA.h                    |    38 +
 toolboxes/core/gpu/cuNDArray.h                     |   701 ++
 toolboxes/core/gpu/cuNDArray_blas.cu               |   311 +
 toolboxes/core/gpu/cuNDArray_blas.h                |    49 +
 toolboxes/core/gpu/cuNDArray_elemwise.cu           |   656 ++
 toolboxes/core/gpu/cuNDArray_elemwise.h            |   241 +
 toolboxes/core/gpu/cuNDArray_kernels.cu            |   179 +
 toolboxes/core/gpu/cuNDArray_math.h                |     6 +
 toolboxes/core/gpu/cuNDArray_operators.cu          |   238 +
 toolboxes/core/gpu/cuNDArray_operators.h           |   167 +
 toolboxes/core/gpu/cuNDArray_reductions.cu         |   102 +
 toolboxes/core/gpu/cuNDArray_reductions.h          |    15 +
 toolboxes/core/gpu/cuNDArray_utils.cu              |   936 ++
 toolboxes/core/gpu/cuNDArray_utils.h               |    60 +
 toolboxes/core/gpu/cuNDFFT.cpp                     |   156 +
 toolboxes/core/gpu/cuNDFFT.h                       |    49 +
 toolboxes/core/gpu/cudaDeviceManager.cpp           |   223 +
 toolboxes/core/gpu/cudaDeviceManager.h             |    80 +
 toolboxes/core/gpu/gpucore_export.h                |    18 +
 toolboxes/core/gpu/hoCuNDArray.h                   |   121 +
 toolboxes/core/gpu/hoCuNDArray_blas.cu             |   260 +
 toolboxes/core/gpu/hoCuNDArray_blas.h              |    32 +
 toolboxes/core/gpu/hoCuNDArray_elemwise.h          |     8 +
 toolboxes/core/gpu/hoCuNDArray_math.h              |     6 +
 toolboxes/core/gpu/hoCuNDArray_operators.h         |     9 +
 toolboxes/core/gpu/hoCuNDArray_utils.h             |    18 +
 toolboxes/core/gpu/radial_utilities.cu             |   427 +
 toolboxes/core/gpu/radial_utilities.h              |    37 +
 toolboxes/core/gpu/real_utilities_device.h         |    22 +
 toolboxes/core/gpu/setup_grid.h                    |    36 +
 toolboxes/core/real_utilities.h                    |    72 +
 toolboxes/core/vector_td.h                         |   293 +
 toolboxes/core/vector_td_io.h                      |    49 +
 toolboxes/core/vector_td_operators.h               |   435 +
 toolboxes/core/vector_td_utilities.h               |   482 +
 toolboxes/gadgettools/CMakeLists.txt               |    58 +
 toolboxes/gadgettools/GadgetCloudController.h      |   654 ++
 toolboxes/gadgettools/GadgetImageMessageReader.h   |    71 +
 toolboxes/gadgettools/GadgetImageMessageWriter.h   |    84 +
 toolboxes/gadgettools/GadgetServerAcceptor.cpp     |    58 +
 toolboxes/gadgettools/GadgetServerAcceptor.h       |    27 +
 toolboxes/gadgettools/GadgetStreamController.cpp   |   459 +
 toolboxes/gadgettools/GadgetStreamController.h     |   559 +
 toolboxes/gadgettools/GadgetronCloudConnector.h    |   580 ++
 toolboxes/gadgettools/GadgetronConnector.cpp       |   306 +
 toolboxes/gadgettools/GadgetronConnector.h         |   172 +
 toolboxes/gadgettools/GadgetronSlotContainer.h     |    60 +
 toolboxes/gadgettools/demo.xml                     |    43 +
 toolboxes/gadgettools/gadgettools_export.h         |    20 +
 toolboxes/gadgettools/schema/gadgetron.xsd         |    53 +
 toolboxes/gadgettools/test_gadget_xml.cpp          |    32 +
 toolboxes/gtplus/CMakeLists.txt                    |   224 +
 toolboxes/gtplus/GtPlusExport.h                    |    20 +
 toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h   |    78 +
 .../gtplus/algorithm/gtPlusDataFidelityOperator.h  |   160 +
 toolboxes/gtplus/algorithm/gtPlusGRAPPA.h          |  1035 ++
 toolboxes/gtplus/algorithm/gtPlusOperator.h        |   238 +
 toolboxes/gtplus/algorithm/gtPlusSPIRIT.h          |  1048 ++
 .../gtplus/algorithm/gtPlusSPIRIT2DOperator.h      |   206 +
 .../gtplus/algorithm/gtPlusSPIRIT2DTOperator.h     |   329 +
 .../gtplus/algorithm/gtPlusSPIRIT3DOperator.h      |    98 +
 .../algorithm/gtPlusSPIRITNoNullSpace2DOperator.h  |    68 +
 .../algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h |   289 +
 .../algorithm/gtPlusSPIRITNoNullSpace3DOperator.h  |    64 +
 .../algorithm/gtPlusSPIRITNoNullSpaceOperator.h    |   130 +
 toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h  |   422 +
 .../gtplus/algorithm/gtPlusWavelet2DOperator.h     |   370 +
 .../gtplus/algorithm/gtPlusWavelet3DOperator.h     |  1170 +++
 .../algorithm/gtPlusWaveletNoNullSpace2DOperator.h |   118 +
 .../algorithm/gtPlusWaveletNoNullSpace3DOperator.h |   119 +
 toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h |   616 ++
 .../GadgetronProgram_gtPlus_2DT_Cartesian.xml      |   798 ++
 ...etronProgram_gtPlus_2DT_Cartesian_CloudNode.xml |    67 +
 ...us_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml |   808 ++
 ...Plus_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml |   808 ++
 ...m_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml |   269 +
 ...getronProgram_gtPlus_2DT_Cartesian_L1SPIRIT.xml |   799 ++
 ...adgetronProgram_gtPlus_2DT_Cartesian_SPIRIT.xml |   799 ++
 .../GadgetronProgram_gtPlus_2DT_FatWater.xml       |   654 ++
 .../config/GadgetronProgram_gtPlus_2DT_LGE.xml     |   654 ++
 .../config/GadgetronProgram_gtPlus_2DT_MOLLI.xml   |   654 ++
 .../GadgetronProgram_gtPlus_2DT_Perfusion.xml      |   655 ++
 .../GadgetronProgram_gtPlus_2DT_RealTimeCine.xml   |   741 ++
 .../GadgetronProgram_gtPlus_2DT_RealTimeFlow.xml   |   689 ++
 .../config/GadgetronProgram_gtPlus_2DT_T2W.xml     |   654 ++
 .../GadgetronProgram_gtPlus_3DT_Cartesian.xml      |   787 ++
 ...etronProgram_gtPlus_3DT_Cartesian_CloudNode.xml |    72 +
 ...getronProgram_gtPlus_3DT_Cartesian_L1SPIRIT.xml |   816 ++
 ...adgetronProgram_gtPlus_3DT_Cartesian_SPIRIT.xml |   795 ++
 toolboxes/gtplus/config/gtCloud/myCloud_2DT.txt    |     8 +
 .../config/gtCloud/myCloud_2DT_DualLayer.txt       |     8 +
 .../gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt   |     8 +
 toolboxes/gtplus/config/gtCloud/myCloud_3DT.txt    |    12 +
 toolboxes/gtplus/matlab/CMakeLists.txt             |    10 +
 toolboxes/gtplus/matlab/FtkMatlabConverterBase.h   |   569 +
 toolboxes/gtplus/matlab/gtMatlab.h                 |    37 +
 toolboxes/gtplus/matlab/gtMatlabConverter.h        |   235 +
 toolboxes/gtplus/matlab/gtMatlabConverterComplex.h |   155 +
 toolboxes/gtplus/solver/gtPlusLSQRSolver.h         |   294 +
 toolboxes/gtplus/solver/gtPlusLinearSolver.h       |    93 +
 toolboxes/gtplus/solver/gtPlusNCGSolver.h          |   380 +
 toolboxes/gtplus/solver/gtPlusNonLinearSolver.h    |   122 +
 toolboxes/gtplus/solver/gtPlusSolver.h             |   150 +
 toolboxes/gtplus/ut/CMakeLists.txt                 |    74 +
 toolboxes/gtplus/ut/grappa_test.cpp                |   613 ++
 toolboxes/gtplus/ut/gtplus_ut.cpp                  |    16 +
 toolboxes/gtplus/ut/spirit_test.cpp                |   425 +
 toolboxes/gtplus/ut/util_test.cpp                  |  1195 +++
 toolboxes/gtplus/util/gtPlusIOAnalyze.cpp          |   252 +
 toolboxes/gtplus/util/gtPlusIOAnalyze.h            |   652 ++
 toolboxes/gtplus/util/gtPlusIOBase.cpp             |   200 +
 toolboxes/gtplus/util/gtPlusIOBase.h               |   129 +
 toolboxes/gtplus/util/gtPlusMemoryManager.cpp      |   388 +
 toolboxes/gtplus/util/gtPlusMemoryManager.h        |   139 +
 toolboxes/gtplus/util/hoNDArrayMemoryManaged.h     |   144 +
 toolboxes/gtplus/util/hoNDArrayMemoryManaged.hxx   |   377 +
 toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp |   157 +
 toolboxes/gtplus/workflow/gtPlusCloudScheduler.h   |    54 +
 .../gtplus/workflow/gtPlusISMRMRDReconUtil.cpp     |    18 +
 toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h |   534 +
 .../gtplus/workflow/gtPlusISMRMRDReconUtil.hxx     |  5645 ++++++++++
 .../gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h   |   525 +
 .../workflow/gtPlusISMRMRDReconWorkFlowCartesian.h |  1082 ++
 .../gtPlusISMRMRDReconWorkFlowCartesian2DT.h       |   277 +
 .../gtPlusISMRMRDReconWorkFlowCartesian3DT.h       |   247 +
 .../gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h  |   871 ++
 .../workflow/gtPlusISMRMRDReconWorkOrder2DT.h      |   382 +
 .../workflow/gtPlusISMRMRDReconWorkOrder3DT.h      |   355 +
 .../gtplus/workflow/gtPlusISMRMRDReconWorker.h     |   614 ++
 .../gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h  |  2532 +++++
 .../workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h   |   316 +
 .../gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h       |   348 +
 .../gtPlusISMRMRDReconWorker2DTNoAcceleration.h    |   146 +
 .../workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h   |   685 ++
 .../gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h  |  2749 +++++
 .../workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h   |   621 ++
 .../gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h       |   778 ++
 .../gtPlusISMRMRDReconWorker3DTNoAcceleration.h    |   157 +
 .../workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h   |  1028 ++
 toolboxes/mri/CMakeLists.txt                       |     1 +
 toolboxes/mri/pmri/CMakeLists.txt                  |     3 +
 toolboxes/mri/pmri/gpu/CMakeLists.txt              |    73 +
 toolboxes/mri/pmri/gpu/b1_map.cu                   |   733 ++
 toolboxes/mri/pmri/gpu/b1_map.h                    |    32 +
 toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu       |   647 ++
 toolboxes/mri/pmri/gpu/b1map_test.cu               |    48 +
 toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu |   133 +
 toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h  |    39 +
 .../mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu  |    41 +
 .../mri/pmri/gpu/cuNonCartesianKtSenseOperator.h   |    30 +
 .../mri/pmri/gpu/cuNonCartesianSenseOperator.cu    |   116 +
 .../mri/pmri/gpu/cuNonCartesianSenseOperator.h     |    50 +
 toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp           |   224 +
 toolboxes/mri/pmri/gpu/cuSenseBuffer.h             |    61 +
 toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp         |    89 +
 toolboxes/mri/pmri/gpu/cuSenseBufferCg.h           |    39 +
 toolboxes/mri/pmri/gpu/cuSenseOperator.cu          |    32 +
 toolboxes/mri/pmri/gpu/cuSenseOperator.h           |    31 +
 toolboxes/mri/pmri/gpu/gpupmri_export.h            |    19 +
 toolboxes/mri/pmri/gpu/htgrappa.cu                 |   827 ++
 toolboxes/mri/pmri/gpu/htgrappa.h                  |    29 +
 toolboxes/mri/pmri/gpu/htgrappa_test.cpp           |    64 +
 toolboxes/mri/pmri/gpu/senseOperator.h             |    48 +
 toolboxes/mri/pmri/gpu/sense_utilities.cu          |   146 +
 toolboxes/mri/pmri/gpu/sense_utilities.h           |    25 +
 toolboxes/nfft/CMakeLists.txt                      |     3 +
 toolboxes/nfft/gpu/CMakeLists.txt                  |    43 +
 toolboxes/nfft/gpu/KaiserBessel_kernel.cu          |   127 +
 toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu        |   249 +
 toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu |   227 +
 toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu        |   140 +
 toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu       |   171 +
 toolboxes/nfft/gpu/cuNFFT.cu                       |  1457 +++
 toolboxes/nfft/gpu/cuNFFT.h                        |   284 +
 toolboxes/nfft/gpu/cuNFFTOperator.cu               |   113 +
 toolboxes/nfft/gpu/cuNFFTOperator.h                |    39 +
 toolboxes/nfft/gpu/gpunfft_export.h                |    19 +
 toolboxes/operators/CMakeLists.txt                 |    31 +
 toolboxes/operators/FFTOperator.h                  |    70 +
 toolboxes/operators/convolutionOperator.h          |   220 +
 toolboxes/operators/cpu/CMakeLists.txt             |    22 +
 toolboxes/operators/cpu/hoFFTOperator.h            |    29 +
 toolboxes/operators/cpu/hoIdentityOperator.h       |    28 +
 toolboxes/operators/cpu/hoImageOperator.h          |    58 +
 .../operators/cpu/hoPartialDerivativeOperator.h    |   110 +
 toolboxes/operators/cpu/hoTvOperator.h             |   117 +
 toolboxes/operators/cpu/hoTvPicsOperator.h         |    16 +
 toolboxes/operators/diagonalOperator.h             |    78 +
 toolboxes/operators/downsampleOperator.h           |    51 +
 toolboxes/operators/encodedImageOperator.h         |    48 +
 toolboxes/operators/encodingOperatorContainer.h    |   235 +
 toolboxes/operators/generalOperator.h              |    89 +
 toolboxes/operators/gpu/CMakeLists.txt             |    51 +
 toolboxes/operators/gpu/cuConvolutionOperator.cu   |    89 +
 toolboxes/operators/gpu/cuConvolutionOperator.h    |    33 +
 toolboxes/operators/gpu/cuDiagonalOperator.h       |    20 +
 toolboxes/operators/gpu/cuDownsampleOperator.h     |    28 +
 toolboxes/operators/gpu/cuFFTOperator.h            |    29 +
 toolboxes/operators/gpu/cuIdentityOperator.h       |    28 +
 toolboxes/operators/gpu/cuImageOperator.h          |    69 +
 toolboxes/operators/gpu/cuLaplaceOperator.cu       |    95 +
 toolboxes/operators/gpu/cuLaplaceOperator.h        |    28 +
 .../gpu/cuMultiplicationOperatorContainer.h        |    23 +
 .../operators/gpu/cuPartialDerivativeOperator.cu   |   145 +
 .../operators/gpu/cuPartialDerivativeOperator.h    |    38 +
 toolboxes/operators/gpu/cuTv1dOperator.cu          |   129 +
 toolboxes/operators/gpu/cuTv1dOperator.h           |    38 +
 toolboxes/operators/gpu/cuTvOperator.cu            |   132 +
 toolboxes/operators/gpu/cuTvOperator.h             |    42 +
 toolboxes/operators/gpu/cuTvPicsOperator.h         |    16 +
 toolboxes/operators/gpu/cuUpsampleOperator.h       |    28 +
 toolboxes/operators/gpu/gpuoperators_export.h      |    18 +
 .../operators/gpu/hoCuEncodingOperatorContainer.h  |    22 +
 toolboxes/operators/gpu/hoCuOperator.h             |    55 +
 toolboxes/operators/gpu/hoCuTvOperator.h           |    84 +
 toolboxes/operators/gpu/hoCuTvPicsOperator.h       |    16 +
 toolboxes/operators/identityOperator.h             |    55 +
 toolboxes/operators/imageOperator.h                |    99 +
 toolboxes/operators/laplaceOperator.h              |    31 +
 toolboxes/operators/linearOperator.h               |   108 +
 .../operators/multiplicationOperatorContainer.h    |   200 +
 toolboxes/operators/partialDerivativeOperator.h    |    71 +
 toolboxes/operators/tvPicsOperator.h               |    44 +
 toolboxes/operators/upsampleOperator.h             |    51 +
 toolboxes/registration/CMakeLists.txt              |     1 +
 toolboxes/registration/optical_flow/CMakeLists.txt |    29 +
 .../registration/optical_flow/cpu/CMakeLists.txt   |    35 +
 .../registration/optical_flow/cpu/cpureg_export.h  |    18 +
 .../optical_flow/cpu/hoCKOpticalFlowSolver.cpp     |   297 +
 .../optical_flow/cpu/hoCKOpticalFlowSolver.h       |    55 +
 .../optical_flow/cpu/hoHSOpticalFlowSolver.cpp     |   286 +
 .../optical_flow/cpu/hoHSOpticalFlowSolver.h       |    52 +
 .../optical_flow/cpu/hoLinearResampleOperator.cpp  |   203 +
 .../optical_flow/cpu/hoLinearResampleOperator.h    |    35 +
 .../cpu/hoLinearResampleOperator_eigen.cpp         |   206 +
 .../cpu/hoLinearResampleOperator_eigen.h           |    40 +
 .../optical_flow/cpu/hoOpticalFlowSolver.cpp       |   183 +
 .../optical_flow/cpu/hoOpticalFlowSolver.h         |    48 +
 .../optical_flow/cpu/hoRegistration_utils.cpp      |   233 +
 .../optical_flow/cpu/hoRegistration_utils.h        |    13 +
 .../registration/optical_flow/gpu/CMakeLists.txt   |    36 +
 .../registration/optical_flow/gpu/cuCGHSOFSolver.h |    67 +
 .../optical_flow/gpu/cuCKOpticalFlowSolver.cu      |   340 +
 .../optical_flow/gpu/cuCKOpticalFlowSolver.h       |    55 +
 .../optical_flow/gpu/cuHSOpticalFlowSolver.cu      |   326 +
 .../optical_flow/gpu/cuHSOpticalFlowSolver.h       |    52 +
 .../optical_flow/gpu/cuLinearResampleOperator.cu   |   265 +
 .../optical_flow/gpu/cuLinearResampleOperator.h    |    26 +
 .../optical_flow/gpu/cuOpticalFlowSolver.cu        |   303 +
 .../optical_flow/gpu/cuOpticalFlowSolver.h         |    50 +
 .../optical_flow/gpu/cuResampleOperator.cu         |   107 +
 .../optical_flow/gpu/cuResampleOperator.h          |    42 +
 .../optical_flow/gpu/cuResampleOperator_macros.h   |   248 +
 .../registration/optical_flow/gpu/gpureg_export.h  |    14 +
 .../optical_flow/multiresRegistrationSolver.h      |   263 +
 .../optical_flow/opticalFlowOperator.h             |    72 +
 .../registration/optical_flow/opticalFlowSolver.h  |   176 +
 .../registration/optical_flow/registrationSolver.h |   103 +
 .../registration/optical_flow/resampleOperator.h   |    42 +
 toolboxes/solvers/CMakeLists.txt                   |    27 +
 toolboxes/solvers/cgCallback.h                     |   198 +
 toolboxes/solvers/cgPreconditioner.h               |    50 +
 toolboxes/solvers/cgSolver.h                       |   412 +
 toolboxes/solvers/cpu/CMakeLists.txt               |    15 +
 toolboxes/solvers/cpu/hoCgPreconditioner.h         |    14 +
 toolboxes/solvers/cpu/hoCgSolver.h                 |    30 +
 toolboxes/solvers/cpu/hoGpBbSolver.h               |    38 +
 toolboxes/solvers/cpu/hoSbCgSolver.h               |    16 +
 toolboxes/solvers/gpBbSolver.h                     |   199 +
 toolboxes/solvers/gpSolver.h                       |   318 +
 toolboxes/solvers/gpu/CMakeLists.txt               |    49 +
 toolboxes/solvers/gpu/cuCgPreconditioner.h         |    14 +
 toolboxes/solvers/gpu/cuCgSolver.h                 |    30 +
 toolboxes/solvers/gpu/cuGpBbSolver.cu              |    40 +
 toolboxes/solvers/gpu/cuGpBbSolver.h               |    26 +
 toolboxes/solvers/gpu/cuLwSolver.h                 |    42 +
 toolboxes/solvers/gpu/cuSbCgSolver.h               |    16 +
 toolboxes/solvers/gpu/cuSbLwSolver.h               |    27 +
 toolboxes/solvers/gpu/cuSbcCgSolver.h              |    14 +
 toolboxes/solvers/gpu/cuSbcLwSolver.h              |    29 +
 toolboxes/solvers/gpu/gpusolvers_export.h          |    18 +
 toolboxes/solvers/gpu/hoCuGpBbSolver.h             |    38 +
 toolboxes/solvers/linearOperatorSolver.h           |    75 +
 toolboxes/solvers/lwSolver.h                       |   214 +
 toolboxes/solvers/sbSolver.h                       |   838 ++
 toolboxes/solvers/sbcSolver.h                      |    96 +
 toolboxes/solvers/solver.h                         |    51 +
 767 files changed, 164406 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fd1fe60
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,22 @@
+*~
+*.so
+*.dll
+*.o
+*.cuo
+*#
+*.dylib
+*.real
+*.cplx
+inc/
+*.d
+!CMakeLists.txt
+bin/
+lib/*.py
+lib/*.pyc
+.DS_Store
+*.swp
+build/
+*.pyc
+toolboxes/core/core_defines.h
+prod/
+external/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f43f7a2
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,204 @@
+cmake_minimum_required(VERSION 2.8)
+project(GADGETRON)
+
+# build options for 64 bits system
+if( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+  message(" 64bit system is found  ... ")
+  set( HAS_64_BIT On CACHE BOOL "64bit build")
+else( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+  message(" 32bit system is found  ... ")
+  set( HAS_64_BIT Off CACHE BOOL "64bit build")
+endif( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+
+# build options for OpenMP support
+find_package(OpenMP)
+OPTION(USE_OPENMP "Use OpenMP" On)
+if (OPENMP_FOUND)
+    if(USE_OPENMP) 
+        message("OpenMP multithreading enabled")
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        ADD_DEFINITIONS(-DUSE_OMP)
+    else (USE_OPENMP)
+        message("OpenMP multithreading is supported, but disabled")
+    endif(USE_OPENMP) 
+else (OPENMP_FOUND)
+  message("OpenMP multithreading not supported")
+endif (OPENMP_FOUND)
+
+if (WIN32)
+    ADD_DEFINITIONS(-DWIN32 -D_WIN32 -D_WINDOWS)
+    ADD_DEFINITIONS(-DUNICODE -D_UNICODE)
+    ADD_DEFINITIONS(-D_CRT_SECURE_NO_WARNINGS)
+    if ( HAS_64_BIT )
+        ADD_DEFINITIONS(-DWIN64 -D_WIN64)
+    endif ( HAS_64_BIT )
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP")
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3")
+else (WIN32)
+    if (UNIX)
+        if (APPLE)
+        else (APPLE)
+            SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+            SET (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libgfortran")
+        endif (APPLE)
+  endif (UNIX)
+endif (WIN32)
+
+set(CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}/gadgetron)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+
+# uncomment these if all compiled targets are to be stored in the same directory
+#SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/bin CACHE STRING "Where to put the executables")
+#SET(LIBRARY_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/bin CACHE STRING "Where to put the libraries")
+
+# whether to suppress compilation warnings (/W0 on Windows; -O3 -w on every other platform)
+OPTION(BUILD_SUPPRESS_WARNINGS "Build package while suppressing warnings" Off)
+if (BUILD_SUPPRESS_WARNINGS)
+    if (WIN32)
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W0")
+    else (WIN32)
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -w")
+    endif (WIN32)
+endif (BUILD_SUPPRESS_WARNINGS)
+
+# whether to compile toolboxes as static library
+OPTION(BUILD_TOOLBOX_STATIC "Build static library for toolboxes" Off)
+
+if ( BUILD_TOOLBOX_STATIC )
+    message("Build static toolbox libray ... ")
+    ADD_DEFINITIONS(-DBUILD_TOOLBOX_STATIC)
+    set(LIBTYPE STATIC)
+    if ( CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX )
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
+    endif ( CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX )
+else ( BUILD_TOOLBOX_STATIC )
+    message("Build dynamic toolbox libray ... ")
+    set(LIBTYPE SHARED)
+endif ( BUILD_TOOLBOX_STATIC )
+
+set(Boost_USE_MULTITHREADED ON)
+set(Boost_USE_STATIC_RUNTIME OFF)
+set(Boost_NO_BOOST_CMAKE ON)
+
+# We actually only use system and thread explicitly, but they require linking in date_time and chrono...
+if (WIN32)
+  find_package(Boost COMPONENTS system thread date_time chrono REQUIRED)
+else(WIN32)
+  find_package(Boost COMPONENTS system thread REQUIRED)
+endif(WIN32)
+
+find_package(FFTW3 COMPONENTS single double REQUIRED)
+
+find_package(ACE)
+if(ACE_FOUND)
+  MESSAGE("ACE found, the streaming framework will be compiled.")
+else(ACE_FOUND)
+  MESSAGE("ACE not found. Only toolboxes and standalone applications are compiled. The streaming framework will not be compiled.")
+endif(ACE_FOUND)
+
+find_package(CUDA 4.1)
+if ( CUDA_FOUND )
+    ADD_DEFINITIONS(-DUSE_CUDA)
+endif ( CUDA_FOUND )
+
+find_package(GTest)
+# Add support for the default Ubuntu package of gtest (which is not compiled)
+if (NOT GTEST_FOUND)
+  find_path(GTEST_SRC_DIR src/gtest.cc HINTS /usr/src/gtest)
+  find_path(GTEST_INCLUDE_DIRS gtest.h HINTS /usr/include/gtest)
+  if (GTEST_SRC_DIR AND GTEST_INCLUDE_DIRS)
+    MESSAGE("GTest src package found. Compiling as part of Gadgetron.")
+    add_subdirectory(${GTEST_SRC_DIR} ${CMAKE_BINARY_DIR}/gtest )
+    include_directories(${GTEST_INCLUDE_DIRS})
+    set(GTEST_FOUND 1)
+    set(GTEST_LIBRARIES gtest gtest_main)
+  endif (GTEST_SRC_DIR AND GTEST_INCLUDE_DIRS)
+endif (NOT GTEST_FOUND)
+
+find_package(Armadillo)
+# check whether the ILP64 MKL interface should be used
+if(ARMADILLO_FOUND)
+    set(ARMADILLO_BLAS_LONG_LONG FALSE)
+    if(EXISTS "${ARMADILLO_INCLUDE_DIR}/armadillo_bits/config.hpp")
+        # Read and parse armadillo config.hpp to find out whether BLAS uses long long
+        file(STRINGS "${ARMADILLO_INCLUDE_DIR}/armadillo_bits/config.hpp" _armadillo_blas_long_long REGEX "// #define ARMA_BLAS_LONG_LON")
+        if ( NOT _armadillo_blas_long_long )
+            set(ARMADILLO_BLAS_LONG_LONG TRUE)
+            MESSAGE("Armadillo is found to use long long for BLAS calls ... ")
+        else ( NOT _armadillo_blas_long_long )
+            MESSAGE("Armadillo is found NOT to use long long for BLAS calls ... ")
+            MESSAGE("Note the ARMADILLO_BLAS_LONG_LONG must be defined in the ${ARMADILLO_INCLUDE_DIR}/armadillo_bits/config.hpp to link against MKL ILP64 interface ... ")
+        endif ( NOT _armadillo_blas_long_long )
+        unset(_armadillo_blas_long_long)
+    endif()
+endif ()
+
+find_package(MKL)
+
+if (CUDA_FOUND)
+  MESSAGE("CUDA found, GPU components will be compiled.")
+  SET( GADGETRON_CUDA_FOUND_BOOL 1 )
+  include_directories( ${CUDA_INCLUDE_DIRS} )
+  #set(CUDA_VERBOSE_BUILD ON)
+
+  # Compile kernels for compute models 1.0 and 2.0 as default for Cuda 4.1
+  # Support compute model 3.0 from Cuda 4.2 and up
+  # Support compute model 3.5 from Cuda 5 and up
+
+  set(CUDA_NVCC_FLAGS1 "-gencode arch=compute_10,code=sm_10")
+  set(CUDA_NVCC_FLAGS2 "-gencode arch=compute_20,code=sm_20")
+  set(CUDA_NVCC_FLAGS3 "-gencode arch=compute_30,code=sm_30") 
+  set(CUDA_NVCC_FLAGS4 "-gencode arch=compute_35,code=sm_35")   
+
+  if(${CUDA_VERSION} VERSION_GREATER "4.99")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS1} ${CUDA_NVCC_FLAGS2} ${CUDA_NVCC_FLAGS3} ${CUDA_NVCC_FLAGS4})
+  else(${CUDA_VERSION} VERSION_GREATER "4.99")    
+
+    if(${CUDA_VERSION} VERSION_GREATER "4.1")
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS1} ${CUDA_NVCC_FLAGS2} ${CUDA_NVCC_FLAGS3})
+    else(${CUDA_VERSION} VERSION_GREATER "4.1")      
+
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS1} ${CUDA_NVCC_FLAGS2})          
+
+    endif(${CUDA_VERSION} VERSION_GREATER "4.1")
+  endif(${CUDA_VERSION} VERSION_GREATER "4.99")
+
+else (CUDA_FOUND)
+  MESSAGE("CUDA not found. CUDA components will not be compiled.")
+  SET( GADGETRON_CUDA_FOUND_BOOL 0 )
+endif (CUDA_FOUND)
+
+find_package(Qt4 4.6)
+find_package(PythonLibs)
+find_package(NumPy)
+find_package(GLEW)
+find_package(OpenGL)
+find_package(GLUT)
+find_package(HDF5 1.8 COMPONENTS C CXX)
+
+find_package(Ismrmrd)
+if(ISMRMRD_FOUND)
+  message("ISMRMRD found")
+  find_package(XSD REQUIRED)
+  find_package(XercesC REQUIRED)
+else(ISMRMRD_FOUND)
+  message("ISMRMRD not found. Only compiling toolboxes and standalone applications.")
+endif(ISMRMRD_FOUND)
+
+find_package(GMatlab)
+
+include_directories( ${CMAKE_SOURCE_DIR} )
+
+add_subdirectory(toolboxes)
+add_subdirectory(apps)
+if (ACE_FOUND AND ISMRMRD_FOUND)
+  add_subdirectory(gadgets)
+endif (ACE_FOUND AND ISMRMRD_FOUND)
+
+add_subdirectory(cmake)
+add_subdirectory(doc)
+
+if (GTEST_FOUND AND ARMADILLO_FOUND)
+  add_subdirectory(test)
+endif (GTEST_FOUND AND ARMADILLO_FOUND)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f704983
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+GADGETRON SOFTWARE LICENSE V1.0, NOVEMBER 2011
+
+PERMISSION IS HEREBY GRANTED, FREE OF CHARGE, TO ANY PERSON OBTAINING
+A COPY OF THIS SOFTWARE AND ASSOCIATED DOCUMENTATION FILES (THE
+"SOFTWARE"), TO DEAL IN THE SOFTWARE WITHOUT RESTRICTION, INCLUDING
+WITHOUT LIMITATION THE RIGHTS TO USE, COPY, MODIFY, MERGE, PUBLISH,
+DISTRIBUTE, SUBLICENSE, AND/OR SELL COPIES OF THE SOFTWARE, AND TO
+PERMIT PERSONS TO WHOM THE SOFTWARE IS FURNISHED TO DO SO, SUBJECT TO
+THE FOLLOWING CONDITIONS:
+
+THE ABOVE COPYRIGHT NOTICE, THIS PERMISSION NOTICE, AND THE LIMITATION
+OF LIABILITY BELOW SHALL BE INCLUDED IN ALL COPIES OR REDISTRIBUTIONS
+OF SUBSTANTIAL PORTIONS OF THE SOFTWARE.
+
+SOFTWARE IS BEING DEVELOPED IN PART AT THE NATIONAL HEART, LUNG, AND BLOOD
+INSTITUTE, NATIONAL INSTITUTES OF HEALTH BY AN EMPLOYEE OF THE FEDERAL
+GOVERNMENT IN THE COURSE OF HIS OFFICIAL DUTIES. PURSUANT TO TITLE 17, 
+SECTION 105 OF THE UNITED STATES CODE, THIS SOFTWARE IS NOT SUBJECT TO 
+COPYRIGHT PROTECTION AND IS IN THE PUBLIC DOMAIN. EXCEPT AS CONTAINED IN
+THIS NOTICE, THE NAME OF THE AUTHORS, THE NATIONAL HEART, LUNG, AND BLOOD
+INSTITUTE (NHLBI), OR THE NATIONAL INSTITUTES OF HEALTH (NIH) MAY NOT 
+BE USED TO ENDORSE OR PROMOTE PRODUCTS DERIVED FROM THIS SOFTWARE WITHOUT 
+SPECIFIC PRIOR WRITTEN PERMISSION FROM THE NHLBI OR THE NIH.THE SOFTWARE IS 
+PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..860f423
--- /dev/null
+++ b/README
@@ -0,0 +1,19 @@
+GADGETRON IMAGE RECONSTRUCTION FRAMEWORK
+
+Please read LICENSE file for licensing details.
+
+Detailed installation instructions and a manual are available at:
+
+http://gadgetron.sourceforge.net
+
+-------------------------------------
+General Building Instructions (on Unix platforms)
+
+mkdir build
+cd build
+cmake ../
+make
+sudo make install
+
+Please check manual for detailed instructions for your platform.
+
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
new file mode 100644
index 0000000..ca6900c
--- /dev/null
+++ b/apps/CMakeLists.txt
@@ -0,0 +1,7 @@
+IF (ACE_FOUND AND XSD_FOUND)
+  add_subdirectory(gadgetron)
+  IF (ISMRMRD_FOUND)
+    add_subdirectory(clients)
+  ENDIF (ISMRMRD_FOUND)	 
+ENDIF (ACE_FOUND AND XSD_FOUND)
+add_subdirectory(standalone)
diff --git a/apps/clients/CMakeLists.txt b/apps/clients/CMakeLists.txt
new file mode 100644
index 0000000..00bf884
--- /dev/null
+++ b/apps/clients/CMakeLists.txt
@@ -0,0 +1,7 @@
+if(WIN32)
+  add_definitions(-DTIXML_USE_STL)
+endif(WIN32)
+ 
+if(ISMRMRD_FOUND AND HDF5_FOUND)
+  add_subdirectory(mriclient)
+endif(ISMRMRD_FOUND AND HDF5_FOUND)
diff --git a/apps/clients/mriclient/BlobFileWriter.h b/apps/clients/mriclient/BlobFileWriter.h
new file mode 100644
index 0000000..5c79a3f
--- /dev/null
+++ b/apps/clients/mriclient/BlobFileWriter.h
@@ -0,0 +1,91 @@
+#ifndef BLOB_FILE_WRITER_H
+#define BLOB_FILE_WRITER_H
+
+#include <fstream>
+#include <iomanip>
+
+#include "GadgetMessageInterface.h"
+
+namespace Gadgetron {
+
+#define MAX_BLOBS_LOG_10    6
+
+class BlobFileWriter : public GadgetMessageReader
+{
+    // Message reader that stores every blob received from the socket in its own numbered file.
+    public:
+        BlobFileWriter(std::string fileprefix, std::string filesuffix)
+            : number_of_calls_(0)
+            , file_prefix(fileprefix)
+            , file_suffix(filesuffix)
+        {
+        }
+
+        virtual ~BlobFileWriter() {};
+
+        // Reads a 32-bit byte count followed by that many bytes and passes them to process_image().
+        // Returns a new, empty ACE_Message_Block on success and 0 on any failure.
+        virtual ACE_Message_Block* read(ACE_SOCK_Stream* socket)
+        {
+            ssize_t recv_count = 0;
+
+            // MUST READ 32-bits (the value is used as received; no byte-order conversion is applied here)
+            uint32_t nbytes;
+            if ((recv_count = socket->recv_n(&nbytes, sizeof(nbytes))) <= 0) {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, BlobFileWriter, failed to read Blob Header\n")) );
+                return 0;
+            }
+
+            char *data = new char[nbytes];
+            if ((recv_count = socket->recv_n(data, nbytes)) <= 0) {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, BlobFileWriter, failed to read blob from socket\n")) );
+                delete[] data; // release the buffer on the error path as well
+                return 0;
+            }
+
+            const int status = this->process_image(nbytes, data);
+            delete[] data; // freed before the status check so a failed write does not leak it
+            if (status < 0) {
+                GADGET_DEBUG1("Failed to process image\n");
+                return 0;
+            }
+
+            // The GadgetronConnector expects an ACE_Message_Block* (NOT NULL)
+            return new ACE_Message_Block();
+        }
+
+        // Writes 'bytes' bytes from 'data' to "<prefix>_<call counter padded to MAX_BLOBS_LOG_10 digits>.<suffix>".
+        // Returns GADGET_OK on success and GADGET_FAIL when the output file is not good for writing.
+        virtual int process_image(const unsigned int bytes, const char* data)
+        {
+            std::stringstream filename;
+            // Create the filename: (prefix_%06.suffix)
+            filename << file_prefix << "_";
+            filename << std::setfill('0') << std::setw(MAX_BLOBS_LOG_10) << number_of_calls_;
+            filename << "." << file_suffix;
+
+            std::ofstream outfile(filename.str().c_str(), std::ios::out|std::ios::binary);
+
+            ACE_DEBUG( (LM_DEBUG, ACE_TEXT("Writing image %s\n"), filename.str().c_str()) );
+
+            if (!outfile.good()) {
+                GADGET_DEBUG1("File is not good for writing\n");
+                return GADGET_FAIL;
+            }
+
+            /* write 'bytes' bytes starting at 'data's pointer */
+            outfile.write(data, bytes);
+            outfile.close();
+            number_of_calls_++; // the counter only advances when the file could be opened
+            return GADGET_OK;
+        }
+
+    protected:
+        size_t number_of_calls_;
+        std::string file_prefix;
+        std::string file_suffix;
+};
+
+} // namespace Gadgetron
+
+#endif //BLOB_FILE_WRITER_H
diff --git a/apps/clients/mriclient/CMakeLists.txt b/apps/clients/mriclient/CMakeLists.txt
new file mode 100644
index 0000000..0be2e20
--- /dev/null
+++ b/apps/clients/mriclient/CMakeLists.txt
@@ -0,0 +1,52 @@
+find_package(Ismrmrd REQUIRED)
+find_package(HDF5 1.8 COMPONENTS C CXX REQUIRED)
+
+set(Boost_NO_BOOST_CMAKE ON)
+
+if(WIN32)
+  find_package(Boost COMPONENTS thread system date_time chrono REQUIRED)
+else(WIN32)
+  find_package(Boost COMPONENTS thread system REQUIRED)
+endif(WIN32)
+
+if(WIN32)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(      
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${ISMRMRD_SCHEMA_DIR}
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+  ${ACE_INCLUDE_DIR} 
+  ${Boost_INCLUDE_DIR} 
+  ${HDF5_CXX_INCLUDE_DIR} 
+  ${HDF5_C_INCLUDE_DIR} 
+  ${ISMRMRD_INCLUDE_DIR}
+  ${XSD_INCLUDE_DIR}
+  ${XERCESC_INCLUDE_DIR}
+  )
+
+add_executable(mriclient main.cpp)
+add_executable(gt_alive gt_alive.cpp)
+
+target_link_libraries(mriclient cpucore ${MKL_LIBRARIES})
+target_link_libraries(gt_alive cpucore gadgettools optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}  ${Boost_LIBRARIES} ${ISMRMRD_LIBRARIES} ${MKL_LIBRARIES})
+
+IF(WIN32)
+    target_link_libraries(mriclient optimized ${HDF5_hdf5_LIBRARY_RELEASE} ${HDF5_hdf5_cpp_LIBRARY_RELEASE})
+    target_link_libraries(mriclient debug ${HDF5_hdf5_LIBRARY_DEBUG} ${HDF5_hdf5_cpp_LIBRARY_DEBUG})
+
+    target_link_libraries(mriclient gadgetron_mricore gadgettools optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} ${ISMRMRD_LIBRARIES} ${Boost_LIBRARIES})
+ELSE (WIN32)
+    target_link_libraries(mriclient gadgettools  ${HDF5_LIBRARIES} optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} ${ISMRMRD_LIBRARIES} ${Boost_LIBRARIES})
+ENDIF(WIN32)
+
+install(TARGETS mriclient gt_alive DESTINATION bin)
+install(FILES ImageWriter.h HDF5ImageWriter.h BlobFileWriter.h DESTINATION include)
+install(FILES ${ISMRMRD_LIBRARIES} DESTINATION lib)
+install(FILES isalive.xml DESTINATION config)
diff --git a/apps/clients/mriclient/HDF5ImageWriter.h b/apps/clients/mriclient/HDF5ImageWriter.h
new file mode 100644
index 0000000..f3327de
--- /dev/null
+++ b/apps/clients/mriclient/HDF5ImageWriter.h
@@ -0,0 +1,76 @@
+/*
+ * HDF5ImageWriter.h
+ *
+ *  Created on: Jan 25, 2012
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef HDF5IMAGEWRITER_H_
+#define HDF5IMAGEWRITER_H_
+
+#include "ImageWriter.h"
+
+#include <ismrmrd_hdf5.h>
+#include <sstream>
+
+namespace Gadgetron{
+// Image reader that appends each received image to an ISMRMRD HDF5 dataset.
+template <typename T> class HDF5ImageWriter : public ImageWriter<T>
+{
+public:
+	// Constructs the dataset member from the output file name and group name.
+	HDF5ImageWriter(std::string filename, std::string groupname)
+	: ImageWriter<T>()
+	, group_name_(groupname) // initializers follow the member declaration order
+	, file_name_(filename)
+	, dataset_(filename.c_str(), groupname.c_str())
+	{
+	}
+
+	// Appends the header as "image_<series index>.head" and the pixel data as
+	// "image_<series index>.img". Returns GADGET_OK, or GADGET_FAIL when an
+	// append reports an error or anything throws.
+	virtual int process_image(ISMRMRD::ImageHeader* img_head,
+			hoNDArray< T >* data)
+	{
+		try {
+			ISMRMRD::HDF5Exclusive lock; //This will ensure threadsafe access to HDF5
+			std::stringstream st1;
+			st1 << "image_" << img_head->image_series_index << ".head";
+			std::string head_varname = st1.str();
+			std::stringstream st2;
+			st2 << "image_" << img_head->image_series_index << ".img";
+			std::string img_varname = st2.str();
+
+			if (dataset_.appendImageHeader(*img_head, head_varname.c_str()) < 0) {
+				GADGET_DEBUG1("Failed to write image header\n");
+				return GADGET_FAIL;
+			}
+
+			// The dataset is given unsigned int extents, so narrow each size_t explicitly.
+			std::vector<size_t> dim = *data->get_dimensions();
+			std::vector<unsigned int> dim2(dim.size());
+			for (size_t ii = 0; ii < dim.size(); ii++) {
+				dim2[ii] = static_cast<unsigned int>(dim[ii]);
+			}
+
+			if (dataset_.appendArray(dim2,data->get_data_ptr(), img_varname.c_str())  < 0) {
+				GADGET_DEBUG1("Failed to write image data\n");
+				return GADGET_FAIL;
+			}
+		} catch (...) {
+			GADGET_DEBUG1("Error attempting to append images to HDF5 file\n");
+			return GADGET_FAIL;
+		}
+
+		return GADGET_OK;
+	}
+
+protected:
+	std::string group_name_;
+	std::string file_name_;
+	ISMRMRD::IsmrmrdDataset dataset_;
+};
+
+}
+#endif /* HDF5IMAGEWRITER_H_ */
diff --git a/apps/clients/mriclient/ImageWriter.h b/apps/clients/mriclient/ImageWriter.h
new file mode 100644
index 0000000..8341e0f
--- /dev/null
+++ b/apps/clients/mriclient/ImageWriter.h
@@ -0,0 +1,113 @@
+#ifndef IMAGEWRITER_H
+#define IMAGEWRITER_H
+
+#include <fstream>
+
+#include "GadgetImageMessageReader.h"
+
+namespace Gadgetron
+{
+
+// Image message reader that additionally stores every image it receives. The
+// default process_image() writes one raw binary file per image.
+template <typename T> class ImageWriter : public GadgetImageMessageReader<T>
+{
+public:
+	ImageWriter()
+	: number_of_calls_(0)
+	{}
+
+	virtual ~ImageWriter() {};
+
+	// Reads one image via the parent reader and passes it to process_image().
+	// Returns the message, or 0 (message released) if any step fails.
+	virtual ACE_Message_Block* read(ACE_SOCK_Stream* socket) 
+	{
+		// Invoke parent's read
+		ACE_Message_Block* mb = GadgetImageMessageReader<T>::read(socket);
+
+		if (!mb) {
+			GADGET_DEBUG1("Read failed in parent\n");
+			return 0;
+		}
+
+		GadgetContainerMessage<ISMRMRD::ImageHeader> * img_head_mb =
+				dynamic_cast<GadgetContainerMessage<ISMRMRD::ImageHeader> *>(mb);
+
+		if (!img_head_mb) {
+			GADGET_DEBUG1("Failed in dynamic cast\n");
+			mb->release();
+			return 0;
+		}
+
+		// The image data travels as the continuation of the header message.
+		GadgetContainerMessage<hoNDArray< T > > * img_data_mb =
+				dynamic_cast<GadgetContainerMessage<hoNDArray< T > > *>(img_head_mb->cont());
+
+		if (!img_data_mb) {
+			GADGET_DEBUG1("Failed in dynamic cast\n");
+			mb->release();
+			return 0;
+		}
+
+		if (this->process_image(img_head_mb->getObjectPtr(), img_data_mb->getObjectPtr()) < 0) {
+			GADGET_DEBUG1("Failed to process image\n");
+			mb->release();
+			return 0;
+		}
+
+		return mb;
+	}
+
+	// Writes "out_<call number>.<ext>": int ndim (4), four int dimensions, then
+	// the elements. Returns GADGET_FAIL if the file cannot be opened or written.
+	virtual int process_image(ISMRMRD::ImageHeader* img_head,
+			hoNDArray< T >* data)
+	{
+		ACE_DEBUG( (LM_DEBUG, ACE_TEXT("Image Writer writing image\n")) );
+
+		// Extension by element size: 4 is real float, 2 is unsigned short and
+		// everything else (8 included) is written as complex float.
+		const char* extension = "cplx";
+		if (sizeof(T) == 4) {
+			extension = "real";
+		} else if (sizeof(T) == 2) {
+			extension = "short";
+		}
+		char filename[1024];
+		sprintf(filename, "out_%05d.%s", (int)number_of_calls_, extension);
+
+		std::ofstream outfile;
+		outfile.open (filename, std::ios::out|std::ios::binary);
+		if (!outfile.good()) {
+			GADGET_DEBUG1("File is not good for writing\n");
+			return GADGET_FAIL;
+		}
+
+		int ndim = 4;
+		int dims[4];
+		size_t elements = 1;
+		dims[0] = img_head->matrix_size[0]; elements*=dims[0];
+		dims[1] = img_head->matrix_size[1]; elements*=dims[1];
+		dims[2] = img_head->matrix_size[2]; elements*=dims[2];
+		dims[3] = img_head->channels; elements*=dims[3];
+		outfile.write((char*)&ndim,sizeof(int));
+		outfile.write((char*)dims,sizeof(int)*4);
+		outfile.write((char*)data->get_data_ptr(),sizeof(T)*elements);
+		outfile.close();
+		number_of_calls_++;
+		// A failed write or close leaves the stream in a failed state.
+		if (outfile.fail()) {
+			GADGET_DEBUG1("Failed to write image file\n");
+			return GADGET_FAIL;
+		}
+
+		return GADGET_OK;
+	}
+
+protected:
+	size_t number_of_calls_;
+};
+
+}
+#endif //IMAGEWRITER_H
diff --git a/apps/clients/mriclient/gt_alive.cpp b/apps/clients/mriclient/gt_alive.cpp
new file mode 100644
index 0000000..145980f
--- /dev/null
+++ b/apps/clients/mriclient/gt_alive.cpp
@@ -0,0 +1,61 @@
+#include "GadgetronConnector.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+#include "ImageWriter.h"
+#include "HDF5ImageWriter.h"
+#include "FileInfo.h"
+#include "ismrmrd_hdf5.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include <ace/Log_Msg.h>
+#include <ace/Get_Opt.h>
+#include <ace/OS_NS_string.h>
+
+#include <fstream>
+#include <time.h>
+#include <iomanip>
+
+using namespace Gadgetron;
+
+// Liveness probe: connects to a Gadgetron server (host = argv[1], port = argv[2],
+// defaulting to localhost:9002), requests the "isalive.xml" configuration and
+// immediately sends a CLOSE message. Returns 0 on success and -1 on any failure.
+int ACE_TMAIN(int argc, ACE_TCHAR *argv[] )
+{
+	GadgetronConnector con;
+
+	std::string host("localhost");
+	std::string port("9002");
+
+	if (argc > 1) {
+		host = std::string(argv[1]);
+	}
+	if (argc > 2) {
+		port = std::string(argv[2]);
+	}
+
+	if (con.open(host,port) != 0) {
+		ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to connect to the Gadgetron host")));
+		return -1;
+	}
+
+	//Tell Gadgetron which XML configuration to run.
+	if (con.send_gadgetron_configuration_file(std::string("isalive.xml")) != 0) {
+		ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send XML configuration to the Gadgetron host")));
+		return -1;
+	}
+
+	GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+			new GadgetContainerMessage<GadgetMessageIdentifier>();
+	m1->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+	if (con.putq(m1) == -1) {
+		m1->release(); //putq failed, so the message was not handed over; free it here
+		ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to put CLOSE package on queue")));
+		return -1;
+	}
+
+	con.wait();
+	return 0;
+}
diff --git a/apps/clients/mriclient/isalive.xml b/apps/clients/mriclient/isalive.xml
new file mode 100644
index 0000000..27c33fc
--- /dev/null
+++ b/apps/clients/mriclient/isalive.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+	<!--        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetroncore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetroncore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetroncore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetroncore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetroncore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetroncore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+	-->
+</gadgetronStreamConfiguration>
diff --git a/apps/clients/mriclient/main.cpp b/apps/clients/mriclient/main.cpp
new file mode 100644
index 0000000..3d3b0c8
--- /dev/null
+++ b/apps/clients/mriclient/main.cpp
@@ -0,0 +1,230 @@
+#include "ace/Log_Msg.h"
+#include "ace/Get_Opt.h"
+#include "ace/OS_NS_string.h"
+
+#include "GadgetronConnector.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+#include "ImageWriter.h"
+#include "HDF5ImageWriter.h"
+#include "FileInfo.h"
+#include "ismrmrd_hdf5.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "BlobFileWriter.h"
+
+#include <fstream>
+#include <time.h>
+#include <iomanip>
+
+using namespace Gadgetron;
+// Logs the command-line synopsis and the default value of every option.
+void print_usage()
+{
+	ACE_DEBUG((LM_INFO, ACE_TEXT("Usage: \n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("mriclient -p <PORT>                      (default 9002)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -h <HOST>                      (default localhost)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -d <HDF5 DATA FILE>            (default ./data.h5)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -g <HDF5 DATA GROUP>           (default /dataset)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -c <GADGETRON CONFIG>          (default default.xml)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -l <LOOPS>                     (default 1)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -o <HDF5 OUT FILE>             (default ./out.h5)\n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("          -G <HDF5 OUT GROUP>            (default date and time)\n") ));
+}
+
+// Formats the current local time as "YYYY-MM-DD HH:MM:SS".
+std::string get_date_time_string()
+{
+	time_t now;
+	time ( &now );
+	struct tm * local = localtime ( &now );
+
+	std::stringstream formatted;
+	// The fill character persists once set; the width has to be given again
+	// before every two-digit field.
+	formatted << std::setfill('0');
+
+	// Date part (the year is written without padding).
+	formatted << local->tm_year+1900;
+	formatted << "-" << std::setw(2) << local->tm_mon+1;
+	formatted << "-" << std::setw(2) << local->tm_mday;
+
+	// Time part.
+	formatted << " " << std::setw(2) << local->tm_hour;
+	formatted << ":" << std::setw(2) << local->tm_min;
+	formatted << ":" << std::setw(2) << local->tm_sec;
+
+	return formatted.str();
+}
+
+// mriclient entry point: parses the command line, then for each requested loop
+// connects to Gadgetron, sends the configuration name, the XML header and every
+// acquisition of the input dataset, and finally a CLOSE message. Returns 0 on
+// success and -1 on any error.
+int ACE_TMAIN(int argc, ACE_TCHAR *argv[] )
+{
+	static const ACE_TCHAR options[] = ACE_TEXT(":p:h:d:x:c:l:o:g:G:");
+
+	ACE_Get_Opt cmd_opts(argc, argv, options);
+
+	// strncpy does not terminate the destination when the source fills it, so
+	// every buffer is zero-filled and copies stop one element short of its end.
+	ACE_TCHAR port_no[1024] = {0};
+	ACE_OS_String::strncpy(port_no, "9002", 1023);
+
+	ACE_TCHAR hostname[1024] = {0};
+	ACE_OS_String::strncpy(hostname, "localhost", 1023);
+
+	ACE_TCHAR hdf5_in_data_file[4096] = {0};
+	ACE_OS_String::strncpy(hdf5_in_data_file, "./data.h5", 4095);
+
+	ACE_TCHAR hdf5_in_group[4096] = {0};
+	ACE_OS_String::strncpy(hdf5_in_group, "/dataset", 4095);
+
+	ACE_TCHAR config_file[1024] = {0};
+	ACE_OS_String::strncpy(config_file, "default.xml", 1023);
+
+	ACE_TCHAR hdf5_out_file[1024] = {0};
+	ACE_OS_String::strncpy(hdf5_out_file, "./out.h5", 1023);
+
+	// The output group defaults to the current date and time.
+	ACE_TCHAR hdf5_out_group[1024] = {0};
+	std::string date_time = get_date_time_string();
+	ACE_OS_String::strncpy(hdf5_out_group, date_time.c_str(), 1023);
+
+	int repetition_loops = 1;
+
+	int option;
+	while ((option = cmd_opts()) != EOF) {
+		switch (option) {
+		case 'p':
+			ACE_OS_String::strncpy(port_no, cmd_opts.opt_arg(), 1023);
+			break;
+		case 'h':
+			ACE_OS_String::strncpy(hostname, cmd_opts.opt_arg(), 1023);
+			break;
+		case 'd':
+			ACE_OS_String::strncpy(hdf5_in_data_file, cmd_opts.opt_arg(), 4095);
+			break;
+		case 'g':
+			ACE_OS_String::strncpy(hdf5_in_group, cmd_opts.opt_arg(), 4095);
+			break;
+		case 'c':
+			ACE_OS_String::strncpy(config_file, cmd_opts.opt_arg(), 1023);
+			break;
+		case 'l':
+			repetition_loops = ACE_OS::atoi(cmd_opts.opt_arg());
+			break;
+		case 'o':
+			ACE_OS_String::strncpy(hdf5_out_file, cmd_opts.opt_arg(), 1023);
+			break;
+		case 'G':
+			ACE_OS_String::strncpy(hdf5_out_group, cmd_opts.opt_arg(), 1023);
+			break;
+		case ':':
+			print_usage();
+			ACE_ERROR_RETURN((LM_ERROR, ACE_TEXT("-%c requires an argument.\n"), cmd_opts.opt_opt()),-1);
+			break;
+		default:
+			print_usage();
+			ACE_ERROR_RETURN( (LM_ERROR, ACE_TEXT("Command line parse error\n")), -1);
+			break;
+		}
+	}
+
+	ACE_DEBUG(( LM_INFO, ACE_TEXT("Gadgetron MRI Data Sender\n") ));
+
+	//Let's check if the input file exists:
+	if (!FileInfo(std::string(hdf5_in_data_file)).exists()) {
+		ACE_DEBUG((LM_INFO, ACE_TEXT("Data file %s does not exist.\n"), hdf5_in_data_file));
+		print_usage();
+		return -1;
+	}
+
+	boost::shared_ptr<ISMRMRD::IsmrmrdDataset> ismrmrd_dataset(new ISMRMRD::IsmrmrdDataset(hdf5_in_data_file,hdf5_in_group));
+	boost::shared_ptr<std::string> xml_config = ismrmrd_dataset->readHeader();
+
+	if (repetition_loops < 1) {
+		ACE_DEBUG((LM_INFO, ACE_TEXT("Invalid number of repetition loops (%d).\n"), repetition_loops));
+		print_usage();
+		return -1;
+	}
+
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- host            :      %s\n"), hostname));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- port            :      %s\n"), port_no));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- hdf5 file  in   :      %s\n"), hdf5_in_data_file));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- hdf5 group in   :      %s\n"), hdf5_in_group));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- conf            :      %s\n"), config_file));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- loop            :      %d\n"), repetition_loops));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- hdf5 file out   :      %s\n"), hdf5_out_file));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("  -- hdf5 group out  :      %s\n"), hdf5_out_group));
+
+	for (int i = 0; i < repetition_loops; i++) {
+
+		GadgetronConnector con;
+
+		//con.register_writer(GADGET_MESSAGE_ACQUISITION, new GadgetAcquisitionMessageWriter());
+		con.register_writer(GADGET_MESSAGE_ISMRMRD_ACQUISITION, new GadgetIsmrmrdAcquisitionMessageWriter());
+		con.register_reader(GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT, new HDF5ImageWriter<ACE_UINT16>(std::string(hdf5_out_file), std::string(hdf5_out_group)));
+		con.register_reader(GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT, new HDF5ImageWriter<float>(std::string(hdf5_out_file), std::string(hdf5_out_group)));
+		con.register_reader(GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT, new HDF5ImageWriter< std::complex<float> >(std::string(hdf5_out_file), std::string(hdf5_out_group)));
+
+		con.register_reader(GADGET_MESSAGE_DICOM, new BlobFileWriter(std::string(hdf5_out_file), std::string("sdcopen")));
+
+		//Open a connection with the gadgetron
+		if (con.open(std::string(hostname),std::string(port_no)) != 0) {
+			ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to connect to the Gadgetron host")));
+			return -1;
+		}
+
+		//Tell Gadgetron which XML configuration to run.
+		if (con.send_gadgetron_configuration_file(std::string(config_file)) != 0) {
+			ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send XML configuration to the Gadgetron host")));
+			return -1;
+		}
+
+		if (con.send_gadgetron_parameters(*xml_config) != 0) {
+			ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send XML parameters to the Gadgetron host")));
+			return -1;
+		}
+
+		//Queue every acquisition of the dataset behind a message identifier
+		unsigned long acquisitions = ismrmrd_dataset->getNumberOfAcquisitions();
+
+		for (unsigned long int acq_no = 0; acq_no < acquisitions; acq_no++) {
+			GadgetContainerMessage<ISMRMRD::Acquisition>* acq = new GadgetContainerMessage<ISMRMRD::Acquisition>();
+			{
+				ISMRMRD::HDF5Exclusive lock; //This will ensure thread-safe access to HDF5
+				boost::shared_ptr<ISMRMRD::Acquisition> acq_tmp = ismrmrd_dataset->readAcquisition(acq_no);
+				*(acq->getObjectPtr()) = *acq_tmp; //We are copying the data into the container message
+			}
+
+			GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+					new GadgetContainerMessage<GadgetMessageIdentifier>();
+			m1->getObjectPtr()->id = GADGET_MESSAGE_ISMRMRD_ACQUISITION;
+			m1->cont(acq);
+
+			if (con.putq(m1) == -1) {
+				m1->release(); //putq failed: free the identifier and the acquisition chained to it
+				ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to put data package on queue")));
+				return -1;
+			}
+		}
+
+		//Tell the Gadgetron that no more data will follow
+		GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+				new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+		m1->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+		if (con.putq(m1) == -1) {
+			m1->release(); //putq failed, so the message was not handed over
+			ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to put CLOSE package on queue")));
+			return -1;
+		}
+
+		con.wait();
+	}
+
+	return 0;
+}
diff --git a/apps/gadgetron/CMakeLists.txt b/apps/gadgetron/CMakeLists.txt
new file mode 100644
index 0000000..2f3381e
--- /dev/null
+++ b/apps/gadgetron/CMakeLists.txt
@@ -0,0 +1,60 @@
+IF (WIN32)
+ADD_DEFINITIONS(-DTIXML_USE_STL)
+ENDIF (WIN32)
+
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  ${ACE_INCLUDE_DIR} 
+  ${XSD_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils)
+
+#Process the XSD files
+SET(XSDS schema/gadgetron.xsd)
+SET(XSD_ARGS cxx-tree --generate-serialization)
+WRAP_XSD(XSDS_SOURCES XSD_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/schema ${XSDS} OPTIONS ${XSD_ARGS})
+INCLUDE_DIRECTORIES(${XSD_INCLUDES} ${XERCESC_INCLUDE_DIR})
+# The gadgetron executable is built from main.cpp plus the sources listed in XSDS_SOURCES.
+add_executable(gadgetron 
+  ${XSDS_SOURCES} 
+  main.cpp 
+  #GadgetStreamController.cpp 
+  #GadgetServerAcceptor.cpp 
+  EndGadget.h
+  Gadget.h
+  GadgetContainerMessage.h
+  GadgetMessageInterface.h
+  Gadgetron.h
+  GadgetronExport.h
+  #GadgetServerAcceptor.h
+  #GadgetStreamController.h 
+  )
+# NOTE(review): the GadgetStreamController/GadgetServerAcceptor files are commented out above; presumably built elsewhere - confirm.
+target_link_libraries(gadgetron 
+  gadgettools 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+  ${XERCESC_LIBRARIES}
+ )
+
+install(TARGETS gadgetron DESTINATION bin)
+# Install the public headers, the example configuration and the schema.
+install(FILES 	
+  GadgetContainerMessage.h
+  Gadget.h
+  #GadgetServerAcceptor.h
+  #GadgetStreamController.h
+  Gadgetron.h
+  GadgetMessageInterface.h
+  EndGadget.h
+  GadgetronExport.h
+  DESTINATION include) 
+
+install(FILES 	
+  gadgetron.xml.example
+  DESTINATION config)
+
+install(FILES 	
+  schema/gadgetron.xsd
+  DESTINATION schema)
diff --git a/apps/gadgetron/EndGadget.h b/apps/gadgetron/EndGadget.h
new file mode 100644
index 0000000..17d6f82
--- /dev/null
+++ b/apps/gadgetron/EndGadget.h
@@ -0,0 +1,57 @@
+/*
+ * EndGadget.h
+ *
+ *  Created on: Nov 3, 2011
+ *      Author: hansenms
+ */
+
+#ifndef ENDGADGET_H_
+#define ENDGADGET_H_
+
+#include "Gadget.h"
+#include "GadgetMessageInterface.h"
+
+namespace Gadgetron{
+// Last gadget of a stream: it releases every message that reaches it and, when
+// closed, passes a GADGET_MESSAGE_CLOSE message to the controller's output_ready().
+class EndGadget : public Gadget
+{
+	virtual int close(unsigned long flags)
+	{
+		//flags is unsigned long; convert it so that it matches the %d conversion
+		GADGET_DEBUG2("Close called in EndGadget with flags %d\n", static_cast<int>(flags));
+		GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+				new GadgetContainerMessage<GadgetMessageIdentifier>();
+		mb->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+		if (controller_->output_ready(mb) < 0) {
+			return GADGET_FAIL;
+		}
+
+		GADGET_DEBUG2("Calling close in base class  with flags %d\n", static_cast<int>(flags));
+		return Gadget::close(flags);
+	}
+
+protected:
+	virtual int process(ACE_Message_Block *m)
+	{
+		ACE_TRACE(( ACE_TEXT("EndGadget::process(ACE_Message_Block* m)") ));
+		m->release();
+		return 0;
+	}
+
+	virtual int next_step(ACE_Message_Block *m)
+	{
+		ACE_TRACE(( ACE_TEXT("EndGadget::next_step(ACE_Message_Block *m)") ));
+		m->release();
+		return 0;
+	}
+
+	virtual int process_config(ACE_Message_Block * m) {
+		m->release();
+		return 0;
+	}
+};
+}
+
+#endif /* ENDGADGET_H_ */
diff --git a/apps/gadgetron/Gadget.h b/apps/gadgetron/Gadget.h
new file mode 100644
index 0000000..acbd028
--- /dev/null
+++ b/apps/gadgetron/Gadget.h
@@ -0,0 +1,382 @@
+#ifndef GADGET_H
+#define GADGET_H
+#pragma once
+
+#include <ace/OS_NS_stdlib.h>
+#include <ace/Task.h>
+#include <ace/Stream.h>
+#include <ace/Module.h>
+#include <ace/OS_Memory.h>
+#include <ace/Svc_Handler.h>
+#include <ace/SOCK_Stream.h>
+
+#include <map>
+#include <string>
+#include <boost/shared_ptr.hpp>
+
+#include "GadgetContainerMessage.h"
+#include "GadgetronExport.h"
+#include "Gadgetron.h"
+#include <stdexcept>
+
+namespace Gadgetron{
+
+    class GadgetStreamController;
+
+    // Base class of all gadgets: an ACE task that takes messages from its queue and hands them to process().
+    class Gadget : public ACE_Task<ACE_MT_SYNCH>
+    {
+    public:
+        typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+        enum
+        {
+            GADGET_MESSAGE_CONFIG = (ACE_Message_Block::USER_FLAGS << 1)
+        };
+
+        Gadget()
+            : inherited()
+            , desired_threads_(1)
+            , pass_on_undesired_data_(false)
+            , controller_(0)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::Gadget") ));
+        }
+
+        virtual ~Gadget()
+        {
+            GADGET_DEBUG2("Shutting down Gadget (%s)\n", this->module()->name());
+        }
+
+        // The default implementation does nothing and returns 0.
+        virtual int init(void)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::init") ));
+            return 0;
+        }
+
+        virtual int open(void* = 0)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::open") ));
+            // A positive "threads" parameter overrides the current thread count.
+            int t = this->get_int_value("threads");
+            if (t > 0) {
+                GADGET_DEBUG2("Setting number of threads of gadget %s to %d\n", this->module()->name(), t);
+                this->desired_threads(t);
+            }
+
+            return this->activate( THR_NEW_LWP | THR_JOINABLE,
+                this->desired_threads() );
+        }
+
+        int put(ACE_Message_Block *m, ACE_Time_Value* timeout = 0)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::put") ));
+            // The message is only queued here; svc() takes it off the queue.
+            return this->putq(m, timeout);
+        }
+
+        virtual unsigned int desired_threads()
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::desired_threads (get)") ));
+
+            return desired_threads_;
+        }
+
+        virtual void desired_threads(unsigned int t)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::desired_threads (set)") ));
+
+            desired_threads_ = t;
+        }
+
+        virtual void set_controller(GadgetStreamController* controller) {
+            controller_ = controller;
+        }
+
+        virtual GadgetStreamController* get_controller()
+        {
+            return controller_;
+        }
+        // With flags == 1: queues a hang-up message, waits for the threads and clears the controller.
+        virtual int close(unsigned long flags)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::close") ));
+            GADGET_DEBUG2("Gadget (%s) Close Called with flags = %d\n", this->module()->name(), static_cast<int>(flags)); //%d expects an int
+            int rval = 0;
+            if (flags == 1) {
+                ACE_Message_Block *hangup = new ACE_Message_Block();
+                hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+                if (this->putq(hangup) == -1) {
+                    hangup->release();
+                    GADGET_DEBUG2("Gadget (%s) failed to put hang up message on queue\n", this->module()->name());
+                    return GADGET_FAIL;
+                }
+                GADGET_DEBUG2("Gadget (%s) waiting for thread to finish\n", this->module()->name());
+                rval = this->wait();
+                GADGET_DEBUG2("Gadget (%s) thread finished\n", this->module()->name());
+                controller_ = 0;
+            }
+            return rval;
+        }
+        // Thread body: handles queued messages until a hang-up message arrives or a step fails.
+        virtual int svc(void)
+        {
+            ACE_TRACE(( ACE_TEXT("Gadget::svc") ));
+
+            for (ACE_Message_Block *m = 0; ;) {
+
+                //GADGET_DEBUG2("Waiting for message in Gadget (%s)\n", this->module()->name());
+                if (this->getq(m) == -1) {
+                    GADGET_DEBUG2("Gadget (%s) failed to get message from queue\n", this->module()->name());
+                    return GADGET_FAIL;
+                }
+                //GADGET_DEBUG2("Message Received in Gadget (%s)\n", this->module()->name());
+
+                //If this is a hangup message, we are done, put the message back on the queue before breaking
+                if (m->msg_type() == ACE_Message_Block::MB_HANGUP) {
+                    //GADGET_DEBUG2("Gadget (%s) Hangup message encountered\n", this->module()->name());
+                    if (this->putq(m) == -1) {
+                        GADGET_DEBUG2("Gadget (%s) failed to put hang up message on queue (for other threads)\n", this->module()->name());
+                        return GADGET_FAIL;
+                    }
+                    //GADGET_DEBUG2("Gadget (%s) breaking loop\n", this->module()->name());
+                    break;
+                }
+
+                //Only std::runtime_error is translated into a failure below; other exceptions propagate
+                //Is this config info, if so call appropriate process function
+                if (m->flags() & GADGET_MESSAGE_CONFIG) {
+
+                    int success;
+                    try{ success = this->process_config(m); }
+                    catch (std::runtime_error& err){
+                        GADGET_DEBUG_EXCEPTION(err,"Gadget::process_config() failed\n");
+                        success = -1;
+                    }
+
+                    if (success == -1) {
+                        m->release();
+                        this->flush();
+                        GADGET_DEBUG2("Gadget (%s) process config failed\n", this->module()->name());
+                        return GADGET_FAIL;
+
+                    }
+
+                    //Push this onto next gadgets queue, other gadgets may need this configuration information
+                    if (this->next()) {
+                        if (this->next()->putq(m) == -1) {
+                            m->release();
+                            GADGET_DEBUG2("Gadget (%s) process config failed to put config on dowstream gadget\n", this->module()->name());
+                            return GADGET_FAIL;
+                        }
+                    }
+                    continue;
+                }
+                //Everything else is handed to process()
+                int success;
+                try{ success = this->process(m); }
+                catch (std::runtime_error& err){
+                    GADGET_DEBUG_EXCEPTION(err,"Gadget::process() failed\n");
+                    success = -1;
+                }
+
+                if (success == -1) {
+                    m->release();
+                    this->flush();
+                    GADGET_DEBUG2("Gadget (%s) process failed\n", this->module()->name());
+                    return GADGET_FAIL;
+                }
+            }
+            return 0;
+        }
+        // Stores a parameter value; with trigger set, the result of parameter_changed() is returned.
+        int set_parameter(const char* name, const char* val, bool trigger = true) {
+            boost::shared_ptr<std::string> old_value = get_string_value(name);
+
+            parameters_[std::string(name)] = std::string(val);
+
+            if (trigger) {
+                return parameter_changed(std::string(name), std::string(val), *old_value);
+            }
+
+            return 0;
+        }
+        // Returns 1 only when the stored value is exactly "true".
+        int get_bool_value(const char* name) {
+            return (0 == ACE_OS::strcmp(get_string_value(name)->c_str(), "true"));
+        }
+
+        int get_int_value(const char* name) {
+            return ACE_OS::atoi(get_string_value(name)->c_str());
+        }
+
+        double get_double_value(const char* name) {
+            return ACE_OS::atof(get_string_value(name)->c_str());
+        }
+        // Returns a copy of the stored value, or an empty string when the parameter is not set.
+        boost::shared_ptr<std::string> get_string_value(const char* name) {
+            std::map<std::string,std::string>::iterator it;
+
+            it = parameters_.find(std::string(name));
+
+            if (it != parameters_.end()) {
+                return boost::shared_ptr<std::string>(new std::string(it->second));
+            }
+
+            return boost::shared_ptr<std::string>(new std::string(""));
+        }
+
+        /**
+        *  This trigger function is called whenever set_parameter is called with the trigger = true;
+        */
+        virtual int parameter_changed(std::string name, std::string new_value, std::string old_value)
+        {
+            return GADGET_OK;
+        }
+
+    protected:
+        virtual int next_step(ACE_Message_Block *m)
+        {
+            return this->put_next(m);//next()->putq(m);
+        }
+        // Handles one regular message; svc() treats a return value of -1 as failure.
+        virtual int process(ACE_Message_Block * m) = 0;
+        // Handles one configuration message; the default does nothing and returns 0.
+        virtual int process_config(ACE_Message_Block * m) {
+            return 0;
+        }
+
+        unsigned int desired_threads_;
+        bool pass_on_undesired_data_;
+        GadgetStreamController* controller_;
+
+    private:
+        std::map<std::string, std::string> parameters_;
+    };
+
+    // Gadget whose messages are converted to GadgetContainerMessage<P1> before being processed.
+    template <class P1> class Gadget1 : public Gadget
+    {
+    protected:
+        int process(ACE_Message_Block* mb)
+        {
+            GadgetContainerMessage<P1>* m = AsContainerMessage<P1>(mb);
+
+            if (!m) {
+                if (!pass_on_undesired_data_) {
+                    ACE_ERROR_RETURN(( LM_ERROR, ACE_TEXT("%p\n"),
+                        ACE_TEXT("Gadget1::process, conversion of message block")),
+                        -1);
+                } else {
+                    return (this->next()->putq(mb)); // unconverted data is queued on the next task as is
+                }
+
+            }
+
+            return this->process(m);
+        }
+        // Typed handler, called with the converted message.
+        virtual int process(GadgetContainerMessage<P1>* m) = 0;
+
+    };
+
+    // Gadget for a message of type P1 whose continuation is a message of type P2.
+    template <class P1, class P2> class Gadget2 : public Gadget
+    {
+    protected:
+        int process(ACE_Message_Block* mb)
+        {
+            // m1 is the message itself and m2 its continuation; both conversions must succeed.
+            GadgetContainerMessage<P1>* m1 = AsContainerMessage<P1>(mb);
+
+            GadgetContainerMessage<P2>* m2 = 0;
+            if (m1) {
+                m2 = AsContainerMessage<P2>(m1->cont());
+            }
+
+            if (!m1 || !m2) {
+                if (!pass_on_undesired_data_) {
+                    ACE_DEBUG( (LM_ERROR, ACE_TEXT("%s -> %s, (%s, %s, %@, %@), (%s, %s, %@, %@)\n"),
+                        this->module()->name(),
+                        ACE_TEXT("Gadget2::process, Conversion of Message Block Failed"),
+                        typeid(GadgetContainerMessage<P1>*).name(),
+                        typeid(m1).name(),
+                        mb,
+                        m1,
+                        typeid(GadgetContainerMessage<P2>*).name(),
+                        typeid(m2).name(),
+                        mb->cont(),
+                        m2));
+
+                    return -1;
+                } else {
+                    return (this->next()->putq(mb)); // unconverted data is queued on the next task as is
+                }
+            }
+
+            return this->process(m1,m2);
+        }
+        // Typed handler, called with both converted messages.
+        virtual int process(GadgetContainerMessage<P1>* m1, GadgetContainerMessage<P2>* m2) = 0;
+
+    };
+
+
+    // Gadget for a chain of three messages (P1, then P2, then P3) linked through cont().
+    template <class P1, class P2, class P3> class Gadget3 : public Gadget
+    {
+    protected:
+        int process(ACE_Message_Block* mb)
+        {
+            // Each message is looked up as the continuation of the previous one; all three conversions must succeed.
+            GadgetContainerMessage<P1>* m1 = AsContainerMessage<P1>(mb);
+
+            GadgetContainerMessage<P2>* m2 = 0;
+            if (m1) {
+                m2 = AsContainerMessage<P2>(m1->cont());
+            }
+
+            GadgetContainerMessage<P3>* m3 = 0;
+            if (m2) {
+                m3 = AsContainerMessage<P3>(m2->cont());
+            }
+
+            if (!m1 || !m2 || !m3) {
+                if (!pass_on_undesired_data_) {
+                    ACE_DEBUG( (LM_ERROR, ACE_TEXT("%s -> %s, (%s, %s, %@), (%s, %s, %@), (%s, %s, %@)\n"),
+                        this->module()->name(),
+                        ACE_TEXT("Gadget3::process, Conversion of Message Block Failed"),
+                        typeid(GadgetContainerMessage<P1>*).name(),
+                        typeid(m1).name(),
+                        m1,
+                        typeid(GadgetContainerMessage<P2>*).name(),
+                        typeid(m2).name(),
+                        m2,
+                        typeid(GadgetContainerMessage<P3>*).name(),
+                        typeid(m3).name(),
+                        m3));
+
+                    return -1;
+                } else {
+                    return (this->next()->putq(mb)); // unconverted data is queued on the next task as is
+                }
+            }
+
+            return this->process(m1,m2,m3);
+        }
+        // Typed handler, called with all three converted messages.
+        virtual int process(GadgetContainerMessage<P1>* m1, GadgetContainerMessage<P2>* m2, GadgetContainerMessage<P3>* m3) = 0;
+
+    };
+
+
+    /* Macros for handling dynamic linking */
+#define GADGET_DECLARE(GADGET)			\
+    GADGETRON_LOADABLE_DECLARE(GADGET)
+
+#define GADGET_FACTORY_DECLARE(GADGET)			\
+    GADGETRON_LOADABLE_FACTORY_DECLARE(Gadget,GADGET)
+}
+
+#endif //GADGET_H
diff --git a/apps/gadgetron/GadgetContainerMessage.h b/apps/gadgetron/GadgetContainerMessage.h
new file mode 100644
index 0000000..2cef40e
--- /dev/null
+++ b/apps/gadgetron/GadgetContainerMessage.h
@@ -0,0 +1,118 @@
+#ifndef GADGETCONTAINERMESSAGE_H
+#define GADGETCONTAINERMESSAGE_H
+#pragma once
+
+#include <ace/Message_Block.h>
+#include <string>
+
+namespace Gadgetron{
+/**
+   The purpose of this class is to provide a type independent interface to all ContainerMessages
+
+   This interface is able to set a magic number for each type which is later on used
+   instead of RTTI to "safely" cast to the right GadgetContainerMessage type
+
+ */
+class GadgetContainerMessageBase : public ACE_Message_Block
+{
+  typedef ACE_Message_Block base;
+  
+ public:
+  //Flag bit set on every container message; AsContainerMessage tests it before casting.
+  enum { CONTAINER_MESSAGE_BLOCK = (ACE_Message_Block::USER_FLAGS << 2) };
+  //Passes size on to the ACE_Message_Block constructor. The type ID is not set here; GadgetContainerMessage<T> assigns it.
+  GadgetContainerMessageBase(size_t size) : base(size)
+  {
+    set_flags(CONTAINER_MESSAGE_BLOCK); //Mark this message block as a container, so that we know it is safe to type cast it.
+  }
+
+#ifdef WIN32
+  std::string getTypeID() { return type_magic_id_; }
+  template <class T> static std::string magic_number_for_type() { return std::string(typeid(T).name()); } 
+  //(On WIN32 the type ID is the typeid name of T instead of a counter value.)
+protected:
+  std::string type_magic_id_;
+
+#else
+  //Returns the type ID stored in this message.
+  int getTypeID() { return type_magic_id_; }
+  //Returns the ID for T; the first call for a given T draws the next counter value.
+  template <class T> static int magic_number_for_type(){
+    //Will only get set once for each instantiation of this function
+    static int result(next_magic_type_number()); 
+    return result;
+  }
+
+ protected:
+  int type_magic_id_;
+
+  //Utility function for incrementing the magic number for types.
+  static int next_magic_type_number()
+  {
+    static int magic(0);
+    return magic++;
+  }	 
+#endif  
+};
+
+template <class T> class GadgetContainerMessage : public GadgetContainerMessageBase
+{
+  typedef GadgetContainerMessageBase base;
+
+public:
+  //Reserves sizeof(T) bytes in the message block and default-constructs a T in that storage.
+  GadgetContainerMessage()
+    : base(sizeof(T))
+    , content_(new (this->wr_ptr()) T)
+  {
+    //The payload now occupies the buffer, so move the write pointer past it.
+    this->wr_ptr(sizeof(T));
+
+    //Tag the block with the per-type ID that AsContainerMessage checks before casting.
+    type_magic_id_ = magic_number_for_type<T>();
+  }
+
+  //Runs the payload's destructor by hand, in case it owns heap memory; the storage
+  //itself is left to ACE_Message_Block.
+  virtual ~GadgetContainerMessage()
+  {
+    if (content_ != 0) {
+      content_->~T();
+    }
+  }
+
+  //Returns the pointer to the T constructed inside this message block.
+  T* getObjectPtr()
+  {
+    return content_;
+  }
+
+protected:
+  T* content_;
+};
+
+/**
+   Checked cast from a generic ACE_Message_Block to GadgetContainerMessage<T>, used
+   instead of dynamic_cast, whose speed varies greatly from platform to platform.
+
+   Returns 0 when mb is null, does not carry the CONTAINER_MESSAGE_BLOCK flag, or
+   holds a type ID other than the one assigned to T.
+   It is less safe than RTTI: every block with the flag set is assumed to be a
+   GadgetContainerMessageBase, so reusing that flag elsewhere makes the cast invalid.
+   TODO: Find a more elegant solution for this.
+*/
+template <class T> GadgetContainerMessage<T>* AsContainerMessage(ACE_Message_Block* mb)
+{
+  if (!mb || !(mb->flags() & GadgetContainerMessageBase::CONTAINER_MESSAGE_BLOCK)) {
+    return 0;
+  }
+  //static_cast is the proper base-to-derived cast: unlike reinterpret_cast it adjusts the pointer when needed.
+  GadgetContainerMessageBase* mbb = static_cast<GadgetContainerMessageBase*>(mb);
+  if (mbb->getTypeID() != GadgetContainerMessageBase::magic_number_for_type<T>()) {
+    return 0;
+  }
+
+  return static_cast<GadgetContainerMessage<T>* >(mbb);
+}
+}
+#endif  //GADGETCONTAINERMESSAGE_H
diff --git a/apps/gadgetron/GadgetMessageInterface.h b/apps/gadgetron/GadgetMessageInterface.h
new file mode 100644
index 0000000..39c8276
--- /dev/null
+++ b/apps/gadgetron/GadgetMessageInterface.h
@@ -0,0 +1,235 @@
+#ifndef GADGETMESSAGEINTERFACE_H
+#define GADGETMESSAGEINTERFACE_H
+
+#include "ace/SOCK_Stream.h"
+#include <ace/Basic_Types.h>
+
+#include <map>
+
+#include "GadgetContainerMessage.h"
+#include "Gadgetron.h"
+#include "GadgetronExport.h"
+#include "Gadget.h"
+
+namespace Gadgetron
+{
+
+enum GadgetronMessageID {
+  GADGET_MESSAGE_INT_ID_MIN       =   0, // presumably the lower bound of the IDs reserved for Gadgetron itself -- TODO confirm
+  GADGET_MESSAGE_CONFIG_FILE      =   1, // payload: a GadgetMessageConfigurationFile naming a configuration file
+  GADGET_MESSAGE_CONFIG_SCRIPT    =   2, // payload: a GadgetMessageScript header followed by the script bytes
+  GADGET_MESSAGE_PARAMETER_SCRIPT =   3, // payload: a GadgetMessageScript header followed by the script bytes
+  GADGET_MESSAGE_CLOSE            =   4, // the client asks for the stream to be closed
+  GADGET_MESSAGE_INT_ID_MAX       = 999  // presumably the upper bound of the reserved ID range -- TODO confirm
+};
+
+struct GadgetMessageIdentifier
+{
+  ACE_UINT16 id; // numeric message type; compared against GadgetronMessageID values and used to look up a reader
+};
+
+struct GadgetMessageConfigurationFile
+{
+  char configuration_file[1024]; // fixed 1024-byte buffer holding the configuration file name as a C string
+};
+
+struct GadgetMessageScript
+{
+  ACE_UINT32 script_length; // number of script bytes that follow this header on the socket
+};
+
+
+/**
+   Interface for classes capable of reading a specific message from a socket.
+
+   This is an abstract class; an implementation is needed for each message type.
+ */
+class GadgetMessageReader
+{
+ public:
+	virtual ~GadgetMessageReader() {} // virtual so that readers can be deleted through this interface
+
+  /**
+     Function must be implemented to read a specific message from stream; the readers in this file return 0 on failure.
+   */
+  virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) = 0;
+
+};
+
+/**
+   Interface for classes capable of writing a specific message to a socket.
+   This is an abstract class; an implementation is needed for each message type.
+ */
+class GadgetMessageWriter
+{
+ public:
+	virtual ~GadgetMessageWriter() {} // virtual so that writers can be deleted through this interface
+
+   /**
+     Function must be implemented to write the specific message held in mb to stream.
+   */
+  virtual int write(ACE_SOCK_Stream* stream, ACE_Message_Block* mb) = 0;
+};
+
+class GadgetMessageWriterContainer
+{
+ public:
+  //The container owns its writers, so destroying it deletes all of them.
+  virtual ~GadgetMessageWriterContainer() {
+    clear();
+  }
+
+  //Returns the writer registered under slot, or 0 if the slot is empty.
+  GadgetMessageWriter* find(ACE_UINT16 slot) {
+    std::map< ACE_UINT16, GadgetMessageWriter* >::iterator pos = map_.find(slot);
+    if (pos == map_.end()) {
+      return 0;
+    }
+    return pos->second;
+  }
+
+  //Registers dispatcher under slot and takes ownership of it. A writer that
+  //already occupies the slot is deleted first. Always returns GADGET_OK.
+  int insert ( unsigned short slot, GadgetMessageWriter* dispatcher) {
+    std::map< ACE_UINT16, GadgetMessageWriter* >::iterator pos = map_.find(slot);
+    if (pos == map_.end()) {
+      map_[slot] = dispatcher;
+      return GADGET_OK;
+    }
+    delete pos->second;
+    pos->second = dispatcher;
+    return GADGET_OK;
+  }
+
+  //Deletes every registered writer and empties the map. Always returns 0.
+  int clear()
+  {
+    std::map< ACE_UINT16, GadgetMessageWriter* >::iterator pos = map_.begin();
+    while (pos != map_.end()) {
+      delete pos->second;
+      ++pos;
+    }
+    map_.clear();
+    return 0;
+  }
+
+ protected:
+  std::map<ACE_UINT16, GadgetMessageWriter*> map_;
+};
+
+
+class GadgetMessageReaderContainer
+{
+ public:
+  //The container owns its readers, so destroying it deletes all of them.
+  virtual ~GadgetMessageReaderContainer() {
+    clear();
+  }
+
+  //Returns the reader registered under slot, or 0 if the slot is empty.
+  GadgetMessageReader* find(ACE_UINT16 slot) {
+    std::map< ACE_UINT16, GadgetMessageReader* >::iterator pos = map_.find(slot);
+    if (pos == map_.end()) {
+      return 0;
+    }
+    return pos->second;
+  }
+
+  //Registers dispatcher under slot and takes ownership of it. A reader that
+  //already occupies the slot is deleted first. Always returns GADGET_OK.
+  int insert ( unsigned short slot, GadgetMessageReader* dispatcher) {
+    std::map< ACE_UINT16, GadgetMessageReader* >::iterator pos = map_.find(slot);
+    if (pos == map_.end()) {
+      map_[slot] = dispatcher;
+      return GADGET_OK;
+    }
+    delete pos->second;
+    pos->second = dispatcher;
+    return GADGET_OK;
+  }
+
+  //Deletes every registered reader and empties the map. Always returns 0.
+  int clear()
+  {
+    std::map< ACE_UINT16, GadgetMessageReader* >::iterator pos = map_.begin();
+    while (pos != map_.end()) {
+      delete pos->second;
+      ++pos;
+    }
+    map_.clear();
+    return 0;
+  }
+
+ protected:
+  std::map<ACE_UINT16, GadgetMessageReader*> map_;
+};
+
+class GadgetMessageConfigFileReader : public GadgetMessageReader
+{
+ public:
+  //Reads one fixed-size GadgetMessageConfigurationFile record from stream and returns
+  //the container message holding it, or 0 if allocation or the read fails.
+  virtual ACE_Message_Block* read(ACE_SOCK_STREAM* stream) {
+    GadgetContainerMessage<GadgetMessageConfigurationFile>* mb1 =
+      new GadgetContainerMessage<GadgetMessageConfigurationFile>();
+    if (!mb1) {
+      GADGET_DEBUG1("Unable to allocate GadgetMessageConfigurationFile\n");
+      return 0;
+    }
+    GadgetMessageConfigurationFile* cfg = mb1->getObjectPtr();
+    if (stream->recv_n (cfg, sizeof(GadgetMessageConfigurationFile)) <= 0) {
+      GADGET_DEBUG1("Unable to read configuration file information\n");
+      mb1->release();
+      return 0;
+    }
+    //The name is used as a C string later on; do not trust the peer to terminate it.
+    cfg->configuration_file[sizeof(cfg->configuration_file) - 1] = '\0';
+    return mb1;
+  }
+};
+
+
+class GadgetMessageScriptReader : public GadgetMessageReader
+{
+ public:
+  //Reads a GadgetMessageScript header followed by script_length bytes of script text.
+  //Returns a block flagged Gadget::GADGET_MESSAGE_CONFIG holding the script, or 0 on failure.
+  virtual ACE_Message_Block* read(ACE_SOCK_STREAM* stream) {
+    GadgetMessageScript ms;
+    ssize_t recv_cnt = 0;
+    if ((recv_cnt = stream->recv_n (&ms, sizeof(GadgetMessageScript))) <= 0) {
+      GADGET_DEBUG1("Unable to read configuration file information\n");
+       return 0;
+    }
+
+    ACE_Message_Block* mb = new ACE_Message_Block(ms.script_length);
+    if ((recv_cnt = stream->recv_n (mb->wr_ptr(), ms.script_length)) <= 0) {
+      ACE_DEBUG ((LM_ERROR,
+		ACE_TEXT ("(%P|%t) Unable to read script\n")));
+      mb->release(); //Give the block back; returning without this leaks it.
+      return 0;
+    }
+    mb->wr_ptr(ms.script_length);
+    mb->set_flags(Gadget::GADGET_MESSAGE_CONFIG);
+
+    return mb;
+  }
+};
+
+/* Macros for handling dynamic linking */
+/* *_DECLARE forwards to GADGETRON_LOADABLE_DECLARE; *_FACTORY_DECLARE forwards to GADGETRON_LOADABLE_FACTORY_DECLARE with the matching interface class. */
+#define GADGETRON_READER_DECLARE(READER) \
+  GADGETRON_LOADABLE_DECLARE(READER)
+
+#define GADGETRON_READER_FACTORY_DECLARE(READER)	\
+  GADGETRON_LOADABLE_FACTORY_DECLARE(GadgetMessageReader, READER)
+
+#define GADGETRON_WRITER_DECLARE(WRITER) \
+  GADGETRON_LOADABLE_DECLARE(WRITER)
+
+#define GADGETRON_WRITER_FACTORY_DECLARE(WRITER)	\
+  GADGETRON_LOADABLE_FACTORY_DECLARE(GadgetMessageWriter, WRITER)
+
+}
+
+#endif //GADGETMESSAGEINTERFACE_H
diff --git a/apps/gadgetron/GadgetServerAcceptor.cpp b/apps/gadgetron/GadgetServerAcceptor.cpp
new file mode 100644
index 0000000..48270b9
--- /dev/null
+++ b/apps/gadgetron/GadgetServerAcceptor.cpp
@@ -0,0 +1,58 @@
+#include "GadgetServerAcceptor.h"
+#include "GadgetStreamController.h"
+
+using namespace Gadgetron;
+
+GadgetServerAcceptor::~GadgetServerAcceptor ()
+{
+  this->handle_close (ACE_INVALID_HANDLE, 0); // reuse handle_close: it deregisters from the reactor and closes the acceptor if its handle is still valid
+}
+
+int GadgetServerAcceptor::open (const ACE_INET_Addr &listen_addr)
+{
+  // Start listening on listen_addr (second argument 1 = reuse the address), then
+  // register with the reactor so that pending connections reach handle_input().
+  const int opened = this->acceptor_.open (listen_addr, 1);
+  if (opened == -1) {
+    ACE_ERROR_RETURN ((LM_ERROR, ACE_TEXT ("%p\n"), ACE_TEXT ("acceptor.open")), -1);
+  }
+  return this->reactor ()->register_handler (this, ACE_Event_Handler::ACCEPT_MASK);
+}
+
+
+
+
+int GadgetServerAcceptor::handle_input (ACE_HANDLE)
+{
+  // A fresh controller per client; the auto_ptr deletes it if accept() fails.
+  GadgetStreamController *controller;
+  ACE_NEW_RETURN (controller, GadgetStreamController, -1);
+  auto_ptr<GadgetStreamController> guard (controller);
+
+  const int accepted = this->acceptor_.accept (controller->peer ());
+  if (accepted == -1) {
+    ACE_ERROR_RETURN ((LM_ERROR, ACE_TEXT ("(%P|%t) %p\n"),
+                       ACE_TEXT ("Failed to accept ") ACE_TEXT ("controller connection")), -1);
+  }
+  // From here on the controller manages its own lifetime (its handle_close deletes it).
+  guard.release ();
+  controller->reactor (this->reactor ());
+  if (controller->open () == -1) { controller->handle_close (ACE_INVALID_HANDLE, 0); }
+  return 0;
+}
+
+int GadgetServerAcceptor::handle_close (ACE_HANDLE, ACE_Reactor_Mask)
+{
+  ACE_DEBUG( (LM_DEBUG, ACE_TEXT("GadgetServerAcceptor::handle_close")) );
+  GADGET_DEBUG1("Close Data Acceptor\n");
+  // Nothing left to tear down once the listening socket is gone.
+  if (this->acceptor_.get_handle () == ACE_INVALID_HANDLE) {
+    return 0;
+  }
+
+  // DONT_CALL keeps the reactor from calling handle_close() again during removal.
+  this->reactor ()->remove_handler
+    (this, ACE_Event_Handler::ACCEPT_MASK | ACE_Event_Handler::DONT_CALL);
+  this->acceptor_.close ();
+  return 0;
+}
diff --git a/apps/gadgetron/GadgetServerAcceptor.h b/apps/gadgetron/GadgetServerAcceptor.h
new file mode 100644
index 0000000..4b74ea5
--- /dev/null
+++ b/apps/gadgetron/GadgetServerAcceptor.h
@@ -0,0 +1,26 @@
+#ifndef _GADGETSERVERACCEPTOR_H
+#define _GADGETSERVERACCEPTOR_H
+
+#include "ace/SOCK_Acceptor.h"
+#include "ace/Reactor.h"
+
+namespace Gadgetron{
+class GadgetServerAcceptor : public ACE_Event_Handler // reactor event handler that accepts incoming client connections
+{
+public:
+  virtual ~GadgetServerAcceptor ();
+  // Opens the listening socket on listen_addr and registers this handler with the reactor (ACCEPT_MASK).
+  int open (const ACE_INET_Addr &listen_addr);
+  // Handle of the listening socket.
+  virtual ACE_HANDLE get_handle (void) const
+    { return this->acceptor_.get_handle (); }
+  // Accepts one pending connection into a newly created GadgetStreamController and opens it.
+  virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+  // Removes this handler from the reactor and closes the listening socket, if it is still open.
+  virtual int handle_close (ACE_HANDLE handle,
+                            ACE_Reactor_Mask close_mask);
+protected:
+  ACE_SOCK_Acceptor acceptor_; // listening socket
+};
+}
+#endif //_GADGETSERVERACCEPTOR_H
diff --git a/apps/gadgetron/GadgetStreamController.cpp b/apps/gadgetron/GadgetStreamController.cpp
new file mode 100644
index 0000000..f194b45
--- /dev/null
+++ b/apps/gadgetron/GadgetStreamController.cpp
@@ -0,0 +1,459 @@
+#include "ace/OS_NS_stdlib.h"
+#include "ace/OS_NS_string.h"
+#include "ace/OS_NS_stdio.h"
+#include "ace/DLL.h"
+#include "ace/DLL_Manager.h"
+#include "ace/OS_NS_netdb.h"
+
+#include "GadgetStreamController.h"
+#include "GadgetContainerMessage.h"
+#include "Gadget.h"
+#include "EndGadget.h"
+
+#include "gadgetron.hxx" //Auto generated class representation of gadgetron XML configuration
+#include "url_encode.h"
+
+#include <complex>
+#include <fstream>
+
+using namespace Gadgetron;
+int GadgetStreamController::open (void)
+{
+	//Hook the controller's message queue up to the reactor, so that enqueuing a
+	//packet triggers a write notification.
+	this->notifier_.reactor (this->reactor ());
+	this->msg_queue ()->notification_strategy (&this->notifier_);
+
+	//Log where the connection comes from, if the peer address can be obtained.
+	ACE_INET_Addr peer_addr;
+	ACE_TCHAR peer_name[MAXHOSTNAMELEN];
+	const bool have_peer_name =
+			peer().get_remote_addr (peer_addr) == 0 &&
+			peer_addr.addr_to_string (peer_name, MAXHOSTNAMELEN) == 0;
+	if (have_peer_name) {
+		ACE_DEBUG ((LM_DEBUG,
+				ACE_TEXT ("(%P|%t) Connection from %s\n"),
+				peer_name));
+	}
+
+	//Built-in readers: without them the stream configuration could never be received.
+	readers_.insert(GADGET_MESSAGE_CONFIG_FILE, new GadgetMessageConfigFileReader());
+	readers_.insert(GADGET_MESSAGE_CONFIG_SCRIPT, new GadgetMessageScriptReader());
+	readers_.insert(GADGET_MESSAGE_PARAMETER_SCRIPT, new GadgetMessageScriptReader());
+
+	//The stream starts out with only its tail module, which wraps an EndGadget
+	//whose controller is set to this object.
+	Gadget* end_gadget = new EndGadget();
+	if (end_gadget) {
+		end_gadget->set_controller(this);
+	}
+
+	//head stays 0: only the tail module is supplied to the stream.
+	GadgetModule *head = 0;
+	GadgetModule *tail = 0;
+	ACE_NEW_RETURN(tail,
+			ACE_Module<ACE_MT_SYNCH>( ACE_TEXT("EndGadget"), end_gadget ),
+			-1);
+	stream_.open(0,head,tail);
+
+	//Open the writer task.
+	this->writer_task_.open();
+
+	//Finally register with the reactor for input events on the client socket.
+	return this->reactor ()->register_handler(this,
+			ACE_Event_Handler::READ_MASK);// | ACE_Event_Handler::WRITE_MASK);
+}
+
+
+int GadgetStreamController::handle_input (ACE_HANDLE)
+{
+	//Every message starts with a GadgetMessageIdentifier telling us what follows.
+	GadgetMessageIdentifier id;
+	ssize_t recv_cnt = peer().recv_n (&id, sizeof(GadgetMessageIdentifier));
+	if (recv_cnt <= 0) {
+		ACE_DEBUG ((LM_DEBUG,
+				ACE_TEXT ("(%P|%t) GadgetStreamController, unable to read message identifier\n")));
+		return -1;
+	}
+
+	//The close message carries no payload: shut down the gadgets, then the writer task.
+	if (id.id == GADGET_MESSAGE_CLOSE) {
+		GADGET_DEBUG1("Received close signal from client. Closing stream...\n");
+		stream_.close(1); //Shutdown gadgets and wait for them
+		GADGET_DEBUG1("Stream closed\n");
+		GADGET_DEBUG1("Closing writer task\n");
+		this->writer_task_.close(1);
+		GADGET_DEBUG1("Writer task closed\n");
+		return 0;
+	}
+
+	//Anything else needs a registered reader for its ID.
+	GadgetMessageReader* r = readers_.find(id.id);
+	if (!r) {
+		GADGET_DEBUG2("Unrecognized Message ID received: %d\n", id.id);
+		return GADGET_FAIL;
+	}
+
+	ACE_Message_Block* mb = r->read(&peer());
+	if (!mb) {
+		GADGET_DEBUG1("GadgetMessageReader returned null pointer\n");
+		return GADGET_FAIL;
+	}
+
+	//Configuration messages are consumed here to set the stream up; the block is released on every path.
+	if (id.id == GADGET_MESSAGE_CONFIG_FILE) {
+		GadgetContainerMessage<GadgetMessageConfigurationFile>* cfgm =
+				AsContainerMessage<GadgetMessageConfigurationFile>(mb);
+		if (!cfgm) {
+			GADGET_DEBUG1("Failed to cast message block to configuration file\n");
+			mb->release();
+			return GADGET_FAIL;
+		}
+
+		const std::string config_name(cfgm->getObjectPtr()->configuration_file);
+		const bool configured = (this->configure_from_file(config_name) == GADGET_OK);
+		if (!configured) {
+			GADGET_DEBUG1("GadgetStream configuration failed\n");
+		}
+		mb->release();
+		return configured ? GADGET_OK : GADGET_FAIL;
+	}
+
+	if (id.id == GADGET_MESSAGE_CONFIG_SCRIPT) {
+		const std::string xml_config(mb->rd_ptr(), mb->length());
+		const bool configured = (this->configure(xml_config) == GADGET_OK);
+		if (!configured) {
+			GADGET_DEBUG1("GadgetStream configuration failed\n");
+		}
+		mb->release();
+		return configured ? GADGET_OK : GADGET_FAIL;
+	}
+
+	//NOTE(review): this deadline is computed but never handed to stream_.put() -- confirm intent.
+	ACE_Time_Value wait = ACE_OS::gettimeofday() + ACE_Time_Value(0,10000); //10ms from now
+	//Every other message goes onto the gadget stream.
+	if (stream_.put(mb) == -1) {
+		GADGET_DEBUG2("Failed to put stuff on stream, too long wait, %d\n",  ACE_OS::last_error () ==  EWOULDBLOCK);
+		mb->release();
+		return GADGET_FAIL;
+	}
+
+	return GADGET_OK;
+}
+
+
+int GadgetStreamController::output_ready(ACE_Message_Block* mb)
+{
+	//Queue the message on the writer task and report putq()'s result.
+	return this->writer_task_.putq(mb);
+}
+
+
+
+int GadgetStreamController::handle_close (ACE_HANDLE, ACE_Reactor_Mask mask)
+{
+	GADGET_DEBUG1("handle_close called\n");
+
+	//A close that concerns only the write side does not tear the controller
+	//down.
+	if (mask == ACE_Event_Handler::WRITE_MASK) {
+		return 0;
+	}
+
+	GADGET_DEBUG1("Shutting down stream and closing up shop...\n");
+	this->stream_.close();
+
+	//Deregister for all events; DONT_CALL keeps the reactor from calling handle_close() again.
+	mask = ACE_Event_Handler::ALL_EVENTS_MASK | ACE_Event_Handler::DONT_CALL;
+	this->reactor ()->remove_handler (this, mask);
+
+	//Empty output queue in case there is something on it.
+	const int messages_dropped = this->msg_queue ()->flush();
+	if (messages_dropped != 0) {
+		GADGET_DEBUG2("Flushed %d messages from output queue\n", messages_dropped);
+		//Flush any remaining events before we delete this Stream Controller
+		this->reactor ()->handle_events();
+	}
+
+	// Remove all readers and writers
+	//writers_.clear();
+	readers_.clear();
+
+	//Clear DLL handles (to make DLLs unload if needed). close(0) is used on
+	//every platform.
+	typedef std::vector<ACE_DLL_Handle*>::iterator handle_iterator;
+	for (handle_iterator h = dll_handles_.begin(); h != dll_handles_.end(); ++h) {
+		(*h)->close(0);
+	}
+	dll_handles_.clear();
+
+	GADGET_DEBUG1("Stream is closed\n");
+
+	//The controller deletes itself: no member may be touched after this line.
+	delete this;
+	return 0;
+}
+
+Gadget* GadgetStreamController::find_gadget(std::string gadget_name)
+{
+	//Gadgets live inside stream modules, so look the module up by name first.
+	GadgetModule* gm = stream_.find(gadget_name.c_str());
+	if (!gm) {
+		GADGET_DEBUG2("Gadget with name %s not found! Returning null pointer\n", gadget_name.c_str());
+		return 0;
+	}
+
+	//The gadget is the module's writer task; the cast yields 0 if that task is
+	//not a Gadget.
+	return dynamic_cast<Gadget*>(gm->writer());
+}
+
+int GadgetStreamController::configure_from_file(std::string config_xml_filename)
+{
+	//Configuration files live in $GADGETRON_HOME/config; without the variable
+	//there is no path to build. Returns GADGET_FAIL then, or when the file cannot be read.
+	const char * gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+	if (!gadgetron_home) {
+		GADGET_DEBUG1("GADGETRON_HOME variable not set.\n");
+		return GADGET_FAIL;
+	}
+	//NOTE(review): the file name is supplied by the client and is not checked for "../" components.
+	const std::string config_file_name =
+			std::string(gadgetron_home) + "/config/" + config_xml_filename;
+	GADGET_DEBUG2("Running configuration: %s\n", config_file_name.c_str());
+
+	//Opened positioned at the end (ios::ate) so that tellg() yields the file size.
+	std::ifstream file (config_file_name.c_str(), std::ios::in|std::ios::binary|std::ios::ate);
+	const std::streamoff size = file.is_open() ? std::streamoff(file.tellg()) : std::streamoff(-1);
+	if (size < 0) {
+		GADGET_DEBUG2("Unable to open configuation file: %s\n", config_file_name.c_str());
+		return GADGET_FAIL;
+	}
+
+	//Read straight into the string: there is no temporary buffer left to leak.
+	std::string xml_file_contents(static_cast<size_t>(size), '\0');
+	file.seekg (0, std::ios::beg);
+	if (size > 0) {
+		file.read (&xml_file_contents[0], size);
+	}
+	xml_file_contents.resize(static_cast<size_t>(file.gcount())); //keep only what was actually read
+	file.close();
+
+	return configure(xml_file_contents);
+}
+
+/**
+   Builds the stream described by an XML stream configuration.
+   Readers and writers are loaded from their DLLs and registered under their slots;
+   gadgets are created in reverse document order and pushed onto the stream.
+   @param config_xml_string  the XML configuration text
+   @return GADGET_OK on success, GADGET_FAIL as soon as any step fails
+ */
+int GadgetStreamController::configure(std::string config_xml_string)
+{
+	//The XSD used to validate the configuration lives in $GADGETRON_HOME/schema.
+	const char * gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+	if (!gadgetron_home) {
+		GADGET_DEBUG1("GADGETRON_HOME variable not set.\n");
+		return GADGET_FAIL;
+	}
+
+	//Built as a std::string so that a long GADGETRON_HOME cannot overflow a fixed buffer.
+	std::string schema_file_name = std::string(gadgetron_home) + "/schema/gadgetron.xsd";
+	schema_file_name = url_encode(schema_file_name);
+
+	xml_schema::properties props;
+	props.schema_location (
+	  "http://gadgetron.sf.net/gadgetron",
+	  schema_file_name);
+
+	std::istringstream str_stream(config_xml_string, std::stringstream::in);
+	std::auto_ptr<gadgetron::gadgetronStreamConfiguration> cfg;
+
+	try {
+		cfg = std::auto_ptr<gadgetron::gadgetronStreamConfiguration>(gadgetron::gadgetronStreamConfiguration_(str_stream,0,props));
+	}  catch (const xml_schema::exception& e) {
+		GADGET_DEBUG2("Failed to parse Gadget Stream Configuration: %s\n", e.what());
+		return GADGET_FAIL;
+	}
+
+	//%d expects an int, so the size_t counts (and the long slots below) are converted explicitly.
+	GADGET_DEBUG2("Found %d readers\n", static_cast<int>(cfg->reader().size()));
+	GADGET_DEBUG2("Found %d writers\n", static_cast<int>(cfg->writer().size()));
+	GADGET_DEBUG2("Found %d gadgets\n", static_cast<int>(cfg->gadget().size()));
+
+	//Configuration of readers
+	for (gadgetron::gadgetronStreamConfiguration::reader_sequence::iterator i (cfg->reader().begin ()); i != cfg->reader().end(); ++i) {
+		const long slot = i->slot();
+		const std::string dllname(i->dll());
+		const std::string classname(i->classname());
+
+		GADGET_DEBUG1("--Found reader declaration\n");
+		GADGET_DEBUG2("  Reader dll: %s\n", dllname.c_str());
+		GADGET_DEBUG2("  Reader class: %s\n", classname.c_str());
+		GADGET_DEBUG2("  Reader slot: %d\n", static_cast<int>(slot));
+
+		GadgetMessageReader* r =
+				load_dll_component<GadgetMessageReader>(dllname.c_str(),
+						classname.c_str());
+
+		if (!r) {
+			GADGET_DEBUG1("Failed to load GadgetMessageReader from DLL\n");
+			return GADGET_FAIL;
+		}
+
+		readers_.insert(slot, r);
+	}
+	//Configuration of readers end
+
+	//Configuration of writers
+	for (gadgetron::gadgetronStreamConfiguration::writer_sequence::iterator i (cfg->writer().begin ()); i != cfg->writer().end(); ++i) {
+		const long slot = i->slot();
+		const std::string dllname(i->dll());
+		const std::string classname(i->classname());
+
+		GADGET_DEBUG1("--Found writer declaration\n");
+		GADGET_DEBUG2("  Writer dll: %s\n", dllname.c_str());
+		GADGET_DEBUG2("  Writer class: %s\n", classname.c_str());
+		GADGET_DEBUG2("  Writer slot: %d\n", static_cast<int>(slot));
+
+		GadgetMessageWriter* w =
+				load_dll_component<GadgetMessageWriter>(dllname.c_str(),
+						classname.c_str());
+
+		if (!w) {
+			GADGET_DEBUG1("Failed to load GadgetMessageWriter from DLL\n");
+			return GADGET_FAIL;
+		}
+
+		writer_task_.register_writer(slot, w);
+	}
+	//Configuration of writers end
+
+	//Let's configure the stream
+	GADGET_DEBUG2("Processing %d gadgets in reverse order\n", static_cast<int>(cfg->gadget().size()));
+	for (gadgetron::gadgetronStreamConfiguration::gadget_sequence::reverse_iterator i (cfg->gadget().rbegin ()); i != cfg->gadget().rend(); ++i) {
+		const std::string gadgetname(i->name());
+		const std::string dllname(i->dll());
+		const std::string classname(i->classname());
+
+		GADGET_DEBUG1("--Found gadget declaration\n");
+		GADGET_DEBUG2("  Gadget Name: %s\n", gadgetname.c_str());
+		GADGET_DEBUG2("  Gadget dll: %s\n", dllname.c_str());
+		GADGET_DEBUG2("  Gadget class: %s\n", classname.c_str());
+
+		GadgetModule* m = create_gadget_module(dllname.c_str(),
+				classname.c_str(),
+				gadgetname.c_str());
+
+		if (!m) {
+			GADGET_DEBUG2("Failed to create GadgetModule from %s:%s\n",
+					classname.c_str(),
+					dllname.c_str());
+			return GADGET_FAIL;
+		}
+
+		//Get the gadget out of the module. The cast yields 0 if the module's writer
+		//is not a Gadget, so check before parameters are set through the pointer.
+		Gadget* g = dynamic_cast<Gadget*>(m->writer());
+		if (!g) {
+			GADGET_DEBUG2("Module %s does not contain a Gadget\n", gadgetname.c_str());
+			delete m;
+			return GADGET_FAIL;
+		}
+
+		GADGET_DEBUG2("  Gadget parameters: %d\n", static_cast<int>(i->property().size()));
+		for (gadgetron::gadget::property_sequence::iterator p (i->property().begin()); p != i->property().end(); ++p) {
+			std::string pname(p->name());
+			std::string pval(p->value());
+			GADGET_DEBUG2("Setting parameter %s = %s\n", pname.c_str(),pval.c_str());
+			g->set_parameter(pname.c_str(),pval.c_str(),false);
+		}
+
+		if (stream_.push(m) < 0) {
+			GADGET_DEBUG2("Failed to push Gadget %s onto stream\n", gadgetname.c_str());
+			delete m;
+			return GADGET_FAIL;
+		}
+	}
+
+	GADGET_DEBUG1("Gadget Stream configured\n");
+	stream_configured_ = true;
+
+	return GADGET_OK;
+}
+
+GadgetModule * GadgetStreamController::create_gadget_module(const char* DLL,
+		const char* gadget,
+		const char* gadget_module_name)
+{
+	//Instantiate the gadget class through the factory function exported by the DLL.
+	Gadget* loaded_gadget = load_dll_component<Gadget>(DLL,gadget);
+	if (loaded_gadget == 0) {
+		GADGET_DEBUG1("Failed to load gadget using factory\n");
+		return 0;
+	}
+
+	//Let the gadget know which controller it belongs to.
+	loaded_gadget->set_controller(this);
+
+	//Wrap the gadget in a stream module carrying the configured name (0 is returned if allocation fails).
+	GadgetModule *wrapped = 0;
+	ACE_NEW_RETURN (wrapped,
+			GadgetModule (gadget_module_name, loaded_gadget),
+			0);
+	return wrapped;
+}
+
+
+/**
+   Loads a shared library and creates a component through its exported factory.
+   The library name is ACE_DLL_PREFIX + DLL (plus "d" in WIN32 debug builds); the factory is make_<component_name>.
+   @return the new component, or 0 if the library, the factory or the component is unavailable
+ */
+template <class T>  
+T* GadgetStreamController::load_dll_component(const char* DLL, const char* component_name)
+{
+	//Names are assembled as std::string so that long names from the XML
+	//configuration cannot overflow a fixed-size buffer.
+	std::string dllname = std::string(ACE_DLL_PREFIX) + DLL;
+#if defined(WIN32) && defined(_DEBUG)
+	dllname += "d";
+#endif
+	const std::string factoryname = std::string("make_") + component_name;
+
+	ACE_SHLIB_HANDLE dll_handle = 0;
+	ACE_DLL_Handle* dll =
+			ACE_DLL_Manager::instance()->open_dll (dllname.c_str(), ACE_DEFAULT_SHLIB_MODE, dll_handle );
+
+	if (!dll) {
+		GADGET_DEBUG1("Failed to load DLL, Possible reasons: \n");
+		GADGET_DEBUG1("   * Name of DLL is wrong in XML file \n");
+		GADGET_DEBUG1("   * Path of DLL is not in your DLL search path (LD_LIBRARY_PATH on Unix)\n");
+		GADGET_DEBUG1("   * Path of other DLLs that this DLL depends on is not in the search path\n");
+		return 0;
+	}
+	//Remembered so that handle_close() can close the handle again.
+	dll_handles_.push_back(dll);
+
+	//Function pointer
+	typedef T* (*ComponentCreator) (void);
+
+	//The symbol address is converted via an integer rather than cast directly to a function pointer.
+	void *void_ptr = dll->symbol (factoryname.c_str());
+	ptrdiff_t tmp = reinterpret_cast<ptrdiff_t> (void_ptr);
+	ComponentCreator cc = reinterpret_cast<ComponentCreator> (tmp);
+
+	if (cc == 0) {
+		//Arguments follow the order of the placeholders: factory first, then DLL.
+		GADGET_DEBUG2("Failed to load factory (%s) from DLL (%s)\n", factoryname.c_str(), dllname.c_str());
+		return 0;
+	}
+
+	T* c = cc();
+	if (!c) {
+		GADGET_DEBUG1("Failed to create component using factory\n");
+		return 0;
+	}
+	return c;
+}
diff --git a/apps/gadgetron/GadgetStreamController.h b/apps/gadgetron/GadgetStreamController.h
new file mode 100644
index 0000000..473e732
--- /dev/null
+++ b/apps/gadgetron/GadgetStreamController.h
@@ -0,0 +1,79 @@
+#ifndef GADGETSTREAMCONTROLLER_H
+#define GADGETSTREAMCONTROLLER_H
+
+#include "ace/Log_Msg.h"
+#include "ace/Reactor.h"
+#include "ace/SOCK_Stream.h"
+#include "ace/Stream.h"
+#include "ace/Message_Queue.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Reactor_Notification_Strategy.h"
+
+#include <complex>
+#include <vector>
+
+#include "Gadgetron.h"
+#include "Gadget.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronConnector.h"
+
+typedef ACE_Module<ACE_MT_SYNCH> GadgetModule;
+
+namespace Gadgetron{
+
+
+/**
+   Per-connection controller: reads messages from the client socket, configures the gadget
+   stream from them and puts the remaining messages on that stream. It deletes itself in handle_close().
+ */
+class GadgetStreamController
+: public ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_MT_SYNCH>
+{
+public:
+  //Initializers are listed in member declaration order, which is the order they run in.
+  GadgetStreamController()
+    : stream_configured_(false)
+    , writer_task_(&this->peer())
+    , notifier_ (0, this, ACE_Event_Handler::WRITE_MASK)
+    { }
+
+  virtual ~GadgetStreamController() { }
+
+  //Registers the built-in readers, opens the stream and the writer task, and
+  //registers this handler with the reactor for input events.
+  int open (void);
+
+  //Reads one message from the client and either acts on it or puts it on the stream.
+  virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+
+  //Closes the stream, clears readers and DLL handles, and deletes this object.
+  virtual int handle_close (ACE_HANDLE handle,
+                            ACE_Reactor_Mask close_mask);
+
+  //Queues a message block on the writer task.
+  virtual int output_ready(ACE_Message_Block* mb);
+
+  //Returns the gadget whose stream module is called gadget_name, or 0 if there is none.
+  virtual Gadget* find_gadget(std::string gadget_name);
+
+private:
+  ACE_Stream<ACE_MT_SYNCH> stream_;
+  bool stream_configured_;                     //set to true at the end of a successful configure()
+  WriterTask writer_task_;
+  ACE_Reactor_Notification_Strategy notifier_;
+  GadgetMessageReaderContainer readers_;       //message readers, keyed by message ID
+  std::vector<ACE_DLL_Handle*> dll_handles_;   //handles of the DLLs opened by load_dll_component()
+
+  //Builds readers, writers and gadgets from an XML stream configuration.
+  virtual int configure(std::string config_xml_string);
+  //Reads the named file from $GADGETRON_HOME/config and passes its contents to configure().
+  virtual int configure_from_file(std::string config_xml_filename);
+
+  //Loads a gadget from a DLL and wraps it in a stream module named gadget_module_name.
+  virtual GadgetModule * create_gadget_module(const char* DLL, const char* gadget, const char* gadget_module_name);
+
+  //Opens library DLL and creates a T through its exported make_<component_name> factory.
+  template <class T>  T* load_dll_component(const char* DLL, const char* component_name);
+};
+}
+#endif //GADGETSTREAMCONTROLLER_H
diff --git a/apps/gadgetron/Gadgetron.h b/apps/gadgetron/Gadgetron.h
new file mode 100644
index 0000000..3fc9e2a
--- /dev/null
+++ b/apps/gadgetron/Gadgetron.h
@@ -0,0 +1,31 @@
+#ifndef GADGETRON_H
+#define GADGETRON_H
+
+#include "ace/Log_Msg.h"
+
+//#include "Gadget.h"
+//#include "GadgetContainerMessage.h"
+
+//Return codes: GADGET_FAIL (-1) signals an error, GADGET_OK (0) success.
+#define GADGET_FAIL -1
+#define GADGET_OK    0
+
+//GADGET_DEBUG1 logs a fixed message at LM_DEBUG level, prefixed with the source file (%N) and line (%l).
+//_fmt must be a string literal because it is concatenated with the prefix at compile time.
+#define GADGET_DEBUG1(_fmt) \
+  ACE_DEBUG( (LM_DEBUG, \
+	      ACE_TEXT("[file %N, line %l] " _fmt)) ) 
+//GADGET_DEBUG2 does the same for a format string followed by its arguments.
+#define GADGET_DEBUG2(_fmt, ...) \
+  ACE_DEBUG( (LM_DEBUG, \
+	      ACE_TEXT("[file %N, line %l] " _fmt),	\
+	      __VA_ARGS__) )
+//Logs message followed by err.what() at LM_DEBUG level, prefixed with source file and line.
+//The text is passed as a %s argument so that a '%' inside it is not read as a format directive.
+#define GADGET_DEBUG_EXCEPTION(err, message) \
+	{std::string gdb; gdb += message; \
+	gdb += (err).what(); \
+  ACE_DEBUG( (LM_DEBUG, \
+	      ACE_TEXT("[file %N, line %l] %s"), gdb.c_str() ));}
+
+#endif  //GADGETRON_H
diff --git a/apps/gadgetron/GadgetronExport.h b/apps/gadgetron/GadgetronExport.h
new file mode 100644
index 0000000..d2c1c44
--- /dev/null
+++ b/apps/gadgetron/GadgetronExport.h
@@ -0,0 +1,38 @@
+#ifndef GADGETRONEXPORT_H
+#define GADGETRONEXPORT_H
+#pragma once
+
+#if defined (WIN32)
+#ifdef __BUILD_GADGETS__
+#define GADGETEXPORT __declspec(dllexport)
+#else
+#define GADGETEXPORT __declspec(dllimport)
+#endif
+#else
+#define GADGETEXPORT
+#endif
+
+//In header file add this macro: it declares class-specific operator new/delete and an inline placement new. COMPONENT is not used in the expansion.
+#define GADGETRON_LOADABLE_DECLARE(COMPONENT)                   \
+  void *operator new (size_t bytes);                            \
+  void operator delete (void *ptr);                             \
+  void *operator new(size_t s, void * p) { return p; }
+
+//In CPP file add this macro at the end: it defines the extern "C" factory make_<COMPONENT> (returns a new COMPONENT as CLASS*) and COMPONENT's operator new/delete, which allocate and free a char array.
+#define GADGETRON_LOADABLE_FACTORY_DECLARE(CLASS, COMPONENT)	\
+extern "C" GADGETEXPORT CLASS * make_##COMPONENT (void);        \
+CLASS * make_##COMPONENT (void)       				\
+{							       	\
+  return new COMPONENT;                                         \
+}                                                               \
+void * COMPONENT ::operator new (size_t bytes)                  \
+{                                                               \
+  return ::new char[bytes];                                     \
+}                                                               \
+void COMPONENT ::operator delete (void *ptr)                    \
+{                                                               \
+  delete [] static_cast <char *> (ptr);                         \
+} 
+
+
+#endif
diff --git a/apps/gadgetron/gadgetron.xml.example b/apps/gadgetron/gadgetron.xml.example
new file mode 100644
index 0000000..6b6f5f5
--- /dev/null
+++ b/apps/gadgetron/gadgetron.xml.example
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+  <port>9002</port>
+  
+</gadgetronConfiguration>
+  
\ No newline at end of file
diff --git a/apps/gadgetron/gadgetron_start.pl b/apps/gadgetron/gadgetron_start.pl
new file mode 100644
index 0000000..d9d6d7e
--- /dev/null
+++ b/apps/gadgetron/gadgetron_start.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+# Kills any running gadgetron and restarts it in the background, logging to log/gadgetron_log_<timestamp>.txt.
+use Cwd 'abs_path';
+use FindBin '$Bin';
+$gadgetron_home = $Bin . "/../";
+
+print "gadgetron_home: $gadgetron_home\n";
+
+my $executable = "$gadgetron_home/bin/gadgetron";
+my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
+my $timestring = sprintf "%4d%02d%02d_%02d%02d%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec;
+
+print "Time string: $timestring\n";
+
+# Environment handed to the server: its home directory and the library search path.
+$ENV{'GADGETRON_HOME'} = $gadgetron_home;
+$ENV{'LD_LIBRARY_PATH'} = "/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cula/lib64:" . $gadgetron_home . "/lib";
+# Stop any instance that is still running before starting a new one.
+$exe_command = "killall -9 gadgetron";
+system($exe_command);
+sleep(1);
+
+$exe_command = "mkdir -p log";
+system($exe_command);
+# 2>&1 sends stderr to the same open file as stdout; opening the log twice would let the two streams overwrite each other.
+$logfilename = "log/gadgetron_log_$timestring" . ".txt";
+$exe_command = "nohup $executable > $logfilename 2>&1 < /dev/null &" ;
+system($exe_command);
+
+sleep(1);
+
diff --git a/apps/gadgetron/main.cpp b/apps/gadgetron/main.cpp
new file mode 100644
index 0000000..50542ef
--- /dev/null
+++ b/apps/gadgetron/main.cpp
@@ -0,0 +1,94 @@
+#include "GadgetServerAcceptor.h"
+#include "FileInfo.h"
+#include "url_encode.h"
+#include "gadgetron.hxx" //Generated header file for XML configuration
+
+#include <ace/Log_Msg.h>
+#include <ace/Service_Config.h>
+#include <ace/Reactor.h>
+#include <ace/Get_Opt.h>
+#include <ace/OS_NS_string.h>
+#include <iostream>
+
+using namespace Gadgetron;
+
+void print_usage() // Logs the command-line synopsis at LM_INFO level.
+{
+	ACE_DEBUG((LM_INFO, ACE_TEXT("Usage: \n") ));
+	ACE_DEBUG((LM_INFO, ACE_TEXT("gadgetron   -p <PORT>                      (default 9002)       \n") ));
+}
+
+int ACE_TMAIN(int argc, ACE_TCHAR *argv[])
+{
+	// Server entry point: reads the listening port from $GADGETRON_HOME/config/gadgetron.xml
+	// (overridable with -p), opens a GadgetServerAcceptor on it and runs the ACE reactor loop.
+	ACE_TRACE(( ACE_TEXT("main") ));
+	ACE_LOG_MSG->priority_mask( LM_INFO | LM_NOTICE | LM_ERROR| LM_DEBUG,
+			ACE_Log_Msg::PROCESS);
+
+	char * gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+
+	// getenv() yields a null pointer when the variable is unset; test it before building a std::string.
+	if (gadgetron_home == 0 || gadgetron_home[0] == '\0') {
+		ACE_ERROR_RETURN((LM_ERROR, ACE_TEXT("GADGETRON_HOME variable not set.\n")),-1);
+	}
+
+	std::string gcfg = std::string(gadgetron_home) + std::string("/config/gadgetron.xml");
+
+	if (!FileInfo(gcfg).exists()) {
+		ACE_ERROR_RETURN((LM_ERROR, ACE_TEXT("Gadgetron configuration file %s not found.\n"), gcfg.c_str()),-1);
+	}
+
+	// Assemble the schema path in a std::string so a long GADGETRON_HOME cannot overflow a fixed buffer.
+	std::string schema_file_name = std::string(gadgetron_home) + std::string("/schema/gadgetron.xsd");
+	schema_file_name = url_encode(schema_file_name);
+
+	xml_schema::properties props;
+	props.schema_location (
+	  "http://gadgetron.sf.net/gadgetron",
+	  std::string (schema_file_name));
+
+	// Zero-filled and copied with size-1 so that port_no is always null-terminated.
+	ACE_TCHAR port_no[1024] = {0};
+	try {
+		std::auto_ptr<gadgetron::gadgetronConfiguration> cfg(gadgetron::gadgetronConfiguration_(gcfg,0,props));
+		ACE_OS_String::strncpy(port_no, cfg->port().c_str(), 1023);
+	}  catch (const xml_schema::exception& e) {
+		std::cerr << e << std::endl;
+		ACE_DEBUG(( LM_DEBUG, ACE_TEXT("XML Parse Error: %s\n"), e.what() ));
+		ACE_ERROR_RETURN((LM_ERROR, ACE_TEXT("Error parsing configuration file %s.\n"), gcfg.c_str()),-1);
+	}
+
+	static const ACE_TCHAR options[] = ACE_TEXT(":p:");
+	ACE_Get_Opt cmd_opts(argc, argv, options);
+
+	int option;
+	while ((option = cmd_opts()) != EOF) {
+		switch (option) {
+		case 'p':
+			ACE_OS_String::strncpy(port_no, cmd_opts.opt_arg(), 1023);
+			break;
+		case ':':
+			print_usage();
+			ACE_ERROR_RETURN((LM_ERROR, ACE_TEXT("-%c requires an argument.\n"), cmd_opts.opt_opt()),-1);
+			break;
+		default:
+			print_usage();
+			ACE_ERROR_RETURN( (LM_ERROR, ACE_TEXT("Command line parse error\n")), -1);
+			break;
+		}
+	}
+
+
+	ACE_DEBUG(( LM_DEBUG, ACE_TEXT("%IConfiguring services, Running on port %s\n"), port_no ));
+
+	ACE_INET_Addr port_to_listen (port_no);
+	GadgetServerAcceptor acceptor;
+	acceptor.reactor (ACE_Reactor::instance ());
+	if (acceptor.open (port_to_listen) == -1)
+		ACE_ERROR_RETURN((LM_ERROR, ACE_TEXT("Unable to listen on port %s\n"), port_no), 1);
+
+	ACE_Reactor::instance()->run_reactor_event_loop ();
+
+	return 0;
+}
diff --git a/apps/gadgetron/schema/gadgetron.xsd b/apps/gadgetron/schema/gadgetron.xsd
new file mode 100644
index 0000000..cfe50ca
--- /dev/null
+++ b/apps/gadgetron/schema/gadgetron.xsd
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<xs:schema xmlns="http://gadgetron.sf.net/gadgetron" xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" targetNamespace="http://gadgetron.sf.net/gadgetron">
+
+  <xs:element name="gadgetronConfiguration">
+    <xs:complexType>
+      <xs:sequence>
+                <xs:element name="port" type="xs:string"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+
+  <xs:element name="gadgetronStreamConfiguration">
+    <xs:complexType>
+      <xs:sequence>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="reader">
+                	<xs:complexType>
+					      <xs:sequence>
+					      	<xs:element name="slot" type="xs:unsignedShort"/>
+					      	<xs:element name="dll" type="xs:string"/>
+					      	<xs:element name="classname" type="xs:string"/>
+					      </xs:sequence>
+          			</xs:complexType>
+        		</xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="writer">
+                	<xs:complexType>
+					      <xs:sequence>
+					      	<xs:element maxOccurs="1" minOccurs="1" name="slot" type="xs:unsignedShort"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+					      </xs:sequence>
+          			</xs:complexType>
+        		</xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="gadget">
+                	<xs:complexType>
+					      <xs:sequence>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="name" type="xs:string"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+					      	<xs:element maxOccurs="unbounded" minOccurs="0" name="property">
+					      		<xs:complexType>
+					      			<xs:sequence>
+								      	<xs:element maxOccurs="1" minOccurs="1" name="name" type="xs:string"/>
+								      	<xs:element maxOccurs="1" minOccurs="1" name="value" type="xs:string"/>
+					      			</xs:sequence>		
+					      		</xs:complexType>
+              				</xs:element>
+           				  </xs:sequence>
+          </xs:complexType>
+        </xs:element>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+</xs:schema>
diff --git a/apps/gadgetron/templates/CMakeLists_GadgetLibraryExample.txt b/apps/gadgetron/templates/CMakeLists_GadgetLibraryExample.txt
new file mode 100644
index 0000000..d794865
--- /dev/null
+++ b/apps/gadgetron/templates/CMakeLists_GadgetLibraryExample.txt
@@ -0,0 +1,55 @@
+cmake_minimum_required(VERSION 2.6)
+
+project(EXAMPLELIB)
+
+if (WIN32)
+ADD_DEFINITIONS(-DWIN32 -D_WIN32 -D_WINDOWS)
+ADD_DEFINITIONS(-DUNICODE -D_UNICODE)
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3")
+endif (WIN32)
+
+###############################################################
+#Bootstrap search for libraries 
+# (We need to find cmake modules in Gadgetron)
+###############################################################
+find_path(GADGETRON_CMAKE_MODULES FindGadgetron.cmake HINTS
+$ENV{GADGETRON_HOME}/cmake
+/usr/local/gadgetron)
+
+if (NOT GADGETRON_CMAKE_MODULES)
+  MESSAGE(FATAL_ERROR "GADGETRON_CMAKE_MODULES cannot be found. 
+   Try to set GADGETRON_HOME environment variable.")
+endif(NOT GADGETRON_CMAKE_MODULES)
+
+set(CMAKE_MODULE_PATH ${GADGETRON_CMAKE_MODULES})
+###############################################################
+
+find_package(Gadgetron REQUIRED)
+find_package(Boost REQUIRED)
+find_package(ACE REQUIRED)
+find_package(Ismrmrd REQUIRED)
+
+
+set(CMAKE_INSTALL_PREFIX ${GADGETRON_HOME})
+
+INCLUDE_DIRECTORIES(${ACE_INCLUDE_DIR} 
+     ${Boost_INCLUDE_DIR}
+     ${GADGETRON_INCLUDE_DIR}
+     ${ISMRMRD_INCLUDE_DIR}
+	 ${XSD_INCLUDE_DIR})
+
+LINK_DIRECTORIES(${GADGETRON_LIB_DIR})
+
+ADD_LIBRARY(gadgetronEXAMPLELIB SHARED <<CPPFILES>> )
+
+TARGET_LINK_LIBRARIES(gadgetronEXAMPLELIB 
+                      cpucore
+                      ${ISMRMRD_LIBRARIES} ${ISMRMRD_XSD_LIBRARIES}
+                      optimized ${ACE_LIBRARIES} 
+                      debug ${ACE_DEBUG_LIBRARY})
+
+INSTALL(TARGETS gadgetronEXAMPLELIB DESTINATION lib)
+
+#INSTALL(FILES <<XMLFILES>> DESTINATION config)
+#INSTALL (FILES <<HEADERFILES>> gadgetronEXAMPLELIB_export.h DESTINATION include)
diff --git a/apps/gadgetron/templates/gadgetronEXAMPLELIB_export.h b/apps/gadgetron/templates/gadgetronEXAMPLELIB_export.h
new file mode 100644
index 0000000..e5b5810
--- /dev/null
+++ b/apps/gadgetron/templates/gadgetronEXAMPLELIB_export.h
@@ -0,0 +1,21 @@
+/*
+ * gadgetronEXAMPLELIB_export.h
+ *
+ */
+
+#ifndef GADGETRONEXAMPLELIB_EXPORT_H_
+#define GADGETRONEXAMPLELIB_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_EXAMPLELIB__) || defined (gadgetronEXAMPLELIB_EXPORTS)
+#define EXPORTGADGETSEXAMPLELIB __declspec(dllexport)
+#else
+#define EXPORTGADGETSEXAMPLELIB __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSEXAMPLELIB
+#endif
+
+
+#endif /* GADGETRONEXAMPLELIB_EXPORT_H_ */
diff --git a/apps/gadgetron/upstart/gadgetron.conf b/apps/gadgetron/upstart/gadgetron.conf
new file mode 100644
index 0000000..e957c65
--- /dev/null
+++ b/apps/gadgetron/upstart/gadgetron.conf
@@ -0,0 +1,36 @@
+description "Gadgetron Upstart Script - Starts and Stops Gadgetron server"
+version "1.0"
+author "Michael S. Hansen (michael.hansen at nih.gov)"
+
+start on filesystem or runlevel [2345]
+stop on runlevel [!2345]
+
+expect fork
+
+# configuration variables.
+env GADGETRON_HOME=/usr/local/gadgetron
+env GADGETRON_USER=gadgetron
+
+#Log output to log file (/var/log/upstart/gadgetron.log)
+console log
+
+pre-start script
+#We will make the log file world readable to make it easier for users (without sudo privileges) to monitor
+touch /var/log/upstart/gadgetron.log
+chmod o+r /var/log/upstart/gadgetron.log
+end script
+
+script
+
+export LD_LIBRARY_PATH="${GADGETRON_HOME}/lib:/usr/local/ismrmrd/lib:/usr/local/cuda/lib64:/usr/local/cula/lib64"
+export PATH=$PATH:${GADGETRON_HOME}/bin
+
+#Start as GADGETRON_USER
+exec su -s /bin/sh -c ${GADGETRON_HOME}/bin/gadgetron ${GADGETRON_USER} &
+
+# create a custom event in case we want to chain later
+emit gadgetron_running
+end script
+
+#respawn if process dies or is killed
+respawn
diff --git a/apps/gadgetron/webapp/gadgetron_web.conf b/apps/gadgetron/webapp/gadgetron_web.conf
new file mode 100644
index 0000000..14a9e7a
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web.conf
@@ -0,0 +1,15 @@
+description     "Foobar management daemon"
+author          "Alex Smith"
+
+start on started network
+stop on stopping network
+stop on starting shutdown
+
+console output
+kill signal INT
+
+exec su -c "python /usr/local/gadgetron/bin/gadgetron_web_app.py /usr/local/gadgetron/config/gadgetron_web_app.cfg" hansenms
+
+respawn
+
+respawn
diff --git a/apps/gadgetron/webapp/gadgetron_web_app.cfg b/apps/gadgetron/webapp/gadgetron_web_app.cfg
new file mode 100644
index 0000000..03c8726
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web_app.cfg
@@ -0,0 +1,8 @@
+[WEBSERVER]
+port=8090
+
+[GADGETRON]
+port=9002
+GADGETRON_HOME=/usr/local/gadgetron
+ISMRMRD_HOME=/usr/local/ismrmrd
+logfile=/tmp/gadgetron.log
\ No newline at end of file
diff --git a/apps/gadgetron/webapp/gadgetron_web_app.py b/apps/gadgetron/webapp/gadgetron_web_app.py
new file mode 100644
index 0000000..9b98ae8
--- /dev/null
+++ b/apps/gadgetron/webapp/gadgetron_web_app.py
@@ -0,0 +1,176 @@
+from twisted.web import server, resource, static
+from twisted.internet import reactor
+
+import subprocess
+import time
+import sys
+import ConfigParser
+import os
+import platform
+import threading 
+import signal
+import psutil
+import inspect
+
+run_gadgetron_check = True
+
+def ctrlcsignal(signal, frame):
+    global reactor
+    global run_gadgetron_check
+    print "Shutting down server (SIGINT)"
+    run_gadgetron_check = False
+    reactor.stop()
+
+def termsignal(signal, frame):
+    global reactor
+    global run_gadgetron_check
+    print "Shutting down server (TERM)"
+    run_gadgetron_check = False
+    reactor.stop()
+
+def isGadgetronAlive(port,environment):
+    process = subprocess.Popen(["gt_alive","localhost",str(port)], env=environment)
+    
+    time.sleep(1)
+    ret = process.poll()
+    if ret == None:
+        #Process is hanging
+        process.kill()
+        return -1
+    elif ret != 0:
+        #Failed to connect
+        return -1
+    else:
+        return 0
+
+
+class GadgetronResource(resource.Resource):
+    isLeaf = True
+    numberRequests = 0
+    gadgetron_log_filename = 'gadgetron_log.txt'
+    gadgetron_process = 0
+    environment = 0;
+    gadgetron_port = 9002
+    check_thread = 0
+    run_gadgetron_check = True
+    process_lock = threading.Lock()
+
+    def __init__(self, cfgfilename):
+        config = ConfigParser.RawConfigParser()
+        config.read(cfgfilename)
+        gadgetron_home = config.get('GADGETRON', 'GADGETRON_HOME')
+        ismrmrd_home = config.get('GADGETRON', 'ISMRMRD_HOME')
+        self.gadgetron_log_filename = config.get('GADGETRON','logfile')
+        self.gadgetron_port = config.get('GADGETRON','port')
+        gf = open(self.gadgetron_log_filename,"w")
+        
+        self.environment = dict()
+        self.environment["GADGETRON_HOME"]=gadgetron_home
+        self.environment["PATH"]=self.environment["GADGETRON_HOME"] + "/bin"
+
+        if (platform.system() == 'Linux'):
+            self.environment["LD_LIBRARY_PATH"]="/usr/local/cuda/lib64:/usr/local/cula/lib64:" +  self.environment["GADGETRON_HOME"] + "/lib:" + ismrmrd_home + "/lib"  
+        elif (platform.system() == 'Darwin'):
+            self.environment["DYLD_LIBRARY_PATH"]="/usr/local/cuda/lib64:/usr/local/cula/lib64:" +  self.environment["GADGETRON_HOME"] + "/lib:" + ismrmrd_home + "/lib:/opt/local/lib"  
+
+        #self.process_lock.acquire()
+        self.gadgetron_process = subprocess.Popen(["gadgetron","-p",self.gadgetron_port], env=self.environment,stdout=gf,stderr=gf)
+        #self.process_lock.release()
+        resource.Resource.__init__(self)
+        
+        self.check_thread = threading.Thread(target=self.check_gadgetron)
+        self.check_thread.start()
+
+    def __del__(self):
+        self.run_gadgetron_check = False
+        self.check_thread.join()
+        self.gadgetron_process.terminate()
+
+    def restart_gadgetron(self):
+        self.process_lock.acquire()
+        s = self.gadgetron_process.poll()
+        if (s == None):
+            self.gadgetron_process.kill()
+            time.sleep(2)
+        gf = open(self.gadgetron_log_filename,"w")
+        self.gadgetron_process = subprocess.Popen(["gadgetron","-p",self.gadgetron_port], env=self.environment,stdout=gf,stderr=gf)
+        time.sleep(2)
+        self.process_lock.release()
+
+    def check_gadgetron(self):
+        global run_gadgetron_check
+        while (run_gadgetron_check):
+            self.process_lock.acquire()
+            s = self.gadgetron_process.poll()
+            self.process_lock.release()
+            if (s != None):
+                self.restart_gadgetron()
+            time.sleep(3)
+        
+
+    def render_page(self):
+        doc = "<html>\n<body>\n"
+        doc += "<h1>Gadgetron Monitor</h1>\n"
+
+        alive = (isGadgetronAlive(self.gadgetron_port,self.environment) == 0)
+
+        doc += "<div>Gadgetron Status: "
+
+        if (alive):
+            doc += "<span style=\"color: green;\">[OK]</span></div>"
+        else:
+            doc += "<span style=\"color: red;\">[UNRESPONSIVE]</span></div>"
+            
+        doc += "<div><p><span><form method=\"POST\"><input type=\"submit\" value=\"RESTART\"><input type=\"hidden\" name=\"command\" value=\"restart\"></form></span></div>"
+        doc += "<div><p><span><form method=\"POST\"><input type=\"submit\" value=\"REFRESH\"><input type=\"hidden\" name=\"command\" value=\"refresh\"></form></span></div>"
+        if (alive):
+            p = psutil.Process(self.gadgetron_process.pid)
+            doc += "<div><ul>"
+            doc += "<li>Process ID: " + str(self.gadgetron_process.pid) + "</li>"
+            doc += "<li>CPU Percent: " + str(round(p.get_cpu_percent(),2)) + "</li>"
+            doc += "<li>Memory Percent: " + str(round(p.get_memory_percent(),2)) + "</li>"
+            doc += "</ul></div>"
+
+            doc += "<div><iframe width=\"1024\" height=\"768\" src=\"/log\"></iframe></div>" 
+        
+        doc += "</body>\n</html>"
+        return doc
+
+        
+    def render_GET(self, request):
+        return self.render_page()
+        
+    def render_POST(self, request):
+        if 'command' in request.args:
+            if request.args['command'] == ['restart']:
+                print "Restarting Gadgetron"
+                self.restart_gadgetron()
+
+        return self.render_page()
+
+class GadgetronLogResource(resource.Resource):
+    filename = ""
+
+    def __init__(self, logfilename):
+        self.filename = logfilename
+        resource.Resource.__init__(self)
+
+    def render_GET(self, request):
+        gf = open(self.filename,"r")
+        l = gf.read()
+        return "<html><body><pre style=\"font-size: 8px\">" + l + "</pre></body></html>"
+
+config = ConfigParser.RawConfigParser()
+config.read(sys.argv[1])  # the configuration file path is the first command-line argument
+gadgetron_home = config.get('GADGETRON', 'GADGETRON_HOME')
+port = int(config.get('WEBSERVER','port'))
+# Resource tree: 'gadgetron' is the status/control page, 'log' serves the configured log file.
+root = resource.Resource()
+root.putChild('gadgetron',GadgetronResource(sys.argv[1]))
+root.putChild('log', GadgetronLogResource(config.get('GADGETRON','logfile')))
+# NOTE(review): termsignal announces "(TERM)" but is registered for SIGHUP here -- confirm the intended signal.
+signal.signal(signal.SIGINT, ctrlcsignal)
+signal.signal(signal.SIGHUP, termsignal)
+
+reactor.listenTCP(port, server.Site(root))
+reactor.run()
diff --git a/apps/matlab/mexGT.h b/apps/matlab/mexGT.h
new file mode 100644
index 0000000..3c61ea7
--- /dev/null
+++ b/apps/matlab/mexGT.h
@@ -0,0 +1,580 @@
+/*
+ * @(#)mex.h    generated by: makeheader 4.21  Fri Apr 23 18:16:45 2004
+ *
+ *		built from:	../../src/include/copyright.h
+ *				../../src/include/pragma_interface.h
+ *				mex_typedefs.h
+ *				./fmexapi.cpp
+ *				./fmexapiv5.cpp
+ *				./globals.cpp
+ *				./mexapi.cpp
+ *				./mexapiv4.cpp
+ *				./mexapiv5.cpp
+ *				./mexcbk.cpp
+ *				./mexdispatch.cpp
+ *				./mexintrf.cpp
+ *				mexdbg.h
+ */
+
+#ifndef mex_h
+#define mex_h
+
+
+/*
+ * Copyright 1984-2003 The MathWorks, Inc.
+ * All Rights Reserved.
+ */
+
+
+
+/*
+ * Prevent g++ from making copies of vtable and typeinfo data
+ * in every compilation unit.  By allowing for only one, we can
+ * save space and prevent some situations where the linker fails
+ * to coalesce them properly into a single entry.
+ *
+ * References:
+ *    http://gcc.gnu.org/onlinedocs/gcc/Vague-Linkage.html#Vague%20Linkage
+ *    http://gcc.gnu.org/onlinedocs/gcc/C---Interface.html
+ */
+
+#ifdef __cplusplus
+#  ifdef GLNX86
+#    pragma interface
+#  endif
+#endif
+
+
+
+/* $Revision: 1.7 $ */
+#ifndef mex_typedefs_h
+#define mex_typedefs_h
+typedef struct impl_info_tag *MEX_impl_info;
+
+#include "matrix.h"
+
+typedef struct mexGlobalTableEntry_Tag
+{
+    const char *name;             /* The name of the global */
+    mxArray    **variable;        /* A pointer to the variable */ 
+} mexGlobalTableEntry, *mexGlobalTable;
+
+#if defined(MSWIND)
+#define cicompare(s1,s2) utStrcmpi((s1),(s2))
+#else
+#define cicompare(s1,s2) strcmp((s1),(s2))
+#endif
+#define cscompare(s1,s2) strcmp((s1),(s2))
+
+typedef struct mexFunctionTableEntry_tag {
+  const char *  name;
+  mxFunctionPtr f;
+  int           nargin;
+  int           nargout;
+  struct _mexLocalFunctionTable *local_function_table;
+} mexFunctionTableEntry, *mexFunctionTable;
+
+typedef struct _mexLocalFunctionTable {
+  size_t           length;
+  mexFunctionTable entries;
+} _mexLocalFunctionTable, *mexLocalFunctionTable;
+
+typedef struct {
+  void (*initialize)(void);
+  void (*terminate)(void);
+} _mexInitTermTableEntry, *mexInitTermTableEntry;
+
+#define MEX_INFORMATION_VERSION 1
+
+typedef struct {
+  int                   version;
+  int                   file_function_table_length;
+  mexFunctionTable      file_function_table;
+  int                   global_variable_table_length;
+  mexGlobalTable        global_variable_table;
+  int                   npaths;
+  const char **         paths;
+  int                   init_term_table_length;
+  mexInitTermTableEntry init_term_table;
+} _mex_information, *mex_information;
+
+typedef mex_information(*fn_mex_file)(void);
+
+typedef void (*fn_clean_up_after_error)(void);
+typedef const char *(*fn_simple_function_to_string)(mxFunctionPtr f);
+
+typedef void (*fn_mex_enter_mex_library)(mex_information x);
+typedef fn_mex_enter_mex_library fn_mex_exit_mex_library;
+
+typedef mexLocalFunctionTable (*fn_mex_get_local_function_table)(void);
+typedef mexLocalFunctionTable (*fn_mex_set_local_function_table)(mexLocalFunctionTable);
+
+#endif
+
+
+/*
+ * This header file "mex.h" declares all the types, macros and
+ * functions necessary to interface mex files with the current
+ * version of MATLAB.  See the release notes for information on 
+ * supporting syntax from earlier versions.
+ */  
+#include "matrix.h"
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern"C"__declspec(dllexport)
+#endif
+
+void mexFunction(
+    int           nlhs,           /* number of expected outputs */
+    mxArray       *plhs[],        /* array of pointers to output arguments */
+    int           nrhs,           /* number of inputs */
+    const mxArray *prhs[]         /* array of pointers to input arguments */
+);
+#ifdef __cplusplus
+#endif
+
+/*#ifdef __cpluslus
+#define _MEXFUNCTION extern"C"__declspec(dllexport)
+#else
+#define _MEXFUNCTION __declspec(dllexport)
+#endif
+
+_MEXFUNCTION void mexFunction(
+    int           nlhs,           
+    mxArray       *plhs[],        
+    int           nrhs,           
+    const mxArray *prhs[]         
+);*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Issue error message and return to MATLAB prompt
+ */
+extern void mexErrMsgTxt(
+    const char	*error_msg	/* string with error message */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Issue formatted error message with corresponding error identifier and return to MATLAB
+ * prompt.
+ */
+extern void mexErrMsgIdAndTxt(
+    const char * identifier, /* string with error message identifier */
+    const char * err_msg,    /* string with error message printf-style format */
+    ...                      /* any additional arguments */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Invoke an unidentified warning. Such warnings can only be affected by the M-code
+ * 'warning * all', since they have no specific identifier. See also mexWarnMsgIdAndTxt.
+ */
+extern void mexWarnMsgTxt(
+    const char	*warn_msg	/* string with warning message */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Invoke a warning with message identifier 'identifier' and message derived from 'fmt' and
+ * subsequent arguments. The warning may either get printed as is (if it is set to 'on'), or
+ * not actually get printed (if set to 'off'). See 'help warning' in MATLAB for more
+ * details.
+ */
+extern void mexWarnMsgIdAndTxt(
+    const char * identifier,    /* string with warning message identifier */
+    const char * warn_msg,	/* string with warning message printf-style format */
+    ...                         /* any additional arguments */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * mex equivalent to MATLAB's "disp" function
+ */
+extern int mexPrintf(
+    const char	*fmt,	/* printf style format */
+    ...				/* any additional arguments */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+
+#define printf mexPrintf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Remove all components of an array plus the array header itself
+ * from MATLAB's memory allocation list.  The array will now
+ * persist between calls to the mex function.  To destroy this
+ * array, you will need to explicitly call mxDestroyArray().
+ */
+extern void mexMakeArrayPersistent(
+    mxArray *pa              /* pointer to array */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Remove memory previously allocated via mxCalloc from MATLAB's
+ * memory allocation list.  To free this memory, you will need to
+ * explicitly call mxFree().
+ */
+extern void mexMakeMemoryPersistent(void *ptr);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Look up a function and return an opaque handle for use with
+ * mexCallMATLABFunction.
+ */
+extern void mexGetFunctionHandle(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Call a function whose handle was determined by mexGetFunctionHandle.
+ */
+extern void mexCallMATLABFunction(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Register a function pointer as a MATLAB-callable function.
+ */
+extern void mexRegisterFunction(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * mex equivalent to MATLAB's "set" function
+ */
+extern int mexSet(double handle, const char *property, mxArray *value);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* API interface which mimics the "get" function */
+extern const mxArray *mexGet(double handle, const char *property);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * call MATLAB function
+ */
+extern int mexCallMATLAB(
+    int		nlhs,			/* number of expected outputs */
+    mxArray	*plhs[],		/* pointer array to outputs */
+    int		nrhs,			/* number of inputs */
+    mxArray	*prhs[],		/* pointer array to inputs */
+    const char	*fcn_name		/* name of function to execute */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * set or clear mexCallMATLAB trap flag (if set then an error in  
+ * mexCallMATLAB is caught and mexCallMATLAB will return a status value, 
+ * if not set an error will cause control to revert to MATLAB)
+ */
+extern void mexSetTrapFlag(int flag);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Perform in-place subscript assignment.
+ */
+extern void mexSubsAssign(
+      mxArray *plhs, /* pointer to lhs, to be modified in-place */
+      const mxArray *prhs, /* pointer to rhs */
+      const mxArray *subs[], /* array of subscripts for lhs */
+      int nsubs     /* number of subscripts */
+      );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Retrieve a specified subset of an array.
+ */
+extern mxArray *mexSubsReference(
+      const mxArray *prhs, /* pointer to rhs */
+      const mxArray *subs[], /* array of subscripts for rhs */
+      int nsubs /* number of subscripts */
+      );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Print an assertion-style error message and return control to the
+ * MATLAB command line.
+ */ 
+extern void mexPrintAssertion(
+		const char *test, 
+		const char *fname, 
+		int linenum, 
+		const char *message);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Tell whether or not a mxArray is in MATLAB's global workspace.
+ */
+extern bool mexIsGlobal(const mxArray *pA);
+#ifdef __cplusplus
+}
+#endif
+
+
+#define mexGetGlobal()    mexGetGlobal_is_obsolete
+#define mxSetString()     mxSetString_is_obsolete
+#define mxSetDispMode()   mxSetDispMode_is_obsolete
+#define mexGetMatrixPtr() mexGetMatrixPtr_is_obsolete
+#define mexGetMatrix()    mexGetMatrix_is_obsolete
+#define mexPutMatrix()    mexPutMatrix_is_obsolete
+#define mexPutFull()      mexPutFull_is_obsolete
+#define mexGetFull()      mexGetFull_is_obsolete
+#define mexGetEps()       mexGetEps_is_obsolete
+#define mexGetInf()       mexGetInf_is_obsolete
+#define mexGetNaN()       mexGetNaN_is_obsolete
+#define mexIsFinite()     mexIsFinite_is_obsolete
+#define mexIsInf()        mexIsInf_is_obsolete
+#define mexIsNaN()        mexIsNaN_is_obsolete
+
+
+/*
+ * mexAddFlops is no longer allowed.  
+ */
+#define mexAddFlops(x) mexAddFlops_is_obsolete
+
+#if defined(V5_COMPAT)
+#define mexPutArray(parray, workspace) mexPutVariable(workspace, mxGetName(parray), parray)
+#define mexGetArray(name, workspace) mexGetVariable(workspace, name)
+#define mexGetArrayPtr(name, workspace) mexGetVariablePtr(workspace, name)
+#else
+#define mexPutArray() mexPutArray_is_obsolete
+#define mexGetArray() mexGetArray_is_obsolete
+#define mexGetArrayPtr() mexGetArrayPtr_is_obsolete
+#endif /* defined(V5_COMPAT) */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Place a copy of the array value into the specified workspace with the
+ * specified name
+ */
+extern int mexPutVariable(
+    const char *workspace,
+    const char *name,
+    const mxArray *parray		/* matrix to copy */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * return a pointer to the array value with the specified variable
+ * name in the specified workspace
+ */
+extern const mxArray *mexGetVariablePtr(
+    const char *workspace,
+    const char *name		/* name of symbol */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * return a copy of the array value with the specified variable
+ * name in the specified workspace
+ */
+extern mxArray *mexGetVariable(
+    const char	*workspace,		
+    const char  *name                /* name of variable in question */
+    );
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Lock a MEX-function so that it cannot be cleared from memory.
+ */
+extern void mexLock(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Unlock a locked MEX-function so that it can be cleared from memory.
+ */
+extern void mexUnlock(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Return true if the MEX-function is currently locked, false otherwise.
+ */
+extern bool mexIsLocked(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Return the name of the MEX-function currently executing.
+ */
+extern const char *mexFunctionName(void);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Parse and execute MATLAB syntax in string.  Returns zero if successful,
+ * and a non zero value if an error occurs.
+ */
+extern int mexEvalString(
+   const char *str	   /* matlab command string */
+);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Register Mex-file's At-Exit function (accessed via MEX callback)
+ */
+extern int mexAtExit(
+    void	(*exit_fcn)(void)
+    );
+#ifdef __cplusplus
+}
+#endif
+
+
+#define NEW_DISPATCHER_EVAL_CALLER 1
+
+
+/* $Revision: 1.9 $ */
+#ifdef ARGCHECK
+
+#include "mwdebug.h" /* Prototype _d versions of API functions */
+
+#define mexAtExit(exitfcn) 				mexAtExit_d(exitfcn, __FILE__, __LINE__)
+#define mexCallMATLAB(nlhs, plhs, nrhs, prhs, fcn) mexCallMATLAB_d(nlhs, plhs, nrhs, prhs, fcn, __FILE__, __LINE__)
+#define mexErrMsgTxt(errmsg)			mexErrMsgTxt_d(errmsg, __FILE__, __LINE__)
+#define mexEvalString(str) 				mexEvalString_d(str, __FILE__, __LINE__)
+#define mexGet(handle, property) 		mexGet_d(handle, property, __FILE__, __LINE__)
+#define mexGetVariable(workspace, name) 	mexGetVariable_d(workspace, name, __FILE__, __LINE__)
+#define mexGetVariablePtr(workspace, name)      mexGetVariablePtr_d(workspace, name, __FILE__, __LINE__)
+#define mexIsGlobal(pa)                 mexIsGlobal_d(pa, __FILE__, __LINE__)
+#define mexMakeArrayPersistent(pa) 		mexMakeArrayPersistent_d(pa, __FILE__, __LINE__)              
+#define mexMakeMemoryPersistent(ptr) 	mexMakeMemoryPersistent_d(ptr, __FILE__, __LINE__)
+#define mexPutVariable(workspace, name, pa) 	mexPutVariable_d(workspace, name, pa, __FILE__, __LINE__)
+#define mexSet(handle, property, value) mexSet_d(handle, property, value, __FILE__, __LINE__)
+#define mexSetTrapFlag(value)           mexSetTrapFlag_d(value, __FILE__, __LINE__)
+#define mexSubsAssign(plhs, sub, nsubs, rhs)    mexSubsAssign_d(plhs, sub, nsubs, rhs, __FILE__, __LINE__)
+#define mexSubsReference(prhs, sub, nsubs)    mexSubsReference_d(prhs, sub, nsubs, __FILE__, __LINE__)
+#define mexWarnMsgTxt(str)		 		mexWarnMsgTxt_d(str, __FILE__, __LINE__)
+#endif
+
+#endif /* mex_h */
diff --git a/apps/standalone/CMakeLists.txt b/apps/standalone/CMakeLists.txt
new file mode 100644
index 0000000..1bb04a3
--- /dev/null
+++ b/apps/standalone/CMakeLists.txt
@@ -0,0 +1,7 @@
+if(ARMADILLO_FOUND)   # the cpu subtree is only added when Armadillo was found
+  add_subdirectory(cpu)
+endif()
+
+if(CUDA_FOUND)        # the gpu subtree is only added when CUDA was found
+  add_subdirectory(gpu)
+endif()
diff --git a/apps/standalone/cpu/CMakeLists.txt b/apps/standalone/cpu/CMakeLists.txt
new file mode 100644
index 0000000..a00ffab
--- /dev/null
+++ b/apps/standalone/cpu/CMakeLists.txt
@@ -0,0 +1,35 @@
+include_directories(
+  ${ARMADILLO_INCLUDE_DIRS}                                # external packages
+  ${Boost_INCLUDE_DIR}
+  ${FFTW3_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu               # directories inside the source tree
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+  ${CMAKE_SOURCE_DIR}/gadgets/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+  ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+  ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+  ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+  ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+  ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/matlab
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron
+  ${CMAKE_SOURCE_DIR}/apps/matlab
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/gtPlus
+)
+
+# add_subdirectory(MRI)          (disabled)
+add_subdirectory(denoising)
+# add_subdirectory(deblurring)   (disabled)
+add_subdirectory(registration)
+add_subdirectory(gtplus)
\ No newline at end of file
diff --git a/apps/standalone/cpu/denoising/2d/CMakeLists.txt b/apps/standalone/cpu/denoising/2d/CMakeLists.txt
new file mode 100644
index 0000000..ed75345
--- /dev/null
+++ b/apps/standalone/cpu/denoising/2d/CMakeLists.txt
@@ -0,0 +1,25 @@
+if(WIN32)   # presumably needed for M_PI-style constants on Windows compilers - confirm
+  add_definitions(-D_USE_MATH_DEFINES)
+endif()
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+)
+
+add_executable(cpu_denoise_TV denoise_TV.cpp)   # total-variation denoising driver
+
+target_link_libraries(cpu_denoise_TV
+  cpucore
+  cpucore_math
+  hostutils
+  ${ARMADILLO_LIBRARIES}
+)
+
+install(TARGETS cpu_denoise_TV DESTINATION bin)
diff --git a/apps/standalone/cpu/denoising/2d/denoise_TV.cpp b/apps/standalone/cpu/denoising/2d/denoise_TV.cpp
new file mode 100644
index 0000000..0a82d3d
--- /dev/null
+++ b/apps/standalone/cpu/denoising/2d/denoise_TV.cpp
@@ -0,0 +1,117 @@
+/*
+  Total variation denoising based on the paper 
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+*/
+
+// Gadgetron includes
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoSbCgSolver.h"
+#include "hoIdentityOperator.h"
+#include "hoPartialDerivativeOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Command-line driver: load a 2D image, denoise it with the split-Bregman TV solver and
+  // write the result. Returns 0 on success; 1 on missing parameters, bad input or solver failure.
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Noisy image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "denoised_image_TV.real" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "10" );
+  parms.add_parameter( 'm', COMMAND_LINE_FLOAT,  1, "Regularization weight (mu)", true, "25.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running denoising with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_real> > data = 
+    read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+  if( !data.get() ){
+    cout << endl << "Input image not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( data->get_number_of_dimensions() != 2 ){
+    cout << endl << "Input image is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  _real mu = (_real) parms.get_parameter('m')->get_float_value();
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+  _real lambda = (_real)2.0*mu; // This is a good all-round setting according to Goldstein et al.
+
+  size_t num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  size_t num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  size_t num_outer_iterations = parms.get_parameter('O')->get_int_value();
+
+  // Setup regularization operators (partial derivatives along dimension 0 and 1)
+  boost::shared_ptr< hoPartialDerivativeOperator<_real,2> > Rx( new hoPartialDerivativeOperator<_real,2>(0) );
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data->get_dimensions().get());
+  Rx->set_codomain_dimensions(data->get_dimensions().get());
+
+  boost::shared_ptr< hoPartialDerivativeOperator<_real,2> > Ry( new hoPartialDerivativeOperator<_real,2>(1) );
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data->get_dimensions().get());
+  Ry->set_codomain_dimensions(data->get_dimensions().get());
+
+  // Define encoding operator (identity)
+  boost::shared_ptr< identityOperator<hoNDArray<_real> > > E( new identityOperator<hoNDArray<_real> >() );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data->get_dimensions().get());
+  E->set_codomain_dimensions(data->get_dimensions().get());
+
+  // Setup split-Bregman solver
+  hoSbCgSolver<_real> sb;
+  sb.set_encoding_operator( E );
+  // For anisotropic denoising use sb.add_regularization_operator( Rx ) / ( Ry ) instead of the group below
+  sb.add_regularization_group_operator( Rx ); // Isotropic denoising
+  sb.add_regularization_group_operator( Ry); // Isotropic denoising
+  sb.add_group();
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( hoCgSolver<_real>::OUTPUT_VERBOSE );
+
+  // Setup inner conjugate gradient solver
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( hoCgSolver<_real>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver; never hand an empty result to the file writer
+  boost::shared_ptr< hoNDArray<_real> > sbresult = sb.solve(data.get());
+  if( !sbresult.get() ){
+    cout << endl << "Split-Bregman solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // All done, write out the result
+  write_nd_array<_real>(sbresult.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  return 0;
+}
diff --git a/apps/standalone/cpu/denoising/CMakeLists.txt b/apps/standalone/cpu/denoising/CMakeLists.txt
new file mode 100644
index 0000000..5c4cec9
--- /dev/null
+++ b/apps/standalone/cpu/denoising/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(2d)  # builds the 2D total-variation denoising executable (cpu_denoise_TV)
diff --git a/apps/standalone/cpu/gtplus/CMakeLists.txt b/apps/standalone/cpu/gtplus/CMakeLists.txt
new file mode 100644
index 0000000..7bab6b2
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/CMakeLists.txt
@@ -0,0 +1,56 @@
+
+# matlab wrapper
+if (MATLAB_FOUND)
+
+    message("MATLAB FOUND: matlab wrapper for gtplus toolbox will be compiled.")
+
+    SET(CMAKE_DEBUG_POSTFIX)  # unset any debug postfix for the MEX targets below
+
+    include_directories( ${MATLAB_INCLUDE_DIR}  ${ISMRMRD_INCLUDE_DIR} )
+
+    link_directories(${Boost_LIBRARY_DIRS})
+    link_libraries(${MATLAB_LIBRARIES} 
+                    optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+                    gtplus 
+                    cpucore 
+                    cpucore_math)
+
+    if (WIN32)  # MATLAB_SUFFIX = MEX file extension for the target platform
+        if ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexw64")
+        else ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexw32")
+        endif ( HAS_64_BIT )
+    endif (WIN32)
+
+    if (UNIX)
+        if ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexa64")
+        else ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexglx")
+        endif ( HAS_64_BIT )
+    endif (UNIX)
+
+    if (APPLE)  # evaluated after UNIX so that it overrides the generic Unix suffix in both bitnesses
+        if ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexmaci64")
+        else ( HAS_64_BIT )
+            SET(MATLAB_SUFFIX ".mexmaci")
+        endif ( HAS_64_BIT )
+    endif(APPLE)
+
+    if ( MKL_FOUND )
+        # coil map 2D
+        add_library(Matlab_compute_coil_map_2D SHARED Matlab_compute_coil_map_2D.cpp)
+        SET_TARGET_PROPERTIES(Matlab_compute_coil_map_2D PROPERTIES SUFFIX ${MATLAB_SUFFIX})
+        install(TARGETS Matlab_compute_coil_map_2D DESTINATION bin )
+
+        # coil map 3D
+        add_library(Matlab_compute_coil_map_3D SHARED Matlab_compute_coil_map_3D.cpp)
+        SET_TARGET_PROPERTIES(Matlab_compute_coil_map_3D PROPERTIES SUFFIX ${MATLAB_SUFFIX})
+        install(TARGETS Matlab_compute_coil_map_3D DESTINATION bin )
+    endif ( MKL_FOUND )
+
+else(MATLAB_FOUND)
+    message("MATLAB NOT FOUND: matlab wrapper for gtplus toolbox will not be compiled.")
+endif(MATLAB_FOUND)
diff --git a/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_2D.cpp b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_2D.cpp
new file mode 100644
index 0000000..7b2d8d3
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_2D.cpp
@@ -0,0 +1,136 @@
+
+#include <matrix.h>
+#include <mat.h>
+#ifdef _WIN32
+    #include <mexGT.h>
+#else
+    #include <mex.h>
+#endif // _WIN32
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtMatlabConverter.h"
+#include "gtMatlabConverterComplex.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+// Prints the calling convention of compute_coil_map_2D through mexPrintf.
+// The help text is a static string literal instead of a std::ostrstream: str() on an
+// ostrstream freezes its buffer, which leaks on every call unless freeze(false) follows.
+static void usage()
+{
+    static const char* const helpText =
+        "==============================================================================================\n"
+        "Usage: compute_coil_map_2D \n"
+        "6 Input paras:\n"
+        "\tcomplexIm  : RO*E1*CHA*N, 2D complex image array, in complex float\n"
+        "\talgo       : ISMRMRD_SOUHEIL or ISMRMRD_SOUHEIL_ITER\n"
+        "\tks         : kernel size, used by both methods\n"
+        "\tpower      : number of times to perform power method, used by ISMRMRD_SOUHEIL\n"
+        "\titerNum    : number of maximal iteration times, used by ISMRMRD_SOUHEIL_ITER\n"
+        "\tthres      : threshold of iteration, used by ISMRMRD_SOUHEIL_ITER\n"
+        "1 Output para:\n"
+        "\tcoilMap    : RO*E1*CHA*N coil map\n"
+        "==============================================================================================\n";
+
+    // The format string contributes the final newline after the closing rule.
+    mexPrintf("%s\n", helpText);
+}
+
+// MEX entry point: estimates coil maps for a complex image array with coilMap2DNIH().
+// A wrong argument count, a wrong array rank or a failed computation ends in a warning with no output assigned.
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != 6) 
+        {
+            mexWarnMsgTxt("6 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < 1 )
+        {
+            mexWarnMsgTxt("1 output argument is required ...");
+            usage();
+            return;
+        }
+
+        typedef std::complex<float> ValueType;
+
+        Gadgetron::GadgetronTimer timer("Running coil map estimation");
+
+        Gadgetron::gtMatlabConverter<float> converter;
+        Gadgetron::gtMatlabConverterComplex<ValueType> converterComplex;
+
+        Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // target: an input that is not complex single only triggers a warning; processing continues
+        if ( !mxIsSingle(prhs[0]) || !mxIsComplex(prhs[0]) )
+        {
+            mexWarnMsgTxt("The first input parameter should be a complex single array ...");
+        }
+
+        mwSize nDim = mxGetNumberOfDimensions(prhs[0]);
+        if ( nDim!=3 && nDim!=4 )
+        {
+            mexWarnMsgTxt("1st array is not a 3D or 4D array");
+            return;
+        }
+
+        // algo: anything other than the exact string "ISMRMRD_SOUHEIL" selects the iterative variant
+        Gadgetron::gtPlus::ISMRMRDCOILMAPALGO algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL_ITER;
+        std::string algoStr;
+        converter.Matlab2Str(prhs[1], algoStr);
+        if ( algoStr == "ISMRMRD_SOUHEIL" )
+        {
+            algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL;
+        }
+
+        // ks (MATLAB scalars arrive as double; convert explicitly)
+        unsigned long long ks = static_cast<unsigned long long>(mxGetScalar(prhs[2]));
+
+        // power
+        unsigned long long power = static_cast<unsigned long long>(mxGetScalar(prhs[3]));
+
+        // iterNum
+        unsigned long long iterNum = static_cast<unsigned long long>(mxGetScalar(prhs[4]));
+
+        // thres
+        float thres = static_cast<float>(mxGetScalar(prhs[5]));
+
+        // ---------------------------------------------------------------
+        // perform the computation
+        // ---------------------------------------------------------------
+        Gadgetron::hoNDArray<ValueType> complexIm;
+        converterComplex.Matlab2hoNDArray(prhs[0], complexIm);
+
+        Gadgetron::hoNDArray<ValueType> coilMap;
+
+        if ( !gtPlus_util_complex_.coilMap2DNIH(complexIm, coilMap, algo, ks, power, iterNum, thres, true) )
+        {
+            mexWarnMsgTxt("coilMap2DNIH(...) failed ... ");
+            return;
+        }
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        mxArray* coilMapMx = NULL;
+        converterComplex.hoNDArray2Matlab(coilMap, coilMapMx);
+        plhs[0] = coilMapMx;
+   }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab compute_coil_map_2D() ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_3D.cpp b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_3D.cpp
new file mode 100644
index 0000000..ae6bded
--- /dev/null
+++ b/apps/standalone/cpu/gtplus/Matlab_compute_coil_map_3D.cpp
@@ -0,0 +1,137 @@
+
+#include <matrix.h>
+#include <mat.h>
+
+#ifdef _WIN32
+    #include <mexGT.h>
+#else
+    #include <mex.h>
+#endif // _WIN32
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtMatlabConverter.h"
+#include "gtMatlabConverterComplex.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+// Prints the calling convention of compute_coil_map_3D through mexPrintf.
+// The help text is a static string literal instead of a std::ostrstream: str() on an
+// ostrstream freezes its buffer, which leaks on every call unless freeze(false) follows.
+static void usage()
+{
+    static const char* const helpText =
+        "==============================================================================================\n"
+        "Usage: compute_coil_map_3D \n"
+        "6 Input paras:\n"
+        "\tcomplexIm  : RO*E1*E2*CHA*N, 3D complex image array, in complex float\n"
+        "\talgo       : ISMRMRD_SOUHEIL or ISMRMRD_SOUHEIL_ITER\n"
+        "\tks         : kernel size, used by both methods\n"
+        "\tpower      : number of times to perform power method, used by ISMRMRD_SOUHEIL\n"
+        "\titerNum    : number of maximal iteration times, used by ISMRMRD_SOUHEIL_ITER\n"
+        "\tthres      : threshold of iteration, used by ISMRMRD_SOUHEIL_ITER\n"
+        "1 Output para:\n"
+        "\tcoilMap    : RO*E1*E2*CHA*N coil map\n"
+        "==============================================================================================\n";
+
+    // The format string contributes the final newline after the closing rule.
+    mexPrintf("%s\n", helpText);
+}
+
+// MEX entry point: estimates coil maps for a complex image array with coilMap3DNIH().
+// A wrong argument count, a wrong array rank or a failed computation ends in a warning with no output assigned.
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != 6) 
+        {
+            mexWarnMsgTxt("6 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < 1 )
+        {
+            mexWarnMsgTxt("1 output argument is required ...");
+            usage();
+            return;
+        }
+
+        typedef std::complex<float> ValueType;
+
+        Gadgetron::GadgetronTimer timer("Running coil map estimation");
+
+        Gadgetron::gtMatlabConverter<float> converter;
+        Gadgetron::gtMatlabConverterComplex<ValueType> converterComplex;
+
+        Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // target: an input that is not complex single only triggers a warning; processing continues
+        if ( !mxIsSingle(prhs[0]) || !mxIsComplex(prhs[0]) )
+        {
+            mexWarnMsgTxt("The first input parameter should be a complex single array ...");
+        }
+
+        mwSize nDim = mxGetNumberOfDimensions(prhs[0]);
+        if ( nDim!=4 && nDim!=5 )
+        {
+            mexWarnMsgTxt("1st array is not a 4D or 5D array");
+            return;
+        }
+
+        // algo: anything other than the exact string "ISMRMRD_SOUHEIL" selects the iterative variant
+        Gadgetron::gtPlus::ISMRMRDCOILMAPALGO algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL_ITER;
+        std::string algoStr;
+        converter.Matlab2Str(prhs[1], algoStr);
+        if ( algoStr == "ISMRMRD_SOUHEIL" )
+        {
+            algo = Gadgetron::gtPlus::ISMRMRD_SOUHEIL;
+        }
+
+        // ks (MATLAB scalars arrive as double; convert explicitly)
+        unsigned long long ks = static_cast<unsigned long long>(mxGetScalar(prhs[2]));
+
+        // power
+        unsigned long long power = static_cast<unsigned long long>(mxGetScalar(prhs[3]));
+
+        // iterNum
+        unsigned long long iterNum = static_cast<unsigned long long>(mxGetScalar(prhs[4]));
+
+        // thres
+        float thres = static_cast<float>(mxGetScalar(prhs[5]));
+
+        // ---------------------------------------------------------------
+        // perform the computation
+        // ---------------------------------------------------------------
+        Gadgetron::hoNDArray<ValueType> complexIm;
+        converterComplex.Matlab2hoNDArray(prhs[0], complexIm);
+
+        Gadgetron::hoNDArray<ValueType> coilMap;
+
+        if ( !gtPlus_util_complex_.coilMap3DNIH(complexIm, coilMap, algo, ks, power, iterNum, thres) )
+        {
+            mexWarnMsgTxt("coilMap3DNIH(...) failed ... ");
+            return;
+        }
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        mxArray* coilMapMx = NULL;
+        converterComplex.hoNDArray2Matlab(coilMap, coilMapMx);
+        plhs[0] = coilMapMx;
+   }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab compute_coil_map_3D() ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/registration/2d/CMakeLists.txt b/apps/standalone/cpu/registration/2d/CMakeLists.txt
new file mode 100644
index 0000000..e476fb4
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/CMakeLists.txt
@@ -0,0 +1,40 @@
+# Horn-Schunck optical-flow registration driver
+add_executable(register_HS_2d_cpu register_HS_2d.cpp)
+target_link_libraries(register_HS_2d_cpu
+  hostutils
+  cpureg
+  cpucore
+  cpucore_math
+  ${ARMADILLO_LIBRARIES}
+)
+
+# Cornelius-Kanade optical-flow registration driver
+add_executable(register_CK_2d_cpu register_CK_2d.cpp)
+target_link_libraries(register_CK_2d_cpu
+  hostutils
+  cpureg
+  cpucore
+  cpucore_math
+  ${ARMADILLO_LIBRARIES}
+)
+
+install(TARGETS
+  register_HS_2d_cpu register_CK_2d_cpu
+  DESTINATION bin)
+
+if (MATLAB_FOUND)  # matlab wrapper (only built on Windows, see the WIN32 guard below)
+  message("Matlab found> ${MATLAB_INCLUDE_DIR}. Matlab registration wrapper is being compiled.")
+  SET(CMAKE_DEBUG_POSTFIX)
+  if (WIN32)
+    include_directories( ${MATLAB_INCLUDE_DIR} )
+    add_library(Matlab_register_CK_2d_cpu SHARED Matlab_register_CK_2d.cpp)
+    target_link_libraries(Matlab_register_CK_2d_cpu ${MATLAB_LIBRARIES} hostutils cpureg cpucore cpucore_math)
+    SET_TARGET_PROPERTIES(Matlab_register_CK_2d_cpu PROPERTIES SUFFIX .mexw32)  # 32-bit default instead of .dll
+    if ( HAS_64_BIT )				
+      SET_TARGET_PROPERTIES(Matlab_register_CK_2d_cpu PROPERTIES SUFFIX .mexw64)
+    endif ( HAS_64_BIT )    
+    install(TARGETS Matlab_register_CK_2d_cpu DESTINATION bin )
+  endif (WIN32)
+else(MATLAB_FOUND)
+  message("Matlab not found. Matlab wrapper for registration toolbox will not be compiled.")
+endif(MATLAB_FOUND)
diff --git a/apps/standalone/cpu/registration/2d/Matlab_register_CK_2d.cpp b/apps/standalone/cpu/registration/2d/Matlab_register_CK_2d.cpp
new file mode 100644
index 0000000..65e1f93
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/Matlab_register_CK_2d.cpp
@@ -0,0 +1,197 @@
+
+#include <matrix.h>
+#include <mat.h>
+#include <mexGT.h>
+#include <cmath>
+#include <vector>
+#include <iostream>
+#include <strstream>
+
+// Gadgetron includes
+#include "hoCKOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "GadgetronTimer.h"
+#include "parameterparser.h"
+
+#define MEXPRINTF(name) mexPrintf(#name);
+
+// Prints the calling convention of register_CK_2d through mexPrintf.
+// The help text is a static string literal instead of a std::ostrstream: str() on an
+// ostrstream freezes its buffer, which leaks on every call unless freeze(false) follows.
+static void usage()
+{
+    static const char* const helpText =
+        "==============================================================================================\n"
+        "Usage: register_CK_2d \n"
+        "5 Input paras:\n"
+        "\ttarget     : Nfe*Npe, 2D array, target (fixed) image, in double\n"
+        "\tsource     : Nfe*Npe, 2D array, source (moving) image, in double\n"
+        "\talpha      : regularization parameter, alpha\n"
+        "\tbeta       : regularization parameter, beta\n"
+        "\tlevel      : number of resolution levels\n"
+        "2 Output para:\n"
+        "\tdx         : deformation field, along 1st dimension\n"
+        "\tdy         : deformation field, along 2nd dimension\n"
+        "==============================================================================================\n";
+
+    // The format string contributes the final newline after the closing rule.
+    mexPrintf("%s\n", helpText);
+}
+
+// MEX entry point: registers a 2D source image onto a target image with the Cornelius-Kanade
+// optical-flow solver and returns the displacement field as two double arrays (dx, dy).
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+    try
+    {
+        // ---------------------------------------------------------------
+        // consistency check
+        // ---------------------------------------------------------------    
+        if (nrhs != 5) 
+        {
+            mexWarnMsgTxt("5 input arguments are required ...");
+            usage();
+            return;
+        }
+
+        if (nlhs < 2 )
+        {
+            mexWarnMsgTxt("2 output argument is required ...");
+            usage();
+            return;
+        }
+
+        Gadgetron::GadgetronTimer timer("Running registration");
+
+        // ---------------------------------------------------------------
+        // input parameters
+        // ---------------------------------------------------------------    
+        // target and source are copied below as raw double buffers, so any other class must be rejected
+        if ( !mxIsDouble(prhs[0]) )
+        {
+            mexWarnMsgTxt("The first input parameter should be a double array ...");
+            return;
+        }
+
+        if ( !mxIsDouble(prhs[1]) )
+        {
+            mexWarnMsgTxt("The second input parameter should be a double array ...");
+            return;
+        }
+
+        // for the image
+        mwSize nDim = mxGetNumberOfDimensions(prhs[0]);
+        if ( nDim!=2 )
+        {
+            mexWarnMsgTxt("1st array is not a 2D array");
+            return;
+        }
+
+        nDim = mxGetNumberOfDimensions(prhs[1]);
+        if ( nDim!=2 )
+        {
+            mexWarnMsgTxt("2nd array is not a 2D array");
+            return;
+        }
+
+        const mwSize* dims = mxGetDimensions(prhs[0]);
+        const size_t numOfPixels = static_cast<size_t>(dims[0])*static_cast<size_t>(dims[1]); // size_t: an int product can overflow
+
+        const mwSize* dims2 = mxGetDimensions(prhs[1]);
+        if ( dims[0]!=dims2[0] || dims[1]!=dims2[1] )
+        {
+            mexWarnMsgTxt("Input arrays have different size ... ");
+            return;
+        }
+
+        double* ptrTarget = static_cast<double*>(mxGetData(prhs[0]));
+        double* ptrSource = static_cast<double*>(mxGetData(prhs[1]));
+
+        // alpha
+        double alpha = mxGetScalar(prhs[2]);
+
+        // beta
+        double beta = mxGetScalar(prhs[3]);
+
+        // level
+        int level = (int)(mxGetScalar(prhs[4]));
+
+        // ---------------------------------------------------------------
+        // perform the registration
+        // ---------------------------------------------------------------
+        // allocate the results (both are fully overwritten from the solver output further down)
+        mxArray* Dx = mxCreateNumericArray(nDim, dims, mxDOUBLE_CLASS, mxREAL);
+        if ( Dx == NULL )
+        {
+            mexWarnMsgTxt("Dx == NULL");
+            return;
+        }
+
+        mxArray* Dy = mxCreateNumericArray(nDim, dims, mxDOUBLE_CLASS, mxREAL);
+        if ( Dy == NULL )
+        {
+            mexWarnMsgTxt("Dy == NULL");
+            return;
+        }
+
+        double* ptrDx = static_cast<double*>(mxGetData(Dx));
+        double* ptrDy = static_cast<double*>(mxGetData(Dy));
+
+        // allocate the target and source images
+        typedef double _real;
+        using namespace Gadgetron;
+
+        std::vector<size_t> dim_array(2);
+        dim_array[0] = dims[0];
+        dim_array[1] = dims[1];
+
+        boost::shared_ptr< hoNDArray<_real> > fixed_image(new hoNDArray<_real>(&dim_array));
+        memcpy(fixed_image->begin(), ptrTarget, sizeof(_real)*numOfPixels);
+
+        boost::shared_ptr< hoNDArray<_real> > moving_image(new hoNDArray<_real>(&dim_array));
+        memcpy(moving_image->begin(), ptrSource, sizeof(_real)*numOfPixels);
+
+        boost::shared_ptr< hoLinearResampleOperator<_real,2> > R( new hoLinearResampleOperator<_real,2>() );
+
+        // Setup solver
+        hoCKOpticalFlowSolver<_real,2> CK;
+        CK.set_interpolator( R );
+        CK.set_output_mode( hoCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );
+        CK.set_num_multires_levels( level );
+        CK.set_max_num_iterations_per_level( 500 );
+        CK.set_alpha(alpha);
+        CK.set_beta(beta);
+        CK.set_limit(0.01f);
+  
+        // Run registration; the nested timer gets its own name so it does not shadow the outer one
+        boost::shared_ptr< hoNDArray<_real> > result;
+        {
+            Gadgetron::GadgetronTimer solveTimer("Running registration - solve");
+            result = CK.solve( fixed_image.get(), moving_image.get() );
+        }
+
+        if( !result.get() )
+        {
+            mexWarnMsgTxt("Registration solver failed. Quitting!");
+            return;
+        }
+
+        memcpy(ptrDx, result->begin(), sizeof(_real)*numOfPixels);
+        memcpy(ptrDy, result->begin()+numOfPixels, sizeof(_real)*numOfPixels);
+
+        // ---------------------------------------------------------------
+        // output parameter
+        // ---------------------------------------------------------------
+        plhs[0] = Dx;
+        plhs[1] = Dy;
+   }
+    catch(...)
+    {
+        mexWarnMsgTxt("Exceptions happened in Matlab register_CK_2d() ...");
+        return;
+    }
+
+    return;
+}
diff --git a/apps/standalone/cpu/registration/2d/register_CK_2d.cpp b/apps/standalone/cpu/registration/2d/register_CK_2d.cpp
new file mode 100644
index 0000000..ea129a9
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/register_CK_2d.cpp
@@ -0,0 +1,121 @@
+/*
+  An example of how to register two 2d images using Cornelius-Kanade optical flow
+*/
+
+// Gadgetron includes
+#include "hoCKOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "GadgetronTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Command-line driver: registers a moving image onto a fixed image with Cornelius-Kanade optical flow.
+  // Writes the displacement field and the deformed moving image; returns 0 on success, 1 on any failure.
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  
+  boost::shared_ptr< hoNDArray<_real> > fixed_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > moving_image = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !fixed_image.get() || !moving_image.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  size_t num_fixed_dims = fixed_image->get_number_of_dimensions();
+  size_t num_moving_dims = moving_image->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+
+  // Use bilinear interpolation for resampling
+
+  boost::shared_ptr< hoLinearResampleOperator<_real,2> > R( new hoLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  
+  hoCKOpticalFlowSolver<_real,2> CK;
+  CK.set_interpolator( R );
+  CK.set_output_mode( hoCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  CK.set_num_multires_levels( 4 );
+  CK.set_max_num_iterations_per_level( 500 );
+  CK.set_alpha(alpha);
+  CK.set_beta(beta);
+  CK.set_limit(0.01f);
+  
+  // Run registration
+  //
+
+  boost::shared_ptr< hoNDArray<_real> > result;
+  {
+    GadgetronTimer timer("Running registration");
+    result = CK.solve( fixed_image.get(), moving_image.get() );
+  }
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  boost::shared_ptr< hoNDArray<_real> > deformed_moving;
+  {
+    GadgetronTimer timer("Applying deformation");
+    deformed_moving = CK.deform( moving_image.get(), result );
+  }
+  if( !deformed_moving.get() ){ // same guard as for the solver result: never write an empty array
+    cout << endl << "Applying the deformation failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  // All done, write out the result
+  //
+
+  write_nd_array<_real>(result.get(), (char*)parms.get_parameter('r')->get_string_value());
+  write_nd_array<_real>(deformed_moving.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/cpu/registration/2d/register_HS_2d.cpp b/apps/standalone/cpu/registration/2d/register_HS_2d.cpp
new file mode 100644
index 0000000..057bbbc
--- /dev/null
+++ b/apps/standalone/cpu/registration/2d/register_HS_2d.cpp
@@ -0,0 +1,110 @@
+/*
+  An example of how to register two 2d images using Horn-Schunck optical flow
+*/
+
+// Gadgetron includes
+#include "hoHSOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // ---- Command line handling ----
+  // -f / -m : fixed and moving image files, -r : output displacement field,
+  // -a : regularization weight (alpha) handed to the solver.
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.1" );
+
+  parms.parse_parameter_list(argc, argv);
+
+  // Bail out early (after printing the usage) when a required option is absent
+  if( !parms.all_required_parameters_set() ){
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  cout << " Running registration with the following parameters: " << endl;
+  parms.print_parameter_list();
+
+  // ---- Input: read both images from disk ----
+
+  char *fixed_filename = (char*)parms.get_parameter('f')->get_string_value();
+  boost::shared_ptr< hoNDArray<_real> > fixed_image = read_nd_array<_real>(fixed_filename);
+
+  char *moving_filename = (char*)parms.get_parameter('m')->get_string_value();
+  boost::shared_ptr< hoNDArray<_real> > moving_image = read_nd_array<_real>(moving_filename);
+
+  if( !fixed_image.get() || !moving_image.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Only two- or three-dimensional arrays are accepted for either image
+  const size_t num_fixed_dims = fixed_image->get_number_of_dimensions();
+  const size_t num_moving_dims = moving_image->get_number_of_dimensions();
+
+  if( num_fixed_dims != 2 && num_fixed_dims != 3 ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( num_moving_dims != 2 && num_moving_dims != 3 ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // ---- Solver configuration ----
+
+  const _real alpha = static_cast<_real>( parms.get_parameter('a')->get_float_value() );
+
+  typedef hoHSOpticalFlowSolver<_real,2> solver_type;
+  typedef hoLinearResampleOperator<_real,2> resampler_type;
+
+  // A linear resampling operator serves as the solver's interpolator
+  boost::shared_ptr<resampler_type> resampler( new resampler_type() );
+
+  solver_type solver;
+  solver.set_interpolator( resampler );
+  solver.set_output_mode( solver_type::OUTPUT_VERBOSE );
+  solver.set_num_multires_levels( 4 );
+  solver.set_max_num_iterations_per_level( 500 );
+  solver.set_alpha(alpha);
+  solver.set_limit(0.01f);
+
+  // ---- Registration ----
+
+  boost::shared_ptr< hoNDArray<_real> > displacements = solver.solve( fixed_image.get(), moving_image.get() );
+  if( !displacements.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Apply the solver result to the moving image
+  boost::shared_ptr< hoNDArray<_real> > deformed_moving = solver.deform( moving_image.get(), displacements );
+
+  // ---- Output ----
+  // The solver result goes to the file named by -r; the deformed moving image
+  // is always written to "def_moving.real".
+
+  write_nd_array<_real>(displacements.get(), (char*)parms.get_parameter('r')->get_string_value());
+  write_nd_array<_real>(deformed_moving.get(), "def_moving.real" );
+
+  return 0;
+}
diff --git a/apps/standalone/cpu/registration/3d/CMakeLists.txt b/apps/standalone/cpu/registration/3d/CMakeLists.txt
new file mode 100644
index 0000000..0053c52
--- /dev/null
+++ b/apps/standalone/cpu/registration/3d/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_executable(register_CK_3d_cpu register_CK_3d.cpp)
+# Libraries the example executable is linked against
+target_link_libraries(register_CK_3d_cpu
+  hostutils 
+  cpureg 
+  cpucore 
+  cpucore_math
+  ${ARMADILLO_LIBRARIES}
+  )
+# The executable is installed into the "bin" destination
+install(TARGETS register_CK_3d_cpu DESTINATION bin)
diff --git a/apps/standalone/cpu/registration/3d/register_CK_3d.cpp b/apps/standalone/cpu/registration/3d/register_CK_3d.cpp
new file mode 100644
index 0000000..a9bd14a
--- /dev/null
+++ b/apps/standalone/cpu/registration/3d/register_CK_3d.cpp
@@ -0,0 +1,115 @@
+/*
+  An example of how to register two 3d volumes using Cornelius-Kanade optical flow
+*/
+
+// Gadgetron includes
+#include "hoCKOpticalFlowSolver.h"
+#include "hoLinearResampleOperator.h"
+#include "hoNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // ---- Command line handling ----
+  // -f / -m : fixed and moving volume files, -r : output displacement field,
+  // -a / -b : regularization weights, -l : number of multiresolution levels.
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+
+  parms.parse_parameter_list(argc, argv);
+
+  // Bail out early (after printing the usage) when a required option is absent
+  if( !parms.all_required_parameters_set() ){
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  cout << " Running registration with the following parameters: " << endl;
+  parms.print_parameter_list();
+
+  // ---- Input: read both volumes from disk ----
+
+  char *fixed_filename = (char*)parms.get_parameter('f')->get_string_value();
+  boost::shared_ptr< hoNDArray<_real> > fixed_image = read_nd_array<_real>(fixed_filename);
+
+  char *moving_filename = (char*)parms.get_parameter('m')->get_string_value();
+  boost::shared_ptr< hoNDArray<_real> > moving_image = read_nd_array<_real>(moving_filename);
+
+  if( !fixed_image.get() || !moving_image.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Only three- or four-dimensional arrays are accepted for either image
+  const size_t num_fixed_dims = fixed_image->get_number_of_dimensions();
+  const size_t num_moving_dims = moving_image->get_number_of_dimensions();
+
+  if( num_fixed_dims != 3 && num_fixed_dims != 4 ){
+    cout << endl << "The fixed image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( num_moving_dims != 3 && num_moving_dims != 4 ){
+    cout << endl << "The moving image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // ---- Solver configuration ----
+
+  const _real alpha = static_cast<_real>( parms.get_parameter('a')->get_float_value() );
+  const _real beta = static_cast<_real>( parms.get_parameter('b')->get_float_value() );
+  const unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  typedef hoCKOpticalFlowSolver<_real,3> solver_type;
+  typedef hoLinearResampleOperator<_real,3> resampler_type;
+
+  // A linear resampling operator serves as the solver's interpolator
+  boost::shared_ptr<resampler_type> resampler( new resampler_type() );
+
+  solver_type solver;
+  solver.set_interpolator( resampler );
+  solver.set_output_mode( solver_type::OUTPUT_VERBOSE );
+  solver.set_max_num_iterations_per_level( 500 );
+  solver.set_num_multires_levels( multires_levels );
+  solver.set_alpha(alpha);
+  solver.set_beta(beta);
+  solver.set_limit(0.01f);
+
+  // ---- Registration ----
+
+  boost::shared_ptr< hoNDArray<_real> > displacements = solver.solve( fixed_image.get(), moving_image.get() );
+  if( !displacements.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Apply the solver result to the moving volume
+  boost::shared_ptr< hoNDArray<_real> > deformed_moving = solver.deform( moving_image.get(), displacements );
+
+  // ---- Output ----
+  // The solver result goes to the file named by -r; the deformed moving volume
+  // is always written to "def_moving.real".
+
+  write_nd_array<_real>(displacements.get(), (char*)parms.get_parameter('r')->get_string_value());
+  write_nd_array<_real>(deformed_moving.get(), "def_moving.real" );
+
+  return 0;
+}
diff --git a/apps/standalone/cpu/registration/CMakeLists.txt b/apps/standalone/cpu/registration/CMakeLists.txt
new file mode 100644
index 0000000..79e23c9
--- /dev/null
+++ b/apps/standalone/cpu/registration/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+  )
+# Examples need Armadillo newer than 3.819; the variable is named (not expanded) so an unset value cannot break the if()
+if(ARMADILLO_VERSION_STRING VERSION_GREATER "3.819")
+  add_subdirectory(2d)
+  add_subdirectory(3d)
+endif()
diff --git a/apps/standalone/gpu/CMakeLists.txt b/apps/standalone/gpu/CMakeLists.txt
new file mode 100644
index 0000000..c66f3f6
--- /dev/null
+++ b/apps/standalone/gpu/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories( 
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR} 
+  ${ACE_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu 
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  )
+# Descend into the standalone GPU application directories
+add_subdirectory(mri)
+add_subdirectory(denoising)
+add_subdirectory(deblurring)
+add_subdirectory(registration)
diff --git a/apps/standalone/gpu/deblurring/2d/CMakeLists.txt b/apps/standalone/gpu/deblurring/2d/CMakeLists.txt
new file mode 100644
index 0000000..7ef916a
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Windows builds get _USE_MATH_DEFINES (blur_2d.cpp uses M_PI from math.h)
+if(WIN32)
+  add_definitions(-D_USE_MATH_DEFINES)
+endif()
+
+# Each app is built from the equally named source file and links the same libraries
+foreach(app blur_2d deblur_2d_cg deblur_2d_sb)
+  add_executable(${app} ${app}.cpp)
+  target_link_libraries(${app} gpucore hostutils gpuoperators gpusolvers ${CUDA_LIBRARIES})
+endforeach()
+
+# All three executables are installed into the "bin" destination
+install(TARGETS blur_2d deblur_2d_cg deblur_2d_sb DESTINATION bin)
+
diff --git a/apps/standalone/gpu/deblurring/2d/blur_2d.cpp b/apps/standalone/gpu/deblurring/2d/blur_2d.cpp
new file mode 100644
index 0000000..d8a8009
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/blur_2d.cpp
@@ -0,0 +1,111 @@
+/*
+  Example code to blur an image and generate input data for the deblurring apps.
+*/
+
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "parameterparser.h"
+#include "cuConvolutionOperator.h"
+
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main( int argc, char** argv)
+{
+  // Blurs a 2D image with a normalized Gaussian kernel (sigma = 2.5) and writes
+  // the blurred image and the kernel to disk as input for the deblurring apps.
+
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Output kernel image file name (.cplx)", true, "kernel_image.cplx" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load image from disk (single precision assumed)
+  boost::shared_ptr< hoNDArray<float> > _host_image = 
+    read_nd_array<float>((char*)parms.get_parameter('d')->get_string_value());
+
+  // Guard against an empty pointer (image could not be read) before dereferencing it
+  if( !_host_image.get() ){
+    cout << endl << "Input image is not found. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(_host_image->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input image is not two-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Convert to _real
+  hoNDArray<_real> host_image; host_image.create(_host_image->get_dimensions().get()); 
+  for( unsigned int i=0; i<host_image.get_number_of_elements(); i++ )
+    host_image.get_data_ptr()[i] = (_real) _host_image->get_data_ptr()[i];
+
+  // Upload host image to device, normalize, and convert to complex type
+  cuNDArray<_real> _image(&host_image);
+  normalize( &_image, _real(1) );
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+
+  // Setup resulting blurred image
+  cuNDArray<_complext> blurred_image; 
+  blurred_image.create(image->get_dimensions().get());
+
+  // Generate convolution kernel (on the host for now), centered at half the image size
+  _real sigma = 2.5;
+  hoNDArray<_real> host_kernel;
+  host_kernel.create(image->get_dimensions().get());
+  const _real biasx = (_real)(image->get_size(0)>>1);
+  const _real biasy = (_real)(image->get_size(1)>>1);
+  for( unsigned int y=0; y<image->get_size(1); y++ ){
+    for( unsigned int x=0; x<image->get_size(0); x++ ){
+      _real cx = (_real)x-biasx;
+      _real cy = (_real)y-biasy;
+      host_kernel.get_data_ptr()[y*image->get_size(0)+x] = 1.0/(2.0*M_PI*sigma*sigma)*exp(-1.0*((cx*cx)/(2.0*sigma*sigma)+(cy*cy)/(2.0*sigma*sigma)));
+    }
+  }
+
+  cuNDArray<_real> _kernel(&host_kernel);
+  boost::shared_ptr< cuNDArray<_complext> > kernel = real_to_complex<_complext>( &_kernel );
+
+  // Normalize kernel
+  _real scale = asum(kernel.get());
+  *kernel /= scale;
+
+  // Create convolution operator, assign the kernel, and convolve
+  cuConvolutionOperator<_real,2> conv;
+  conv.set_kernel( kernel.get() );  
+  conv.mult_M( image.get(), &blurred_image );
+
+  // Output result: complex blurred image (-r), its magnitude, and the complex kernel (-k)
+  boost::shared_ptr< hoNDArray<_complext> > blurred_image_host = blurred_image.to_host();
+  write_nd_array<_complext>( blurred_image_host.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&blurred_image)->to_host();
+  write_nd_array<_real>( host_norm.get(), "blurred_image.real" );
+
+  boost::shared_ptr< hoNDArray<_complext> > kernel_image_host = kernel->to_host();
+  write_nd_array<_complext>( kernel_image_host.get(), (char*)parms.get_parameter('k')->get_string_value());
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp b/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp
new file mode 100644
index 0000000..cc236ca
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp
@@ -0,0 +1,109 @@
+/*
+  Deblurring using conjugate gradient solver.
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  // Deblurs a 2D image with a conjugate gradient solver. The encoding operator is
+  // a convolution with the supplied kernel; partial derivative operators for
+  // dimensions 0 and 1 (weight -K) are added as regularization when -K is positive.
+
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "cg_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "25" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Regularization weight", true, "0.1" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+
+  // Guard against empty pointers (files could not be read) before dereferencing them
+  if( !host_data.get() || !host_kernel.get() ){
+    cout << endl << "Input data (image/kernel) is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( !(host_data->get_number_of_dimensions() == 2) || !(host_kernel->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input data (image/kernel) is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+
+  // Regularization weight (-K) and maximum number of solver iterations (-i)
+  _real kappa = (_real) parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Rx( new cuPartialDerivativeOperator<_complext,2>(0) );
+  Rx->set_weight( kappa );
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Ry( new cuPartialDerivativeOperator<_complext,2>(1) );
+  Ry->set_weight( kappa );
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,2> > E( new cuConvolutionOperator<_real,2>() );
+  E->set_kernel( &kernel );
+  E->set_domain_dimensions(data.get_dimensions().get());
+
+  // Setup conjugate gradient solver
+  cuCgSolver< _complext> cg;
+  cg.set_encoding_operator( E );                         // encoding matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Rx );  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Ry );  // regularization matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-12 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+
+  // Run the conjugate gradient solver on the blurred data
+  boost::shared_ptr< cuNDArray<_complext> > cgresult = cg.solve( &data );
+
+  // All done, write out the result (complex image to -r, magnitude to a fixed name)
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = cgresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "cg_deblurred_image.real" );  
+
+  return 0;
+}
+
diff --git a/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp b/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp
new file mode 100644
index 0000000..8a1824e
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp
@@ -0,0 +1,129 @@
+/*
+  Deblurring using split-Bregman solver (with an inner conjugate gradient solver).
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuSbcCgSolver.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  // Deblurs a 2D image with a split-Bregman solver: convolution encoding operator
+  // (weight -M) and one group of two partial derivative operators (weight -L).
+
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "sb_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "50" );
+  parms.add_parameter( 'M', COMMAND_LINE_FLOAT,  1, "Mu", true, "100.0" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "100.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+
+  if( !host_data.get() || !host_kernel.get() ){
+    cout << endl << "Input data (image/kernel) is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( !(host_data->get_number_of_dimensions() == 2) || !(host_kernel->get_number_of_dimensions() == 2) ){
+    cout << endl << "Input data (image/kernel) is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_outer_iterations = parms.get_parameter('O')->get_int_value();
+
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Rx( new cuPartialDerivativeOperator<_complext,2>(0) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> > Ry( new cuPartialDerivativeOperator<_complext,2>(1) );
+
+  _real mu = (_real) parms.get_parameter('M')->get_float_value();
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data.get_dimensions().get());
+  Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data.get_dimensions().get());
+  Ry->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,2> > E( new cuConvolutionOperator<_real,2>() );  
+  E->set_kernel( &kernel );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Setup split-Bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_encoding_operator( E );
+  sb.add_regularization_group_operator( Rx ); 
+  sb.add_regularization_group_operator( Ry ); 
+  sb.add_group();
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( cuSbcCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver
+  boost::shared_ptr< cuNDArray<_complext> > sbresult = sb.solve(&data);
+
+  // All done, write out the result (complex image to -r, magnitude to a fixed name)
+  boost::shared_ptr< hoNDArray<_complext> > host_result = sbresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(sbresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "sb_deblurred_image.real" );  
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/deblurring/3d/CMakeLists.txt b/apps/standalone/gpu/deblurring/3d/CMakeLists.txt
new file mode 100644
index 0000000..1d674bc
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Windows builds get the _USE_MATH_DEFINES compile definition
+if(WIN32)
+  add_definitions(-D_USE_MATH_DEFINES)
+endif()
+
+# Each app is built from the equally named source file and links the same libraries
+foreach(app blur_3d deblur_3d_cg deblur_3d_sb)
+  add_executable(${app} ${app}.cpp)
+  target_link_libraries(${app} gpucore hostutils gpuoperators gpusolvers gpunfft ${CUDA_LIBRARIES})
+endforeach()
+
+# All three executables are installed into the "bin" destination
+install(TARGETS blur_3d deblur_3d_cg deblur_3d_sb DESTINATION bin)
diff --git a/apps/standalone/gpu/deblurring/3d/blur_3d.cpp b/apps/standalone/gpu/deblurring/3d/blur_3d.cpp
new file mode 100644
index 0000000..6942c76
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/blur_3d.cpp
@@ -0,0 +1,113 @@
+/*
+  Example code to blur an image and generate input data for the deblurring apps.
+*/
+
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_elemwise.h"
+#include "parameterparser.h"
+#include "cuConvolutionOperator.h"
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main( int argc, char** argv)
+{
+  // Blurs a 3D image by convolution with a user supplied 3D kernel (normalized
+  // here) and writes the blurred image and the kernel to disk, both as complex
+  // data and as magnitude images.
+
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "In kernel image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'K', COMMAND_LINE_STRING, 1, "Output kernel file name (.cplx)", true, "kernel_image.cplx" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load image and kernel from disk (single precision assumed)
+  boost::shared_ptr< hoNDArray<float> > _host_image = 
+    read_nd_array<float>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<float> > _host_kernel = 
+    read_nd_array<float>((char*)parms.get_parameter('k')->get_string_value());
+
+  // Guard against empty pointers (files could not be read) before dereferencing them
+  if( !_host_image.get() || !_host_kernel.get() ){
+    cout << endl << "Input image or kernel is not found. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(_host_image->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input image is not three-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(_host_kernel->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input kernel is not three-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Convert image and kernel to _real
+  hoNDArray<_real> host_image; host_image.create(_host_image->get_dimensions().get()); 
+  for( unsigned int i=0; i<host_image.get_number_of_elements(); i++ )
+    host_image.get_data_ptr()[i] = (_real) _host_image->get_data_ptr()[i];
+
+  hoNDArray<_real> host_kernel; host_kernel.create(_host_kernel->get_dimensions().get()); 
+  for( unsigned int i=0; i<host_kernel.get_number_of_elements(); i++ )
+    host_kernel.get_data_ptr()[i] = (_real) _host_kernel->get_data_ptr()[i];
+
+  // Upload host image/kernel and convert to complex type
+  cuNDArray<_real> _image(&host_image);
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+
+  cuNDArray<_real> _kernel(&host_kernel);
+  boost::shared_ptr< cuNDArray<_complext> > kernel = real_to_complex<_complext>( &_kernel );
+
+  // Normalize kernel
+  _real scale = asum(kernel.get());
+  *kernel /= scale;
+
+  // Setup resulting blurred image
+  cuNDArray<_complext> blurred_image;
+  blurred_image.create(image->get_dimensions().get());
+
+  // Create convolution operator and assign kernel
+  cuConvolutionOperator<_real,3> conv;
+  conv.set_kernel( kernel.get() );  
+
+  // Convolve
+  conv.mult_M( image.get(), &blurred_image );
+
+  // Output result
+  boost::shared_ptr< hoNDArray<_complext> > blurred_image_host = blurred_image.to_host();
+  write_nd_array<_complext>( blurred_image_host.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&blurred_image)->to_host();
+  write_nd_array<_real>( host_norm.get(), "blurred_image.real" );
+
+  boost::shared_ptr< hoNDArray<_complext> > kernel_image_host = kernel->to_host();
+  write_nd_array<_complext>( kernel_image_host.get(), (char*)parms.get_parameter('K')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm_kernel = abs(kernel.get())->to_host();
+  write_nd_array<_real>( host_norm_kernel.get(), "kernel_image.real" );
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp b/apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp
new file mode 100644
index 0000000..d75fe64
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/deblur_3d_cg.cpp
@@ -0,0 +1,114 @@
+/*
+  Deblurring using conjugate gradient solver.
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  // Deblurs a 3D image with a conjugate gradient solver. The encoding operator is
+  // a convolution with the supplied kernel; partial derivative operators for
+  // dimensions 0, 1 and 2 (weight -K) are added as regularization when -K is positive.
+
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "cg_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "25" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Regularization weight", true, "0.1" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load sample data from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+
+  // Guard against empty pointers (files could not be read) before dereferencing them
+  if( !host_data.get() || !host_kernel.get() ){
+    cout << endl << "Input data (image/kernel) is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( !(host_data->get_number_of_dimensions() == 3) || !(host_kernel->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data (image/kernel) is not three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+
+  _real kappa = (_real) parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+  // Setup regularization operators
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+
+  Rx->set_weight( kappa );
+  Ry->set_weight( kappa );
+  Rz->set_weight( kappa );
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,3> > E( new cuConvolutionOperator<_real,3>() );
+  E->set_kernel( &kernel );
+  E->set_domain_dimensions(data.get_dimensions().get());
+
+  // Setup conjugate gradient solver
+  cuCgSolver<_complext> cg;
+  cg.set_encoding_operator( E );                         // encoding matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Rx );  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Ry );  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Rz );  // regularization matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-12 );
+  cg.set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  // Form right hand side
+  cuNDArray<_complext> rhs; rhs.create(data.get_dimensions().get());
+  E->mult_MH( &data, &rhs );
+
+  // Run the conjugate gradient solver on the precomputed right hand side
+  boost::shared_ptr< cuNDArray<_complext> > cgresult = cg.solve_from_rhs(&rhs);
+
+  // All done, write out the result (complex image to -r, magnitude to a fixed name)
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = cgresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "cg_deblurred_image.real" );  
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp b/apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp
new file mode 100644
index 0000000..348e640
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/3d/deblur_3d_sb.cpp
@@ -0,0 +1,135 @@
+/*
+  Deblurring using a split-Bregman solver (with an inner conjugate gradient solver).
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuSbcCgSolver.h"
+#include "cuCgSolver.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuConvolutionOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+
+int main(int argc, char** argv)
+{
+  // Split-Bregman deblurring of a 3D image. Returns 0 on success, 1 on missing parameters or bad input.
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Blurred image file name (.cplx)", true, "blurred_image.cplx" );
+  parms.add_parameter( 'k', COMMAND_LINE_STRING, 1, "Kernel image file name (.cplx)", true, "kernel_image.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "sb_deblurred_image.cplx" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "50" );
+  parms.add_parameter( 'M', COMMAND_LINE_FLOAT,  1, "Mu", true, "1.0" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "1.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running deblurring with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  // Load the blurred image and the blur kernel from disk
+  boost::shared_ptr< hoNDArray<_complext> > host_data = 
+    read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_complext> > host_kernel = 
+    read_nd_array<_complext>((char*)parms.get_parameter('k')->get_string_value());
+  // Guard against an empty pointer (presumably what read_nd_array yields on failure) before dereferencing
+  if( !host_data.get() || !host_kernel.get() || host_data->get_number_of_dimensions() != 3 || host_kernel->get_number_of_dimensions() != 3 ){
+    cout << endl << "Input data (image/kernel) could not be read or is not three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_outer_iterations = parms.get_parameter('O')->get_int_value();
+
+  // Setup regularization operators (one partial derivative per image dimension)
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> > Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+
+  _real mu = (_real) parms.get_parameter('M')->get_float_value();
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data.get_dimensions().get());
+  Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data.get_dimensions().get());
+  Ry->set_codomain_dimensions(data.get_dimensions().get());
+
+  Rz->set_weight( lambda );
+  Rz->set_domain_dimensions(data.get_dimensions().get());
+  Rz->set_codomain_dimensions(data.get_dimensions().get());
+
+  //
+  // Setup encoding operator and split-Bregman solver
+  //
+
+  // Define encoding matrix (convolution with the loaded kernel, weighted by mu)
+  boost::shared_ptr< cuConvolutionOperator<_real,3> > E( new cuConvolutionOperator<_real,3>() );
+  E->set_kernel( &kernel );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Setup split-Bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_encoding_operator( E );
+  sb.add_regularization_group_operator( Rx ); 
+  sb.add_regularization_group_operator( Ry ); 
+  sb.add_group();
+  sb.add_regularization_operator( Rz ); 
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( cuSbcCgSolver< _complext>::OUTPUT_VERBOSE );
+
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-8 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver
+  boost::shared_ptr< cuNDArray<_complext> > sbresult = sb.solve(&data);
+
+  // All done, write out the result (complex image to the 'r' file, magnitude to a fixed file name)
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = sbresult->to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(sbresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "sb_deblurred_image.real" );  
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/deblurring/CMakeLists.txt b/apps/standalone/gpu/deblurring/CMakeLists.txt
new file mode 100644
index 0000000..5550044
--- /dev/null
+++ b/apps/standalone/gpu/deblurring/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(2d)
+add_subdirectory(3d)
diff --git a/apps/standalone/gpu/denoising/2d/CMakeLists.txt b/apps/standalone/gpu/denoising/2d/CMakeLists.txt
new file mode 100644
index 0000000..f4f6e68
--- /dev/null
+++ b/apps/standalone/gpu/denoising/2d/CMakeLists.txt
@@ -0,0 +1,9 @@
+if (WIN32)  # Windows builds only; presumably so MSVC's <cmath> exposes M_PI-style constants -- confirm
+ADD_DEFINITIONS(-D_USE_MATH_DEFINES)
+endif (WIN32)
+# Build the standalone TV denoising example from its single source file
+add_executable(denoise_TV denoise_TV.cpp)
+# Link against the listed project libraries and ${CUDA_LIBRARIES}
+target_link_libraries(denoise_TV gpucore hostutils gpusolvers gpuoperators ${CUDA_LIBRARIES})
+# Install the executable into the bin directory of the install tree
+install(TARGETS denoise_TV DESTINATION bin)
diff --git a/apps/standalone/gpu/denoising/2d/denoise_TV.cpp b/apps/standalone/gpu/denoising/2d/denoise_TV.cpp
new file mode 100644
index 0000000..92c26c3
--- /dev/null
+++ b/apps/standalone/gpu/denoising/2d/denoise_TV.cpp
@@ -0,0 +1,122 @@
+/*
+  Total variation denoising based on the paper 
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+*/
+
+// Gadgetron includes
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuSbCgSolver.h"
+#include "cuCgSolver.h"
+#include "identityOperator.h"
+#include "cuPartialDerivativeOperator.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Total variation denoising of a 2D real-valued image with a split-Bregman solver.
+  // Returns 0 on success and 1 on missing parameters or unusable input.
+  // Parse command line
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Noisy image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "denoised_image_TV.real" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "20" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "10" );
+  parms.add_parameter( 'm', COMMAND_LINE_FLOAT,  1, "Regularization weight (mu)", true, "25.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running denoising with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+    
+  // Load the noisy image from disk
+  boost::shared_ptr< hoNDArray<_real> > host_data = 
+    read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+
+  if( !host_data.get() ){
+    cout << endl << "Input image not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( host_data->get_number_of_dimensions() != 2 ){
+    cout << endl << "Input image is not two-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  // Upload host data to device
+  cuNDArray<_real> data(host_data.get());
+  
+  _real mu = (_real) parms.get_parameter('m')->get_float_value();
+  _real lambda = (_real)2.0*mu; // This is a good all-round setting according to Goldstein et al.
+
+  if( mu <= (_real) 0.0 ) {
+    cout << endl << "Regularization parameter mu should be strictly positive. Quitting!\n" << endl;
+    return 1;
+  }
+
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_outer_iterations = parms.get_parameter('O')->get_int_value();
+  
+  // Setup regularization operators (partial derivatives along dimensions 0 and 1, weighted by lambda)
+  boost::shared_ptr< cuPartialDerivativeOperator<_real,2> > Rx( new cuPartialDerivativeOperator<_real,2>(0) );
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data.get_dimensions().get());
+  Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_real,2> > Ry( new cuPartialDerivativeOperator<_real,2>(1) );
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data.get_dimensions().get());
+  Ry->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Define encoding operator (identity, weighted by mu)
+  boost::shared_ptr< identityOperator<cuNDArray<_real> > > E( new identityOperator<cuNDArray<_real> >() );
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+    
+  // Setup split-Bregman solver
+  cuSbCgSolver<_real> sb;
+  sb.set_encoding_operator( E );
+  //sb.add_regularization_operator( Rx ); // Anisotropic denoising
+  //sb.add_regularization_operator( Ry ); // Anisotropic denoising
+  sb.add_regularization_group_operator( Rx ); // Isotropic denoising
+  sb.add_regularization_group_operator( Ry); // Isotropic denoising
+  sb.add_group(); // presumably closes the group formed by Rx and Ry -- confirm against the solver API
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+  sb.set_output_mode( cuCgSolver<_real>::OUTPUT_VERBOSE ); // NOTE(review): enum named via cuCgSolver on the sb solver -- confirm it is the shared enum
+  
+  // Setup inner conjugate gradient solver
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_real>::OUTPUT_WARNINGS );
+
+  // Run split-Bregman solver
+  boost::shared_ptr< cuNDArray<_real> > sbresult = sb.solve(&data);
+
+  // All done, copy the result back to the host and write it to the file given by 'r'
+  boost::shared_ptr< hoNDArray<_real> > host_result = sbresult->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/denoising/CMakeLists.txt b/apps/standalone/gpu/denoising/CMakeLists.txt
new file mode 100644
index 0000000..5c4cec9
--- /dev/null
+++ b/apps/standalone/gpu/denoising/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(2d)
diff --git a/apps/standalone/gpu/mri/CMakeLists.txt b/apps/standalone/gpu/mri/CMakeLists.txt
new file mode 100644
index 0000000..866ed62
--- /dev/null
+++ b/apps/standalone/gpu/mri/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(nfft)
+add_subdirectory(sense)
diff --git a/apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt b/apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt
new file mode 100644
index 0000000..50c12ba
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/CMakeLists.txt
@@ -0,0 +1,17 @@
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+)
+# One executable per 2D NFFT example; the moco target is currently disabled
+add_executable(nfft main_nfft.cpp)
+add_executable(nffth main_nffth.cpp)
+#add_executable(moco moco.cpp)
+add_executable(nffth_cg main_cg.cpp)
+add_executable(nffth_sb main_sb.cpp)
+# The two solver examples (nffth_cg, nffth_sb) additionally link gpusolvers
+target_link_libraries(nfft gpucore gpuoperators gpunfft hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth gpucore gpuoperators gpunfft hostutils ${CUDA_LIBRARIES})
+#target_link_libraries(moco gpusolvers gpureg gpucore gpuparallelmri gpuoperators gpunfft hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_cg gpusolvers gpuoperators gpucore gpunfft hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_sb gpusolvers gpuoperators gpucore gpunfft hostutils ${CUDA_LIBRARIES})
+# NOTE(review): the install rule below is commented out, so none of these examples are installed -- confirm intended
+#install(TARGETS nfft moco nffth nffth_cg nffth_sb DESTINATION bin)
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_cg.cpp b/apps/standalone/gpu/mri/nfft/2d/main_cg.cpp
new file mode 100644
index 0000000..76d4685
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_cg.cpp
@@ -0,0 +1,138 @@
+/*
+  
+  Sample application of the NFFT toolbox: using the NFFT matrix operator in a conjugate gradient solver
+  
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "vector_td_utilities.h"
+#include "parameterparser.h"
+#include "cuNFFTOperator.h"
+#include "cuCgSolver.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+int main( int argc, char** argv) 
+{
+  // Conjugate gradient reconstruction of radial samples using the NFFT operator. Returns 0 on success, 1 on bad input.
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+  // Guard against an empty pointer (presumably what read_nd_array yields on failure) before dereferencing
+  if( !host_samples.get() || host_samples->get_number_of_dimensions() != 2 ){
+    cout << endl << "Samples ndarray could not be read or is not two-dimensional (samples/profile x #profiles). Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+  unsigned int num_profiles = host_samples->get_size(1);
+  unsigned int samples_per_profile = host_samples->get_size(0);  
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  // Upload host data to device
+  timer = new GPUTimer("Uploading samples to device");
+  cuNDArray<_complext> samples(host_samples.get());
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  // Define and setup NFFT encoding operator
+  boost::shared_ptr< cuNFFTOperator<_real,2> > E( new cuNFFTOperator<_real,2>() );
+
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw);
+
+  // Set image dimensions
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  E->set_domain_dimensions(&image_dims);
+
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  E->preprocess( traj.get() );
+  delete timer;
+
+  // Setup conjugate gradient solver
+  cuCgSolver< _complext> cg;
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+  cg.set_encoding_operator( E); 
+
+  // Solve
+  boost::shared_ptr< cuNDArray<_complext> > cgresult;
+  {
+    GPUTimer timer("GPU Conjugate Gradient solve");
+    cgresult = cg.solve(&samples);
+  }
+
+  //
+  // Output result (complex image to the 'r' file, magnitude to a fixed file name)
+  //
+
+  timer = new GPUTimer("Output result to disk");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_image = cgresult->to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp b/apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp
new file mode 100644
index 0000000..c80f060
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_nfft.cpp
@@ -0,0 +1,148 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "inverse gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+
+  - transform arbitrary trajectories
+  - transform an "arbitrary" number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden ratio based radial trajectories. From a single-precision input image
+  it computes the corresponding samples, trajectory, and density compensation weights, and writes the samples to disk as an ndarray.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+  // Forward NFFT ("inverse gridding") of a 2D image onto radial samples. Returns 0 on success, 1 on bad input.
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name (.cplx)", true, "samples.cplx" );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Number of profiles", true );
+  parms.add_parameter( 's', COMMAND_LINE_INT,    1, "Samples per profiles", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load image from disk
+  timer = new GPUTimer("Loading image from disk");
+  boost::shared_ptr< hoNDArray<_real> > host_image = read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+  // Guard against an empty pointer (presumably what read_nd_array yields on failure) before dereferencing
+  if( !host_image.get() || host_image->get_number_of_dimensions() != 2 ){
+    cout << endl << "Input image could not be read or is not two-dimensional. Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  unsigned int num_profiles = parms.get_parameter('p')->get_int_value();
+  unsigned int samples_per_profile = parms.get_parameter('s')->get_int_value();  
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  uint64d2 matrix_size = from_std_vector<size_t,2>(*(host_image->get_dimensions().get()));
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  if( matrix_size.vec[0] != matrix_size.vec[1] ){
+    cout << endl << "For this samples application we only allow square input images. "
+         << endl << "The only reason being that only one oversampled matrix size is specified and the oversampling ratio must be consistent." << endl;
+    return 1; // alpha above is derived from dimension 0 only, so a non-square image cannot be processed consistently
+  }
+  // Upload host image to device, normalize, and convert to complex type
+  timer = new GPUTimer("Uploading, normalizing and converting to complex");
+  cuNDArray<_real> _image(host_image.get());
+  normalize( &_image, 1.0f );
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+  delete timer;
+
+  // Setup resulting samples array
+  vector<size_t> samples_dims; samples_dims.push_back( samples_per_profile ); samples_dims.push_back( num_profiles );
+  cuNDArray<_complext> samples(&samples_dims);
+
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  plan.preprocess( traj.get(), plan_type::NFFT_PREP_C2NC );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha,_real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  // Gridder
+  timer = new GPUTimer("Computing nfft");
+  plan.compute( image.get(), &samples, dcw.get(), plan_type::NFFT_FORWARDS_C2NC );
+  delete timer;
+
+  //
+  // Output result
+  //
+
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = samples.to_host();
+  write_nd_array<_complext>( host_samples.get(), (char*)parms.get_parameter('r')->get_string_value());
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp b/apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp
new file mode 100644
index 0000000..611db0d
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_nffth.cpp
@@ -0,0 +1,145 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden ratio based radial trajectories 
+  and outputs a gridded image from an input ndarray of samples; the corresponding trajectory and density compensation weights are computed in the programme.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+int main( int argc, char** argv) 
+{
+  // Adjoint NFFT ("gridding") of radial samples onto a 2D image. Returns 0 on success, 1 on bad input.
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+  // Guard against an empty pointer (presumably what read_nd_array yields on failure) before dereferencing
+  if( !host_samples.get() || host_samples->get_number_of_dimensions() != 2 ){
+    cout << endl << "Samples ndarray could not be read or is not two-dimensional (samples/profile x #profiles). Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  unsigned int num_profiles = host_samples->get_size(1);
+  unsigned int samples_per_profile = host_samples->get_size(0);  
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  // Upload host data to device
+  timer = new GPUTimer("Uploading samples to device");
+  cuNDArray<_complext> samples(host_samples.get());
+  delete timer;
+
+  // Setup resulting image array
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> image(&image_dims);
+
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  plan.preprocess( traj.get(), plan_type::NFFT_PREP_NC2C );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  // Gridder
+  timer = new GPUTimer("Computing adjoint nfft (gridding)");
+  plan.compute( &samples, &image, dcw.get(), plan_type::NFFT_BACKWARDS_NC2C );
+  delete timer;
+
+  //
+  // Output result (complex image to the 'r' file, magnitude to a fixed file name)
+  //
+
+  timer = new GPUTimer("Output result to disk");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_image = image.to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&image)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/2d/main_sb.cpp b/apps/standalone/gpu/mri/nfft/2d/main_sb.cpp
new file mode 100644
index 0000000..6771a1b
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/2d/main_sb.cpp
@@ -0,0 +1,170 @@
+/*
+  
+  Sample application of the NFFT toolbox: using the NFFT matrix operator in a Split Bregman solver
+  
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "parameterparser.h"
+#include "cuNFFTOperator.h"
+#include "cuSbcCgSolver.h"
+#include "vector_td_utilities.h"
+#include "cuPartialDerivativeOperator.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Sample application: NFFT encoding operator used inside a Split Bregman solver.
+//
+// Reads a two-dimensional samples array (samples/profile x #profiles) from the
+// file given by -d, computes golden ratio radial trajectories and density
+// compensation weights, sets up the NFFT encoding operator and two partial
+// derivative regularization operators, runs the solver and writes the complex
+// result to the file given by -r and its magnitude to "result.real".
+//
+// Returns 0 on success and 1 when required parameters are missing, the input
+// cannot be used, or the solver returns no result.
+int main( int argc, char** argv)
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of inner iterations", true, "10" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of outer iterations", true, "10" );
+  parms.add_parameter( 'l', COMMAND_LINE_FLOAT,  1, "Regularization weight (lambda)", true, "1.0" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  // NOTE(review): presumably read_nd_array hands back an empty pointer when the
+  // file cannot be read -- confirm. The check is harmless otherwise.
+  if( !host_samples.get() ){
+    cout << endl << "Unable to read the input samples. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(host_samples->get_number_of_dimensions() == 2) ){
+    cout << endl << "Samples ndarray is not two-dimensional (samples/profile x #profiles). Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_sb_iterations = parms.get_parameter('I')->get_int_value();
+
+  // Configuration from the loaded data
+  unsigned int num_profiles = host_samples->get_size(1);
+  unsigned int samples_per_profile = host_samples->get_size(0);
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+  _real lambda = (_real)parms.get_parameter('l')->get_float_value();
+
+  // Upload host data to device
+  timer = new GPUTimer("Uploading samples to device");
+  cuNDArray<_complext> samples(host_samples.get());
+  delete timer;
+
+  // Reshape the data array to a one-dimensional array (we have no batch dimension)
+  std::vector<size_t> sample_dims;
+  sample_dims.push_back(samples.get_number_of_elements());
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, num_profiles,  1 );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, num_profiles, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  // Define and setup NFFT encoding operator
+  boost::shared_ptr< cuNFFTOperator<_real,2> > E( new cuNFFTOperator<_real,2>() );
+  E->set_weight(lambda);
+
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw);
+
+  // Set image dimensions
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  E->set_domain_dimensions(&image_dims);
+  E->set_codomain_dimensions(&sample_dims);
+
+  // Setup regularization operators (one partial derivative operator per image dimension)
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> >
+    Rx( new cuPartialDerivativeOperator<_complext,2>(0) );
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(&image_dims);
+  Rx->set_codomain_dimensions(&image_dims);
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,2> >
+    Ry( new cuPartialDerivativeOperator<_complext,2>(1) );
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(&image_dims);
+  Ry->set_codomain_dimensions(&image_dims);
+
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  E->preprocess( traj.get() );
+  delete timer;
+
+  // Setup split bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_max_outer_iterations( num_sb_iterations );
+  sb.set_max_inner_iterations( 1 );
+  sb.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+
+  sb.set_encoding_operator( E);
+  sb.add_regularization_group_operator( Rx );
+  sb.add_regularization_group_operator( Ry );
+  sb.add_group();
+
+  // Setup inner conjugate gradient solver (-i controls its iteration count)
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+
+  // Solve
+  boost::shared_ptr< cuNDArray<_complext> > cgresult;
+  {
+    // Scoped timer: reports when the enclosing block is left
+    GPUTimer timer("GPU Conjugate Gradient solve");
+    cgresult = sb.solve(&samples);
+  }
+
+  // Do not dereference an empty result
+  if( !cgresult.get() ){
+    cout << endl << "The solver did not return a result. Quitting.\n" << endl;
+    return 1;
+  }
+
+  //
+  // Output result
+  //
+
+  timer = new GPUTimer("Output result to disk");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_image = cgresult->to_host();
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(cgresult.get())->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/CMakeLists.txt b/apps/standalone/gpu/mri/nfft/CMakeLists.txt
new file mode 100644
index 0000000..68dd4c7
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Descend into the two NFFT sample application directories
+add_subdirectory(2d)
+add_subdirectory(ms2d)
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt b/apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt
new file mode 100644
index 0000000..5f2705c
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/CMakeLists.txt
@@ -0,0 +1,7 @@
+# One executable per source file: nfft_ms from nfft_main.cpp, nffth_ms from nffth_main.cpp
+add_executable(nfft_ms nfft_main.cpp)
+add_executable(nffth_ms nffth_main.cpp)
+
+# Both executables link against the same set of libraries
+target_link_libraries(nfft_ms gpucore gpunfft hostutils ${CUDA_LIBRARIES})
+target_link_libraries(nffth_ms gpucore gpunfft hostutils ${CUDA_LIBRARIES})
+
+# Install both executables into bin
+install(TARGETS nfft_ms nffth_ms DESTINATION bin)
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp b/apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp
new file mode 100644
index 0000000..26a6d47
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/nfft_main.cpp
@@ -0,0 +1,148 @@
+/*
+  Sample application of the NFFT toolbox: standalone "inverse gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden ratio based radial trajectories 
+  and outputs from a single precision multislice input image ndarrays of the corresponding samples, trajectory, and density compensation weights.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+// Sample application: forward NFFT ("inverse gridding") of a 2d multislice image.
+//
+// Reads a three-dimensional real image from the file given by -d, uploads and
+// normalizes it, converts it to complex, computes golden ratio radial
+// trajectories, runs the forward NFFT and writes the resulting samples to the
+// file given by -r.
+//
+// Returns 0 on success and 1 when required parameters are missing or the input
+// image cannot be used.
+int main( int argc, char** argv)
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "samples.cplx" );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "#profiles/frame", true );
+  parms.add_parameter( 's', COMMAND_LINE_INT,    1, "#samples/profile", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "#frames/reconstruction (a negative value means all)", true, "-1" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load image from disk
+  timer = new GPUTimer("Loading image from disk");
+  boost::shared_ptr< hoNDArray<_real> > host_image = read_nd_array<_real>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  // NOTE(review): presumably read_nd_array hands back an empty pointer when the
+  // file cannot be read -- confirm. The check is harmless otherwise.
+  if( !host_image.get() ){
+    cout << endl << "Unable to read the input image. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(host_image->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input image is not three-dimensional (2d multislice). Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int samples_per_profile = parms.get_parameter('s')->get_int_value();
+  int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  // Configuration from the loaded image
+  uint64d2 matrix_size = from_std_vector<size_t,2>(*(host_image->get_dimensions().get()));
+  unsigned int num_frames = host_image->get_size(2);
+
+  // Only square images are supported; previously the message was printed but
+  // the program carried on regardless.
+  if( matrix_size.vec[0] != matrix_size.vec[1] ){
+    cout << endl << "For this samples application we only allow square input images. "
+         << endl << "The only reason being that only one oversampled matrix size is specified and the oversampling ratio must be consistent." << endl;
+    return 1;
+  }
+
+  // Clamp the number of frames to the valid range (negative means all)
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_frames;
+  if( (unsigned int)frames_per_reconstruction > num_frames ) frames_per_reconstruction = num_frames;
+
+  // Upload host image to device, normalize, and convert to complex type
+  timer = new GPUTimer("Uploading, normalizing and converting to complex");
+  cuNDArray<_real> _image(host_image.get());
+  normalize( &_image, 1.0f );
+  boost::shared_ptr< cuNDArray<_complext> > image = real_to_complex<_complext>( &_image );
+  delete timer;
+
+  // Setup resulting samples array (samples/profile x profiles/frame x frames)
+  vector<size_t> samples_dims;
+  samples_dims.push_back( samples_per_profile );
+  samples_dims.push_back( profiles_per_frame );
+  samples_dims.push_back( frames_per_reconstruction );
+  cuNDArray<_complext> samples; samples.create(&samples_dims);
+
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute trajectories
+  timer = new GPUTimer("Computing golden ratio radial trajectories");
+  boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>( samples_per_profile, profiles_per_frame, frames_per_reconstruction );
+  delete timer;
+
+  // Preprocess
+  timer = new GPUTimer("NFFT preprocessing");
+  plan.preprocess( traj.get(), plan_type::NFFT_PREP_C2NC );
+  delete timer;
+
+  // Forward transform (no density compensation weights are passed)
+  timer = new GPUTimer("Computing nfft");
+  plan.compute( image.get(), &samples, 0x0, plan_type::NFFT_FORWARDS_C2NC );
+  delete timer;
+
+  //
+  // Output result
+  //
+
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = samples.to_host();
+  write_nd_array<_complext>( host_samples.get(), (char*)parms.get_parameter('r')->get_string_value() );
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp b/apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp
new file mode 100644
index 0000000..13b8713
--- /dev/null
+++ b/apps/standalone/gpu/mri/nfft/ms2d/nffth_main.cpp
@@ -0,0 +1,173 @@
+/*
+
+  Sample application of the NFFT toolbox: standalone "gridding" example.
+
+  -----------
+
+  The nfft is written generically and templatized to
+  - transform arbitrary trajectories
+  - transform an arbitrary number of dimensions (currently instantiated for 1d/2d/3d/4d)
+  - support both single and double precision
+
+  General principles of the implementation can be found in:
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+
+  This example programme of the nfft utilizes golden ratio based radial trajectories 
+  and outputs gridded images from 2D multislice input ndarrays of the corresponding samples, trajectory, and density compensation weights.
+
+*/
+
+#include "cuNFFT.h"
+#include "radial_utilities.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+#include "complext.h"
+
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+typedef cuNFFT_plan<_real,2> plan_type;
+
+// Upload samples for one reconstruction from host to device
+// Upload samples for one reconstruction from host to device.
+//
+// Creates a one-dimensional device array of samples_per_reconstruction
+// elements and fills it from host_data, starting at element offset
+// reconstruction*samples_per_reconstruction.
+//
+// NOTE(review): the status returned by cudaMemcpy is still not inspected.
+boost::shared_ptr< cuNDArray<_complext> >
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction,
+             hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction);
+
+  // Let the smart pointer own the array from the start, so it is released
+  // even if create() were to throw.
+  boost::shared_ptr< cuNDArray<_complext> > data( new cuNDArray<_complext>() );
+  data->create( &dims );
+
+  // Widen before multiplying so the element offset cannot wrap around in 32 bits
+  const size_t num_elements = samples_per_reconstruction;
+  const size_t host_offset = size_t(reconstruction)*num_elements;
+
+  cudaMemcpy( data->get_data_ptr(),
+              host_data->get_data_ptr()+host_offset,
+              num_elements*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return data;
+}
+
+// Sample application: adjoint NFFT ("gridding") of 2d multislice radial samples.
+//
+// Reads a three-dimensional samples array (samples/profile x #profiles/frame x
+// #frames) from the file given by -d, grids it in groups of
+// frames_per_reconstruction frames and writes the complex images to the file
+// given by -r and their magnitude to "result.real".
+//
+// Returns 0 on success and 1 when required parameters are missing or the input
+// cannot be used.
+int main( int argc, char** argv)
+{
+
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Input samples file name (.cplx)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Output image file name (.cplx)", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "#frames/reconstruction (a negative value means all)", true, "-1" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("Loading samples from disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_samples = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  // NOTE(review): presumably read_nd_array hands back an empty pointer when the
+  // file cannot be read -- confirm. The check is harmless otherwise.
+  if( !host_samples.get() ){
+    cout << endl << "Unable to read the input samples. Quitting.\n" << endl;
+    return 1;
+  }
+
+  if( !(host_samples->get_number_of_dimensions() == 3) ){
+    cout << endl << "Samples ndarray is not three-dimensional (samples/profile x #profiles/frame x #frames). Quitting.\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+
+  // Configuration from the loaded data
+  unsigned int samples_per_profile = host_samples->get_size(0);
+  unsigned int profiles_per_frame = host_samples->get_size(1);
+  unsigned int num_frames = host_samples->get_size(2);
+
+  // Clamp the number of frames (negative means all) BEFORE any size is derived
+  // from it; with the default of -1 the derived sizes would otherwise wrap.
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_frames;
+  if( (unsigned int)frames_per_reconstruction > num_frames ) frames_per_reconstruction = num_frames;
+
+  // A value of zero would lead to divisions by zero below
+  if( frames_per_reconstruction == 0 ){
+    cout << endl << "The number of frames per reconstruction is zero. Quitting.\n" << endl;
+    return 1;
+  }
+
+  unsigned int profiles_per_reconstruction = profiles_per_frame*frames_per_reconstruction;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+  unsigned int num_reconstructions = num_frames/frames_per_reconstruction;
+  _real alpha = (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0];
+
+  // Setup resulting image array (frames that do not fill a whole reconstruction are dropped)
+  vector<size_t> image_dims = to_std_vector(matrix_size);
+  image_dims.push_back(num_reconstructions*frames_per_reconstruction);
+  cuNDArray<_complext> image(&image_dims);
+  clear(&image);
+
+  // Initialize plan
+  timer = new GPUTimer("Initializing plan");
+  plan_type plan( matrix_size, matrix_size_os, kernel_width );
+  delete timer;
+
+  // Compute density compensation weights
+  timer = new GPUTimer("Computing density compensation weights");
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, alpha, _real(1)/((_real)samples_per_profile/(_real)matrix_size.vec[0]) );
+  delete timer;
+
+  for( unsigned int iteration = 0; iteration < num_reconstructions; iteration++ ) {
+
+    // Compute trajectories
+    timer = new GPUTimer("Computing golden ratio radial trajectories");
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, iteration*profiles_per_reconstruction );
+    delete timer;
+
+    // Preprocess
+    timer = new GPUTimer("NFFT preprocessing");
+    plan.preprocess( traj.get(), plan_type::NFFT_PREP_NC2C );
+    delete timer;
+
+    // Upload data
+    timer = new GPUTimer("Upload data");
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( iteration, samples_per_reconstruction, host_samples.get() );
+    delete timer; // was missing: the timer leaked on every iteration
+
+    // Array created on top of the slice of 'image' belonging to this reconstruction
+    vector<size_t> tmp_image_dims = to_std_vector(matrix_size);
+    tmp_image_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp_image; tmp_image.create(&tmp_image_dims, image.get_data_ptr()+iteration*prod(matrix_size)*frames_per_reconstruction);
+
+    // Gridder
+    timer = new GPUTimer("Computing adjoint nfft (gridding)");
+    plan.compute( data.get(), &tmp_image, dcw.get(), plan_type::NFFT_BACKWARDS_NC2C );
+    delete timer;
+  }
+
+  //
+  // Output result
+  //
+
+  timer = new GPUTimer("Output result to disk");
+  boost::shared_ptr< hoNDArray<_complext> > host_image = image.to_host();
+  // Write to the output file (-r); the input file (-d) used to be overwritten here
+  write_nd_array<_complext>( host_image.get(), (char*)parms.get_parameter('r')->get_string_value() );
+  write_nd_array<_real>( abs(&image)->to_host().get(), "result.real" );
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/CMakeLists.txt b/apps/standalone/gpu/mri/sense/CMakeLists.txt
new file mode 100644
index 0000000..11e472d
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Only the noncartesian subdirectory is added; the cartesian one is commented out
+#add_subdirectory(cartesian)
+add_subdirectory(noncartesian)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/CMakeLists.txt
new file mode 100644
index 0000000..968542a
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Add toolboxes/mri/pmri/gpu to the include path before descending into radial
+include_directories( 
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+)
+
+add_subdirectory(radial)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/.gitignore b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/.gitignore
new file mode 100644
index 0000000..7e4edfd
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/.gitignore
@@ -0,0 +1 @@
+radial_sense
\ No newline at end of file
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/CMakeLists.txt
new file mode 100644
index 0000000..9c78221
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Radial SENSE sample executables, one per main_*.cpp source file
+add_executable(radial_sense_cg main_cg.cpp)
+add_executable(radial_sense_sbc main_sbc.cpp)
+add_executable(radial_sense_gpbb main_gpbb.cpp)
+MESSAGE("CUDA LIBRARIES:  ${CUDA_LIBRARIES}")
+
+# All three executables link against the same set of libraries
+target_link_libraries(radial_sense_cg gpuoperators cpucore gpucore gpuparallelmri gpunfft hostutils gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(radial_sense_sbc gpuoperators cpucore gpucore gpuparallelmri gpunfft hostutils gpusolvers ${CUDA_LIBRARIES})
+target_link_libraries(radial_sense_gpbb gpuoperators cpucore gpucore gpuparallelmri gpunfft hostutils gpusolvers ${CUDA_LIBRARIES})
+
+# Install each executable exactly once (the former rules listed radial_sense_sbc repeatedly)
+install(TARGETS radial_sense_cg radial_sense_sbc radial_sense_gpbb DESTINATION bin)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_cg.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_cg.cpp
new file mode 100644
index 0000000..e62b605
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_cg.cpp
@@ -0,0 +1,289 @@
+// Gadgetron includes
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "hoNDArray_fileio.h"
+#include "vector_td_utilities.h"
+#include "cuImageOperator.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuCgSolver.h"
+#include "b1_map.h"
+#include "parameterparser.h"
+#include "GPUTimer.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+const bool use_atomics = false;
+
+// Upload samples for one reconstruction from host to device
+// Upload samples for one reconstruction from host to device.
+//
+// Creates a device array of samples_per_reconstruction x num_coils elements.
+// For coil i the source starts at element offset
+// i*total_samples_per_coil + reconstruction*samples_per_reconstruction of
+// host_data, and the destination at i*samples_per_reconstruction.
+//
+// NOTE(review): the status returned by cudaMemcpy is still not inspected.
+boost::shared_ptr< cuNDArray<_complext> >
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+
+  // Let the smart pointer own the array from the start, so it is released
+  // even if create() were to throw.
+  boost::shared_ptr< cuNDArray<_complext> > data( new cuNDArray<_complext>() );
+  data->create( &dims );
+
+  // Widen before multiplying so the element offsets cannot wrap around in 32 bits
+  const size_t num_elements = samples_per_reconstruction;
+  const size_t coil_stride = total_samples_per_coil;
+  const size_t first_sample = size_t(reconstruction)*num_elements;
+
+  for( unsigned int i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*num_elements,
+                host_data->get_data_ptr()+i*coil_stride+first_sample,
+                num_elements*sizeof(_complext), cudaMemcpyHostToDevice );
+
+  return data;
+}
+
+// Sample application: radial SENSE reconstruction with a preconditioned
+// conjugate gradient solver.
+//
+// Reads a three-dimensional data array (#samples/profile x #profiles x #coils)
+// from the file given by -d, accumulates all frames in a buffer, estimates
+// coil sensitivity maps from the accumulated coil images, derives a
+// regularization image and preconditioning weights, reconstructs the data in
+// groups of frames_per_reconstruction frames and writes the complex result to
+// the file given by -r and its magnitude to "result.real".
+//
+// Returns 0 on success and 1 when required parameters are missing, the input
+// cannot be used, or the solver returns no result.
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Kappa", true, "0.3" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  // NOTE(review): presumably read_nd_array hands back an empty pointer when the
+  // file cannot be read -- confirm. The check is harmless otherwise.
+  if( !host_data.get() ){
+    cout << endl << "Unable to read the input data. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real kappa = parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+
+  // Read as a signed value: stored in an unsigned variable the "negative
+  // meaning all" test could never be true.
+  int requested_frames = parms.get_parameter('f')->get_int_value();
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+
+  // Zero profiles per frame would lead to divisions by zero below
+  if( profiles_per_frame == 0 ){
+    cout << endl << "The number of profiles per frame is zero. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Negative or too large requests are clamped to all frames available
+  const unsigned int max_frames = num_profiles / profiles_per_frame;
+  unsigned int frames_per_reconstruction = max_frames;
+  if( requested_frames >= 0 && (unsigned int)requested_frames < max_frames )
+    frames_per_reconstruction = requested_frames;
+
+  // Zero frames per reconstruction would lead to a division by zero below
+  if( frames_per_reconstruction == 0 ){
+    cout << endl << "The number of frames per reconstruction is zero. Quitting!\n" << endl;
+    return 1;
+  }
+
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction: " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction: " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction: " << samples_per_reconstruction << endl << endl;
+
+  // Set density compensation weights
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0],
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2,use_atomics> > E
+    ( new cuNonCartesianSenseOperator<_real,2,use_atomics>() );
+
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw) ;
+
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2,use_atomics> > rhs_buffer
+    ( new cuSenseBuffer<_real,2,use_atomics>() );
+
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+
+  // Fill rhs buffer
+  //
+
+  timer = new GPUTimer("Filling rhs buffer");
+
+  // Go through all the data, one frame at a time
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  delete timer;
+
+
+  // Estimate CSM
+  //
+
+  timer = new GPUTimer("Estimating csm");
+
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+  E->set_csm(csm);
+
+  delete timer;
+
+
+  // Define regularization image operator
+  //
+
+  timer = new GPUTimer("Computing regularization");
+
+  std::vector<size_t> image_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> reg_image = cuNDArray<_complext>(&image_dims);
+
+  E->mult_csm_conj_sum( acc_images.get(), &reg_image );
+  acc_images.reset();
+
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() );
+  R->set_weight( kappa );
+  R->compute( &reg_image );
+
+  delete timer;
+
+  // Define preconditioning weights
+  //
+
+  timer = new GPUTimer("Computing preconditioning weights");
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+  *R_diag *= kappa;
+  *_precon_weights += *R_diag;
+  R_diag.reset();
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  delete timer;
+
+  //
+  // Setup radial SENSE reconstructions
+  //
+
+  // Setup conjugate gradient solver
+  cuCgSolver<_complext> cg;
+  cg.set_preconditioner ( D );  // preconditioning matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
+
+  // Reconstruct all SENSE frames iteratively
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  image_dims.push_back(frames_per_reconstruction*num_reconstructions);
+  cuNDArray<_complext> result = cuNDArray<_complext>(&image_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction.");
+
+  // Define image dimensions
+  image_dims = to_std_vector(matrix_size);
+  image_dims.push_back(frames_per_reconstruction);
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // Determine trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Pass image dimensions to encoding operator
+    E->set_domain_dimensions(&image_dims);
+    E->set_codomain_dimensions(data->get_dimensions().get());
+
+    // Set current trajectory and trigger NFFT preprocessing
+    E->preprocess(traj.get());
+
+    //
+    // Invoke conjugate gradient solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > cgresult;
+    {
+      // Scoped timer: reports when the enclosing block is left
+      GPUTimer timer("GPU Conjugate Gradient solve");
+      cgresult = cg.solve(data.get());
+    }
+
+    if( !cgresult.get() ){
+      // Release the outer "Full SENSE reconstruction." timer before bailing out
+      delete timer;
+      return 1;
+    }
+
+    // Copy cgresult to overall result ('out' is created on top of the matching slice of 'result')
+    cuNDArray<_complext> out(&image_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );
+    out = *(cgresult.get());
+  }
+
+  delete timer;
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_gpbb.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_gpbb.cpp
new file mode 100644
index 0000000..745017a
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_gpbb.cpp
@@ -0,0 +1,284 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuGpBbSolver.h"
+#include "cuTvOperator.h"
+#include "cuTvPicsOperator.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real;
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload the samples of reconstruction number 'reconstruction' (samples_per_reconstruction samples for each of num_coils coils) from host_data into a newly created device array
+boost::shared_ptr< cuNDArray<_complext> >
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+  for( size_t i=0; i<num_coils; i++ ) // size_t index: the offsets below are no longer computed in (wrapping) unsigned int
+    if( cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction,
+		    host_data->get_data_ptr()+i*total_samples_per_coil+size_t(reconstruction)*samples_per_reconstruction,
+		    samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice ) != cudaSuccess )
+      cerr << "upload_data: cudaMemcpy failed for coil " << i << ": " << cudaGetErrorString(cudaGetLastError()) << endl; // report, but keep the original control flow
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+  // Standalone golden-ratio radial SENSE reconstruction driven by cuGpBbSolver with optional
+  // TV and PICS regularization. Returns 0 on success and 1 on invalid input or solver failure.
+  // First step: parse the command line.
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "10" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "2e-7" );
+  parms.add_parameter( 'A', COMMAND_LINE_FLOAT,  1, "Alpha in [0;1] (for PICS)", true, "0.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  if( !host_data.get() || !(host_data->get_number_of_dimensions() == 3) ){ // also reject an empty pointer instead of dereferencing it
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  int frames_requested = parms.get_parameter('f')->get_int_value(); // kept signed: a negative value means "all frames"
+
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+  _real alpha = (_real) parms.get_parameter('A')->get_float_value();
+
+  if( alpha>1 ) alpha = 1;
+  if( alpha<0 ) alpha = 0;
+
+  // Silent correction of invalid command line parameters (clamp to valid range); zero counts cannot be corrected
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( profiles_per_frame == 0 || frames_requested == 0 ){ cout << endl << "Profiles per frame (-p) and frames per reconstruction (-f) must be non-zero, and the data must contain profiles. Quitting!\n" << endl; return 1; }
+  unsigned int frames_per_reconstruction = num_profiles / profiles_per_frame; // default and upper bound: all complete frames
+  if( frames_requested > 0 && (unsigned int)frames_requested < frames_per_reconstruction ) frames_per_reconstruction = frames_requested;
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction " << samples_per_reconstruction << endl << endl;
+
+  // Density compensation weights are constant throughout all reconstructions
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0],
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+  E->set_dcw(dcw);
+
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+
+  //
+  // Compute CSM using accumulation in the rhs buffer
+  //
+
+  timer = new GPUTimer("CSM and regularization estimation");
+
+  // Go through all the data, one frame at a time
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  // Estimate csm
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  *acc_images *= rhs_buffer->get_normalization_factor();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+  E->set_csm(csm);
+
+  std::vector<size_t> reg_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> _reg_image = cuNDArray<_complext>(&reg_dims);
+  E->mult_csm_conj_sum( acc_images.get(), &_reg_image );
+
+  // Duplicate the regularization image to 'frames_per_reconstruction' frames
+  boost::shared_ptr<cuNDArray<_complext> > reg_image = expand( &_reg_image, frames_per_reconstruction );
+
+  acc_images.reset();
+
+  // Define preconditioning weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  boost::shared_ptr< std::vector<size_t> > recon_dims( new std::vector<size_t> );
+  *recon_dims = to_std_vector(matrix_size); recon_dims->push_back(frames_per_reconstruction);
+
+  delete timer;
+
+  //
+  // Setup radial SENSE reconstructions
+  //
+
+  vector<size_t> data_dims;
+  data_dims.push_back(samples_per_reconstruction); data_dims.push_back(num_coils);
+
+  E->set_domain_dimensions(recon_dims.get());
+  E->set_codomain_dimensions(&data_dims);
+
+  // Setup the cuGpBbSolver
+  cuGpBbSolver<_complext> solver;
+
+  // Add "TV" regularization
+  if( (alpha<1.0f) && (lambda>0.0f)){
+    boost::shared_ptr<cuTvOperator<_complext,3> > TV(new cuTvOperator<_complext,3>);
+    TV->set_weight(lambda*(1.0f-alpha));
+    solver.add_nonlinear_operator(TV);
+  }
+
+  // Add "PICS" regularization
+  boost::shared_ptr<cuTvPicsOperator<_complext,3> > PICS;
+  if( (alpha>0.0f) && (lambda>0.0f)){
+    PICS = boost::shared_ptr<cuTvPicsOperator<_complext,3> >(new cuTvPicsOperator<_complext,3>);
+    PICS->set_weight(lambda*alpha);
+    PICS->set_prior(reg_image);
+    solver.add_nonlinear_operator(PICS);
+  }
+
+  solver.set_encoding_operator( E );
+  solver.set_preconditioner ( D );
+  solver.set_max_iterations( num_iterations );
+  solver.set_output_mode( cuGpBbSolver<_complext>::OUTPUT_VERBOSE );
+  solver.set_x0( reg_image );
+
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  std::vector<size_t> res_dims = to_std_vector(matrix_size);
+  res_dims.push_back(frames_per_reconstruction*num_reconstructions);
+  cuNDArray<_complext> result = cuNDArray<_complext>(&res_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction with TV regularization.");
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // Determine trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Set current trajectory and trigger NFFT preprocessing
+    E->preprocess(traj.get());
+
+    //
+    // Invoke the cuGpBbSolver (the timer label below still reads "Split Bregman")
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > solve_result;
+    {
+      GPUTimer timer("GPU constrained Split Bregman solve");
+      solve_result = solver.solve(data.get());
+    }
+    if( !solve_result.get() ) return 1; // never dereference an empty result
+    vector<size_t> tmp_dims = to_std_vector(matrix_size); tmp_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp(&tmp_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+    // Copy solve_result to result (pointed to by tmp)
+    tmp = *solve_result;
+  }
+
+  delete timer;
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_sbc.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_sbc.cpp
new file mode 100644
index 0000000..6a7b229
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio/main_sbc.cpp
@@ -0,0 +1,332 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuCgSolver.h"
+#include "cuSbcCgSolver.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload the samples of reconstruction number 'reconstruction' (samples_per_reconstruction samples for each of num_coils coils) from host_data into a newly created device array
+boost::shared_ptr< cuNDArray<_complext> >
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  cuNDArray<_complext> *data = new cuNDArray<_complext>(); data->create( &dims );
+  for( size_t i=0; i<num_coils; i++ ) // size_t index: the offsets below are no longer computed in (wrapping) unsigned int
+    if( cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction,
+		    host_data->get_data_ptr()+i*total_samples_per_coil+size_t(reconstruction)*samples_per_reconstruction,
+		    samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice ) != cudaSuccess )
+      cerr << "upload_data: cudaMemcpy failed for coil " << i << ": " << cudaGetErrorString(cudaGetLastError()) << endl; // report, but keep the original control flow
+  return boost::shared_ptr< cuNDArray<_complext> >(data);
+}
+
+int main(int argc, char** argv)
+{
+  // Standalone golden-ratio radial SENSE reconstruction driven by cuSbcCgSolver with "TV" and
+  // "PICCS" regularization groups. Returns 0 on success and 1 on invalid input or solver failure.
+  // First step: parse the command line.
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction (negative meaning all)", true, "-1" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of cg iterations", true, "10" );
+  parms.add_parameter( 'I', COMMAND_LINE_INT,    1, "Number of sb inner iterations", true, "1" );
+  parms.add_parameter( 'O', COMMAND_LINE_INT,    1, "Number of sb outer iterations", true, "10" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'M', COMMAND_LINE_FLOAT,  1, "Mu", true, "1.0" );
+  parms.add_parameter( 'L', COMMAND_LINE_FLOAT,  1, "Lambda", true, "2.0" );
+  parms.add_parameter( 'A', COMMAND_LINE_FLOAT,  1, "Alpha in [0;1] (for PICCS)", true, "0.5" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  GPUTimer *timer;
+
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+
+  if( !host_data.get() || !(host_data->get_number_of_dimensions() == 3) ){ // also reject an empty pointer instead of dereferencing it
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  unsigned int num_cg_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int num_sb_inner_iterations = parms.get_parameter('I')->get_int_value();
+  unsigned int num_sb_outer_iterations = parms.get_parameter('O')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  int frames_requested = parms.get_parameter('f')->get_int_value(); // kept signed: a negative value means "all frames"
+
+  _real mu = (_real) parms.get_parameter('M')->get_float_value();
+  _real lambda = (_real) parms.get_parameter('L')->get_float_value();
+  _real alpha = (_real) parms.get_parameter('A')->get_float_value();
+
+  if( alpha>1 ) alpha = 1;
+  if( alpha<0 ) alpha = 0;
+
+  // Silent correction of invalid command line parameters (clamp to valid range); zero counts cannot be corrected
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( profiles_per_frame == 0 || frames_requested == 0 ){ cout << endl << "Profiles per frame (-p) and frames per reconstruction (-f) must be non-zero, and the data must contain profiles. Quitting!\n" << endl; return 1; }
+  unsigned int frames_per_reconstruction = num_profiles / profiles_per_frame; // default and upper bound: all complete frames
+  if( frames_requested > 0 && (unsigned int)frames_requested < frames_per_reconstruction ) frames_per_reconstruction = frames_requested;
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction " << samples_per_reconstruction << endl << endl;
+
+  // Density compensation weights are constant throughout all reconstructions
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0],
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );
+  E->set_weight( mu );
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+  E->set_dcw(dcw);
+
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+
+  //
+  // Compute CSM using accumulation in the rhs buffer
+  //
+
+  timer = new GPUTimer("CSM and regularization estimation");
+
+  // Go through all the data, one frame at a time
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  // Estimate csm
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  *acc_images *= rhs_buffer->get_normalization_factor();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+  E->set_csm(csm);
+
+  std::vector<size_t> reg_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> _reg_image = cuNDArray<_complext>(&reg_dims);
+  E->mult_csm_conj_sum( acc_images.get(), &_reg_image );
+
+  // Duplicate the regularization image to 'frames_per_reconstruction' frames
+  boost::shared_ptr<cuNDArray<_complext> > reg_image = expand( &_reg_image, frames_per_reconstruction );
+
+  acc_images.reset();
+
+  // Define preconditioning weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+
+  boost::shared_ptr< std::vector<size_t> > recon_dims( new std::vector<size_t> );
+  *recon_dims = to_std_vector(matrix_size); recon_dims->push_back(frames_per_reconstruction);
+
+  // Define regularization operators: one partial derivative per dimension (0, 1, 2).
+  // We need "a pair" for PICCS: Rx/Ry/Rz weighted (1-alpha)*lambda, Rx2/Ry2/Rz2 weighted alpha*lambda
+  //
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rx( new cuPartialDerivativeOperator<_complext,3>(0) );
+  Rx->set_weight( (1.0f-alpha)*lambda );
+  Rx->set_domain_dimensions(recon_dims.get());
+  Rx->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Ry( new cuPartialDerivativeOperator<_complext,3>(1) );
+  Ry->set_weight( (1.0f-alpha)*lambda );
+  Ry->set_domain_dimensions(recon_dims.get());
+  Ry->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rz( new cuPartialDerivativeOperator<_complext,3>(2) );
+  Rz->set_weight( (1.0f-alpha)*lambda );
+  Rz->set_domain_dimensions(recon_dims.get());
+  Rz->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rx2( new cuPartialDerivativeOperator<_complext,3>(0) );
+  Rx2->set_weight( alpha*lambda );
+  Rx2->set_domain_dimensions(recon_dims.get());
+  Rx2->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Ry2( new cuPartialDerivativeOperator<_complext,3>(1) );
+  Ry2->set_weight( alpha*lambda );
+  Ry2->set_domain_dimensions(recon_dims.get());
+  Ry2->set_codomain_dimensions(recon_dims.get());
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_complext,3> >
+    Rz2( new cuPartialDerivativeOperator<_complext,3>(2) );
+  Rz2->set_weight( alpha*lambda );
+  Rz2->set_domain_dimensions(recon_dims.get());
+  Rz2->set_codomain_dimensions(recon_dims.get());
+
+  delete timer;
+
+  //
+  // Setup radial SENSE reconstructions
+  //
+
+  vector<size_t> data_dims;
+  data_dims.push_back(samples_per_reconstruction); data_dims.push_back(num_coils);
+
+  E->set_domain_dimensions(recon_dims.get());
+  E->set_codomain_dimensions(&data_dims);
+
+  // Setup split-Bregman solver
+  cuSbcCgSolver<_complext> sb;
+  sb.set_encoding_operator( E );
+
+  // Add "TV" regularization
+  if( alpha<1.0 ){
+    sb.add_regularization_group_operator( Rx );
+    sb.add_regularization_group_operator( Ry );
+    sb.add_regularization_group_operator( Rz );
+    sb.add_group();
+  }
+
+  // Add "PICCS" regularization (the group is given reg_image as its prior)
+  if( alpha > 0.0 ){
+    sb.add_regularization_group_operator( Rx2 );
+    sb.add_regularization_group_operator( Ry2 );
+    sb.add_regularization_group_operator( Rz2 );
+    sb.add_group(reg_image);
+  }
+
+  sb.set_max_outer_iterations(num_sb_outer_iterations);
+  sb.set_max_inner_iterations(num_sb_inner_iterations);
+  sb.set_output_mode( cuSbcCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  sb.get_inner_solver()->set_preconditioner ( D );
+  sb.get_inner_solver()->set_max_iterations( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode( cuCgSolver<_complext>::OUTPUT_WARNINGS );
+
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  std::vector<size_t> res_dims = to_std_vector(matrix_size);
+  res_dims.push_back(frames_per_reconstruction*num_reconstructions);
+  cuNDArray<_complext> result = cuNDArray<_complext>(&res_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction with TV regularization.");
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // Determine trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    // Set current trajectory and trigger NFFT preprocessing
+    E->preprocess(traj.get());
+
+    //
+    // Split-Bregman solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > sbresult;
+    {
+      GPUTimer timer("GPU constrained Split Bregman solve");
+      sbresult = sb.solve(data.get());
+    }
+    if( !sbresult.get() ) return 1; // never dereference an empty result
+    vector<size_t> tmp_dims = to_std_vector(matrix_size); tmp_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> tmp(&tmp_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+    // Copy sbresult to result (pointed to by tmp)
+    tmp = *sbresult;
+  }
+
+  delete timer;
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+
+  delete timer;
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/CMakeLists.txt
new file mode 100644
index 0000000..70c502d
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/CMakeLists.txt
@@ -0,0 +1,31 @@
+include( ${QT_USE_FILE} )
+
+# Both the binary and the source directory must be on the include path, because the build process generates header files
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${OPENGL_INCLUDE_DIR}
+  ${GLUT_INCLUDE_DIR}
+  ${GLEW_INCLUDE_DIR}
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR} 
+)
+
+set(UI_UIFILES reconBaseWidget.ui radialSenseAppBaseMainWidget.ui)
+qt4_wrap_ui( UI_HEADERS ${UI_UIFILES} )
+ 
+set(UI_MOC_HEADERS radialSenseAppMainWidget.h reconWidget.h GLReconWidget.h)
+qt4_wrap_cpp (UI_MOC_OUTFILES ${UI_MOC_HEADERS})
+
+add_executable(radial_sense_gr_gui main.cpp ${UI_MOC_OUTFILES}
+radialSenseAppMainWidget.cpp reconWidget.cpp GLReconWidget.cpp ${UI_HEADERS} )
+
+target_link_libraries(radial_sense_gr_gui gpucore gpuparallelmri
+gpunfft hostutils gpusolvers gpuoperators ${CUDA_LIBRARIES} ${QT_QTGUI_LIBRARY} ${GLEW_LIBRARY}
+${QT_QTCORE_LIBRARY} ${QT_QTOPENGL_LIBRARY} ${OPENGL_gl_LIBRARY} )
+
+if (WIN32)
+set_target_properties( radial_sense_gr_gui PROPERTIES LINK_FLAGS "/FORCE:MULTIPLE") 
+endif (WIN32)
+
+install(TARGETS radial_sense_gr_gui DESTINATION bin)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.cpp
new file mode 100644
index 0000000..e195790
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.cpp
@@ -0,0 +1,222 @@
+#include <GL/glew.h>
+
+#include "GLReconWidget.h"
+#include "UIconstants.h"
+
+#include <cuda_runtime_api.h>
+#include <cuda_gl_interop.h>
+
+#include <stdio.h>
+
+//MSH: Ripped from cutil.h to remove dependency, replace
+#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } }
+
+#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);                                            \
+
+
+GLReconWidget::GLReconWidget(QWidget *parent) : QGLWidget(parent) // Qt widget wrapping a cuGLReconWidget
+{
+  cudaWidget = new cuGLReconWidget( MATRIX_SIZE_INITIAL_VALUE, MATRIX_SIZE_INITIAL_VALUE ); // square initial matrix; NOTE(review): no matching delete is visible in this file
+}
+
+void GLReconWidget::setMatrixSize( unsigned int width, unsigned int height ) // store the new image size and rebuild the GL/CUDA buffers for it
+{
+  cudaWidget->width = width;
+  cudaWidget->height = height;
+  
+  cudaWidget->initializePBO(); // creates a PBO and texture of the new width x height
+}
+
+void GLReconWidget::initializeGL() // initialize GLEW, verify the required OpenGL features, then create the display buffers
+{
+  glewInit(); // NOTE(review): return value is not checked
+  
+  if (!glewIsSupported("GL_VERSION_2_0 GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) {
+    fprintf(stderr, "Required OpenGL extensions missing.");
+    exit(1); // terminates the whole application
+  }
+  
+  cudaWidget->initializePBO();
+}
+
+void GLReconWidget::paintGL() // paint handler: delegates the drawing to the cuGLReconWidget
+{
+  cudaWidget->display();
+}
+
+void GLReconWidget::resizeGL( int w, int h ) // resize handler: viewport covers the w x h widget, projection maps the unit square onto it
+{
+  glViewport(0, 0, w, h);
+  
+  glMatrixMode(GL_MODELVIEW);
+  glLoadIdentity();
+  
+  glMatrixMode(GL_PROJECTION);
+  glLoadIdentity();
+  glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); // orthographic projection of [0;1] x [0;1]
+}
+
+void GLReconWidget::mapPBO() // forwards to cuGLReconWidget::mapPBO()
+{
+  cudaWidget->mapPBO();
+}
+
+void GLReconWidget::unmapPBO() // forwards to cuGLReconWidget::unmapPBO()
+{
+  cudaWidget->unmapPBO();
+}
+
+float* GLReconWidget::getDevPtr() // forwards to cuGLReconWidget::getDevPtr()
+{
+  return cudaWidget->getDevPtr();
+}
+
+// ARB fragment program for displaying the floating-point texture: samples 2D texture unit 0 at the fragment's texture coordinate and writes it as the output color
+static const char *shader_code = 
+  "!!ARBfp1.0\n"
+  "TEX result.color, fragment.texcoord, texture[0], 2D; \n"
+  "END";
+
+cuGLReconWidget::cuGLReconWidget( unsigned int width, unsigned int height )
+  : width(width), height(height),
+    pbo(0), texid(0), shader(0),
+    imageDevPtr(0x0)
+{
+  // Only the image size is recorded here; all GL/CUDA handles start out as zero
+}
+
+GLuint cuGLReconWidget::compileASMShader(GLenum program_type, const char *code) // loads 'code' as an ARB program of 'program_type'; returns its id, or 0 if an error position was reported
+{
+  GLuint program_id;
+  glGenProgramsARB(1, &program_id);
+  glBindProgramARB(program_type, program_id);
+  glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei) strlen(code), (GLubyte *) code);
+
+  GLint error_pos;
+  glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos);
+  if (error_pos != -1) { // anything but -1 is treated as a failure to load the program string
+    const GLubyte *error_string;
+    error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB);
+    fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string);
+    return 0; // NOTE(review): program_id is not deleted on this path
+  }
+  return program_id;
+}
+
+void cuGLReconWidget::initializePBO()
+{
+  while( glGetError() != GL_NO_ERROR ){
+    printf("\nWARNING: glError detected prior to initialisePBO");
+    fflush(stdout);
+  }
+
+  // This runs again on every matrix size change: release what a previous call created
+  if( pbo ){ CUDA_SAFE_CALL(cudaGLUnregisterBufferObject(pbo)); glDeleteBuffersARB(1, &pbo); pbo = 0; }
+  if( texid ){ glDeleteTextures(1, &texid); texid = 0; }
+  if( shader ){ glDeleteProgramsARB(1, &shader); shader = 0; }
+
+  // Create pixel buffer object (PBO) to "render Cuda memory" through a texture
+  glGenBuffersARB(1, &pbo);
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+  // Initialize PBO with zero image (width*height floats)
+  float *tmp = (float*) calloc( width*height, sizeof(float) );
+  glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(float), tmp, GL_STREAM_DRAW_ARB);
+  free(tmp); // glBufferDataARB has copied the data
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+  CUDA_SAFE_CALL(cudaGLRegisterBufferObject(pbo));
+
+  // Create texture for display (single-channel float, same size as the PBO)
+  glGenTextures(1, &texid);
+  glBindTexture(GL_TEXTURE_2D, texid);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE32F_ARB, width, height, 0, GL_LUMINANCE, GL_FLOAT, NULL);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
+  glBindTexture(GL_TEXTURE_2D, 0);
+
+  // Load shader program (0 if it could not be loaded)
+  shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code);
+  while( glGetError() != GL_NO_ERROR ){ // drain and report GL errors raised by the calls above
+    printf("\nWARNING: glError detected prior to initialiseOpenGL");
+    fflush(stdout);
+  }
+}
+
+void cuGLReconWidget::mapPBO() // map the PBO into CUDA address space; the resulting pointer is stored in imageDevPtr
+{
+  if( width==0 || height == 0 ){
+    printf("\nWARNING: pbo buffer size is 0! Has initialiseOpenGL() been called?\n");
+  }
+
+  imageDevPtr = 0x0;
+  
+  // Map the PBO used for rendering to Cuda device memory
+  CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&imageDevPtr, pbo));
+  
+  if( !imageDevPtr ){
+    printf("\nWARNING: no pbo allocated for reconstruction result!\n");
+  }
+  
+  // Error check: any pending Cuda error terminates the application
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    printf("\nCuda error detected: %s\n", cudaGetErrorString(err) ); fflush(stdout);
+    exit(1);
+  }
+}
+
+void cuGLReconWidget::unmapPBO() // counterpart of mapPBO(); note that imageDevPtr is not reset here
+{
+  // Unmap Cuda <-> PBO relation
+  CUDA_SAFE_CALL(cudaGLUnmapBufferObject(pbo));
+
+  // Error check: any pending Cuda error terminates the application
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    printf("\nCuda error detected: %s\n", cudaGetErrorString(err) ); fflush(stdout);
+    exit(1);
+  }
+}
+
+float* cuGLReconWidget::getDevPtr() // pointer obtained by the last mapPBO() call (0x0 before any mapping)
+{
+  return imageDevPtr;     
+}
+
+void cuGLReconWidget::display() // draw the PBO contents as a textured unit quad using the fragment program
+{
+  // Clear window
+  glClear(GL_COLOR_BUFFER_BIT);
+
+  // Load texture from PBO (with a PBO bound, the final 0 argument is an offset into it rather than a client pointer)
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+  glBindTexture(GL_TEXTURE_2D, texid);
+  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_LUMINANCE, GL_FLOAT, 0);
+  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+
+  // Use simple fragment program to display the floating point texture
+  glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader);
+  glEnable(GL_FRAGMENT_PROGRAM_ARB);
+  glDisable(GL_DEPTH_TEST);
+
+  // Render quad
+  glBegin(GL_QUADS); // NOTE(review): each glTexCoord2f below follows its glVertex2f, so it applies to the NEXT vertex - confirm the resulting orientation is intended
+  {
+    glVertex2f(0, 1); glTexCoord2f(0, 1);
+    glVertex2f(0, 0); glTexCoord2f(0, 0);
+    glVertex2f(1, 0); glTexCoord2f(1, 0);
+    glVertex2f(1, 1); glTexCoord2f(1, 1);
+  }
+  glEnd();
+
+  // Restore original state (texture unbound, fragment program disabled)
+  glBindTexture(GL_TEXTURE_2D, 0);
+  glDisable(GL_FRAGMENT_PROGRAM_ARB);
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.h
new file mode 100644
index 0000000..4c579d3
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/GLReconWidget.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "cuNDArray.h"
+
+#if defined (WIN32)
+#include <Windows.h>
+#endif
+
+#ifdef __MACH__
+#import <OpenGL/gl.h>
+#else
+#include <GL/gl.h>
+#endif //__MACH__
+
+#include <QtOpenGL/QGLWidget>
+
+class cuGLReconWidget // Bundles the OpenGL handles (PBO, texture, shader) and the Cuda device pointer used to display a float image
+{
+public:
+  cuGLReconWidget( unsigned int width, unsigned int height );
+  
+  void initializePBO();
+  void mapPBO();       // maps 'pbo' for Cuda access and stores the resulting pointer in imageDevPtr
+  void unmapPBO();     // releases the Cuda mapping of 'pbo'
+  float* getDevPtr();  // returns imageDevPtr
+  void display();      // draws the PBO contents as a textured quad
+  
+  GLuint compileASMShader(GLenum program_type, const char *code);
+  
+  unsigned int width;    // texture width passed to glTexSubImage2D in display()
+  unsigned int height;   // texture height passed to glTexSubImage2D in display()
+  GLuint pbo;            // OpenGL pixel buffer object (map between Cuda and OpenGL)
+  GLuint texid;          // Texture (display pbo)
+  GLuint shader;         // Pixel shader for rendering of texture
+  float *imageDevPtr;    // This is the "exchange buffer" between Cuda and OpenGL
+};
+
+class GLReconWidget : public QGLWidget // Qt OpenGL widget holding a pointer to a cuGLReconWidget
+{
+  Q_OBJECT
+  
+  public:
+  GLReconWidget(QWidget* parent = 0);
+  void setMatrixSize( unsigned int width, unsigned int height );
+  void mapPBO();       // NOTE(review): same names as the cuGLReconWidget methods; presumably forwarded to cudaWidget -- implementation not visible here
+  void unmapPBO();
+  float* getDevPtr();
+
+protected:
+  void initializeGL(); // QGLWidget rendering hooks
+  void paintGL();
+  void resizeGL(int w, int h);
+    
+private:
+  cuGLReconWidget *cudaWidget; // raw pointer; no destructor is declared in this class
+};
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/UIconstants.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/UIconstants.h
new file mode 100644
index 0000000..e9a0c9f
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/UIconstants.h
@@ -0,0 +1,10 @@
+#ifndef _UI_CONSTANTS
+#define _UI_CONSTANTS
+
+const unsigned int MATRIX_SIZE_INITIAL_VALUE = 192;     // startup value of matrixSizeSpinBox
+const unsigned int MATRIX_SIZE_OS_INITIAL_VALUE = 256;  // startup value of oversampledMatrixSizeSpinBox
+const unsigned int NUM_ITERATIONS_INITIAL_VALUE = 15;   // startup value of numIterationsSpinBox
+const double REG_WEIGHT_INITIAL_VALUE = 0.01;           // startup value of regularizationWeightSpinBox
+const double KERNEL_SIZE_INITIAL_VALUE = 5.5;           // startup value of kernelSizeSpinBox
+const unsigned int NUM_FRAMES_PER_CSM_RECON = 8;        // frames gridded per pass in radialSenseAppMainWindow::replan()
+#endif
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/main.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/main.cpp
new file mode 100644
index 0000000..1cf22eb
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/main.cpp
@@ -0,0 +1,19 @@
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+
+#include "radialSenseAppMainWidget.h"
+
+#include <stdlib.h>
+
+#include <QtGui/QApplication>
+
+// Entry point: creates the Qt application and the main window, then runs the event loop.
+int main( int argc, char** argv )
+{
+  QApplication qtApp(argc, argv);
+
+  radialSenseAppMainWindow mainWindow;
+  mainWindow.show();
+
+  // The result of the event loop becomes the process exit code
+  const int exitCode = qtApp.exec();
+  return exitCode;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppBaseMainWidget.ui b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppBaseMainWidget.ui
new file mode 100644
index 0000000..49a69e3
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppBaseMainWidget.ui
@@ -0,0 +1,572 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>radialSenseAppBaseMainWindow</class>
+ <widget class="QMainWindow" name="radialSenseAppBaseMainWindow">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>476</width>
+    <height>539</height>
+   </rect>
+  </property>
+  <property name="sizePolicy">
+   <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+    <horstretch>0</horstretch>
+    <verstretch>0</verstretch>
+   </sizepolicy>
+  </property>
+  <property name="windowTitle">
+   <string>Radial Sense GPU Reconstructor</string>
+  </property>
+  <property name="locale">
+   <locale language="English" country="UnitedKingdom"/>
+  </property>
+  <widget class="QWidget" name="centralwidget">
+   <widget class="ReconWidget" name="reconWidget" native="true">
+    <property name="geometry">
+     <rect>
+      <x>20</x>
+      <y>30</y>
+      <width>272</width>
+      <height>414</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="minimumSize">
+     <size>
+      <width>272</width>
+      <height>414</height>
+     </size>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label">
+    <property name="geometry">
+     <rect>
+      <x>340</x>
+      <y>10</y>
+      <width>81</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <weight>75</weight>
+      <bold>true</bold>
+      <underline>true</underline>
+     </font>
+    </property>
+    <property name="text">
+     <string>Matrix sizes</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_3">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>30</y>
+      <width>51</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Target</string>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="matrixSizeSpinBox">
+    <property name="enabled">
+     <bool>false</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>50</y>
+      <width>71</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="minimum">
+     <number>16</number>
+    </property>
+    <property name="maximum">
+     <number>512</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="value">
+     <number>16</number>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="oversampledMatrixSizeSpinBox">
+    <property name="enabled">
+     <bool>false</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>390</x>
+      <y>50</y>
+      <width>61</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="minimum">
+     <number>16</number>
+    </property>
+    <property name="maximum">
+     <number>512</number>
+    </property>
+    <property name="singleStep">
+     <number>0</number>
+    </property>
+    <property name="value">
+     <number>16</number>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_6">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>210</y>
+      <width>141</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Regularization weight</string>
+    </property>
+   </widget>
+   <widget class="QDoubleSpinBox" name="regularizationWeightSpinBox">
+    <property name="enabled">
+     <bool>true</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>230</y>
+      <width>71</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="decimals">
+     <number>3</number>
+    </property>
+    <property name="maximum">
+     <double>50.000000000000000</double>
+    </property>
+    <property name="singleStep">
+     <double>0.010000000000000</double>
+    </property>
+    <property name="value">
+     <double>0.000000000000000</double>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_4">
+    <property name="geometry">
+     <rect>
+      <x>380</x>
+      <y>30</y>
+      <width>81</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Oversampled</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_10">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>80</y>
+      <width>81</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Kernel width</string>
+    </property>
+   </widget>
+   <widget class="QDoubleSpinBox" name="kernelSizeSpinBox">
+    <property name="enabled">
+     <bool>true</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>100</y>
+      <width>51</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="decimals">
+     <number>2</number>
+    </property>
+    <property name="minimum">
+     <double>1.000000000000000</double>
+    </property>
+    <property name="maximum">
+     <double>15.000000000000000</double>
+    </property>
+    <property name="singleStep">
+     <double>0.500000000000000</double>
+    </property>
+    <property name="value">
+     <double>1.000000000000000</double>
+    </property>
+   </widget>
+   <widget class="Line" name="line_2">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>140</y>
+      <width>118</width>
+      <height>3</height>
+     </rect>
+    </property>
+    <property name="orientation">
+     <enum>Qt::Horizontal</enum>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_11">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>150</y>
+      <width>131</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <pointsize>12</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>Number of iterations</string>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="numIterationsSpinBox">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>170</y>
+      <width>51</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="minimum">
+     <number>1</number>
+    </property>
+    <property name="maximum">
+     <number>99</number>
+    </property>
+    <property name="singleStep">
+     <number>1</number>
+    </property>
+    <property name="value">
+     <number>1</number>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_12">
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>270</y>
+      <width>131</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="text">
+     <string>Window scale</string>
+    </property>
+   </widget>
+   <widget class="QDoubleSpinBox" name="windowScaleSpinBox">
+    <property name="enabled">
+     <bool>true</bool>
+    </property>
+    <property name="geometry">
+     <rect>
+      <x>310</x>
+      <y>290</y>
+      <width>62</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="decimals">
+     <number>1</number>
+    </property>
+    <property name="minimum">
+     <double>1.000000000000000</double>
+    </property>
+    <property name="maximum">
+     <double>10.000000000000000</double>
+    </property>
+    <property name="singleStep">
+     <double>0.250000000000000</double>
+    </property>
+    <property name="value">
+     <double>2.000000000000000</double>
+    </property>
+   </widget>
+  </widget>
+  <widget class="QMenuBar" name="menubar">
+   <property name="geometry">
+    <rect>
+     <x>0</x>
+     <y>0</y>
+     <width>476</width>
+     <height>22</height>
+    </rect>
+   </property>
+   <widget class="QMenu" name="menuFile">
+    <property name="title">
+     <string>File</string>
+    </property>
+    <addaction name="actionOpen_cplx_file"/>
+    <addaction name="separator"/>
+    <addaction name="actionSave_image"/>
+    <addaction name="separator"/>
+    <addaction name="actionClose"/>
+    <addaction name="separator"/>
+    <addaction name="actionExit"/>
+   </widget>
+   <widget class="QMenu" name="menuHelp">
+    <property name="title">
+     <string>Help</string>
+    </property>
+    <addaction name="separator"/>
+   </widget>
+   <addaction name="menuFile"/>
+   <addaction name="menuHelp"/>
+  </widget>
+  <widget class="QStatusBar" name="statusbar"/>
+  <widget class="QToolBar" name="toolBar">
+   <property name="windowTitle">
+    <string>toolBar</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_2">
+   <property name="windowTitle">
+    <string>toolBar_2</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_3">
+   <property name="windowTitle">
+    <string>toolBar_3</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_4">
+   <property name="windowTitle">
+    <string>toolBar_4</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QToolBar" name="toolBar_5">
+   <property name="windowTitle">
+    <string>toolBar_5</string>
+   </property>
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <action name="actionOpen_cplx_file">
+   <property name="text">
+    <string>Open .cplx file</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+O</string>
+   </property>
+  </action>
+  <action name="actionExit">
+   <property name="text">
+    <string>Quit</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+Q</string>
+   </property>
+  </action>
+  <action name="actionClose">
+   <property name="text">
+    <string>Close</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+W</string>
+   </property>
+  </action>
+  <action name="actionSave_image">
+   <property name="text">
+    <string>Save image</string>
+   </property>
+   <property name="shortcut">
+    <string>Ctrl+S</string>
+   </property>
+  </action>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>ReconWidget</class>
+   <extends>QWidget</extends>
+   <header>reconWidget.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>kernelSizeSpinBox</sender>
+   <signal>editingFinished()</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>kernelWidthChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>330</x>
+     <y>260</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>442</x>
+     <y>277</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>matrixSizeSpinBox</sender>
+   <signal>editingFinished()</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>matrixSizeChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>321</x>
+     <y>211</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>441</x>
+     <y>249</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>oversampledMatrixSizeSpinBox</sender>
+   <signal>editingFinished()</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>matrixSizeOSChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>379</x>
+     <y>212</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>448</x>
+     <y>224</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>numIterationsSpinBox</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>numIterationsChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>320</x>
+     <y>382</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>441</x>
+     <y>402</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>regularizationWeightSpinBox</sender>
+   <signal>valueChanged(double)</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>regularizationWeightChanged()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>332</x>
+     <y>433</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>446</x>
+     <y>453</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>windowScaleSpinBox</sender>
+   <signal>valueChanged(double)</signal>
+   <receiver>radialSenseAppBaseMainWindow</receiver>
+   <slot>windowScaleChanged(double)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>339</x>
+     <y>484</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>444</x>
+     <y>501</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+ <slots>
+  <slot>matrixSizeChanged()</slot>
+  <slot>matrixSizeOSChanged()</slot>
+  <slot>regularizationWeightChanged()</slot>
+  <slot>numIterationsChanged()</slot>
+  <slot>kernelWidthChanged()</slot>
+  <slot>windowScaleChanged(double)</slot>
+ </slots>
+</ui>
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.cpp
new file mode 100644
index 0000000..205214f
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.cpp
@@ -0,0 +1,690 @@
+#include "radialSenseAppMainWidget.h"
+
+#include "hoNDArray_fileio.h"
+#include "cuNFFT.h"
+#include "NFFT_utils.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "vector_td_utilities.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgSolver.h"
+#include "b1_map.h"
+
+#include "UIconstants.h"
+#include "GLReconWidget.h"
+
+#include <QtGui/QFileDialog>
+#include <QtGui/QProgressDialog>
+#include <QtGui/QMessageBox>
+#include <QtCore/QSignalMapper>
+
+#include <assert.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Deletes the status label (if one exists) and marks the window as not ready.
+void radialSenseAppMainWindow::resetPrivateData()
+{
+  // delete on a null pointer is a no-op, so no explicit test is required
+  delete statusLabel;
+  statusLabel = 0x0;
+
+  ready = false;
+}
+
+radialSenseAppMainWindow::radialSenseAppMainWindow(QWidget *parent) : QMainWindow(parent) // Builds the UI, wires menu actions and slider signals, allocates the operators and configures the CG solver
+{
+  statusLabel = 0x0; // must be null before resetPrivateData() below deletes it
+
+  setupUi(this);
+  retranslateUi(this);
+
+  resetPrivateData(); // also sets 'ready' to false
+
+  matrixSizeSpinBox->setValue(MATRIX_SIZE_INITIAL_VALUE);
+  oversampledMatrixSizeSpinBox->setValue(MATRIX_SIZE_OS_INITIAL_VALUE);
+  numIterationsSpinBox->setValue(NUM_ITERATIONS_INITIAL_VALUE); // valueChanged is wired to numIterationsChanged(), whose static lastValue starts at this same constant, so it returns early
+  regularizationWeightSpinBox->setValue(REG_WEIGHT_INITIAL_VALUE); // likewise for regularizationWeightChanged()
+  kernelSizeSpinBox->setValue(KERNEL_SIZE_INITIAL_VALUE);
+
+  // Menu actions
+  connect(actionOpen_cplx_file, SIGNAL(triggered()), this, SLOT(open()));
+  connect(actionSave_image, SIGNAL(triggered()), this, SLOT(saveImage()));
+  connect(actionClose, SIGNAL(triggered()), this, SLOT(close()));
+  connect(actionExit, SIGNAL(triggered()), qApp, SLOT(quit()));
+
+  // Originally, the thought was to put multiple ReconWidgets in the app. 
+  // This is why the SignalMapper is used rather than the basic signals below.
+
+  // Connect to the reconWidgets' frameChanged slots
+  QSignalMapper *signalMapper1 = new QSignalMapper(this);
+  connect(reconWidget->projectionSelectionScrollBar, SIGNAL(valueChanged(int)), signalMapper1, SLOT(map()));
+  signalMapper1->setMapping(reconWidget->projectionSelectionScrollBar, 1 ); // the slot receives this id (1), not the slider value
+  connect(signalMapper1, SIGNAL(mapped(int)), this, SLOT(centralProjectionChanged(int)));
+
+  // Connect to the reconWidgets' projectionsPerFrameChanged slots
+  QSignalMapper *signalMapper2 = new QSignalMapper(this);
+  connect(reconWidget->numProjectionsScrollBar, SIGNAL(valueChanged(int)), signalMapper2, SLOT(map()));
+  signalMapper2->setMapping(reconWidget->numProjectionsScrollBar, 1 ); // the slot receives this id (1), not the slider value
+  connect(signalMapper2, SIGNAL(mapped(int)), this, SLOT(projectionsPerFrameChanged(int)));
+
+  // Allocate encoding operator for non-Cartesian Sense
+  E = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );  
+
+  // Allocate preconditioner
+  D = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+  // Allocate regularization image operator
+  R = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+  R->set_weight( 1.0f ); // replaced by get_kappa() in regularizationWeightChanged()
+
+  // Setup solver
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
+  cg.set_preconditioner ( D );          // preconditioning matrix
+  cg.set_max_iterations( get_num_iterations() );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver<float_complext>::OUTPUT_SILENT );
+}
+
+/*
+  Slots
+*/
+
+// Slot for the "Open .cplx file" menu action.
+// Asks for a file, resets the per-file state, loads the samples into
+// host_samples, enables the matrix size controls, picks startup slider
+// values and runs replan(). Returns without changes if the dialog is
+// cancelled; reports an error and resets the state if no samples were loaded.
+void radialSenseAppMainWindow::open()
+{
+  // Open dialog box
+  QString filename = QFileDialog::getOpenFileName( this, tr("Open File"), "./", tr("Raw data (*.cplx)"));
+
+  if( filename.size() == 0 )
+    return; // Cancel
+
+  // Close current file
+  close();
+
+  // Update status bar
+  statusLabel = new QLabel(filename);
+  statusBar()->addWidget(statusLabel);
+
+  // Read samples from disk
+  host_samples = read_nd_array<float_complext>(filename.toLatin1().constData());
+
+  // host_samples is dereferenced right below and in replan(); bail out if the reader handed back an empty pointer
+  if( host_samples.get() == 0x0 ){
+    QMessageBox::critical( this, tr("File error"), tr("Unable to read samples from the selected file.") );
+    resetPrivateData(); // removes the status label again
+    return;
+  }
+
+  cout << endl << "loaded dataset with " << host_samples->get_number_of_elements() << " samples." << endl;
+
+  // This is to prevent the user changing the matrix sizes before any data is initially loaded
+  matrixSizeSpinBox->setEnabled(true); 
+  oversampledMatrixSizeSpinBox->setEnabled(true);
+  
+  // Choose startup frame
+  reconWidget->projectionSelectionScrollBar->setValue(get_matrix_size().vec[0]>>2);
+  reconWidget->numProjectionsScrollBar->setValue(34);
+
+  replan();
+
+}
+
+void radialSenseAppMainWindow::saveImage()
+{ /* NOT IMPLEMENTED: this slot currently does nothing. The disabled outline below is incomplete pseudo-code kept for reference only.
+  // Open dialog box
+  QString filename = QFileDialog::getSaveFileName( this, tr("Save image to file"), "./", tr("Raw float data (*.raw)"));
+
+  if( filename.size() == 0 )
+  return; // Cancel
+
+  // This code is copied from 'reconstruct' and slightly modified...
+
+  <cut..>
+
+  LOOP:
+
+  // Save file
+  cudaMemcpy( tmp, devPtr, prod(get_matrix_size())*sizeof(float), cudaMemcpyDeviceToHost );
+  fwrite( tmp, prod(get_matrix_size()), sizeof(float), fout );
+
+  // Report any errors not already caught...
+  err = cudaGetLastError();
+  if( err != cudaSuccess ){
+  QMessageBox::critical( this, tr("Cuda error"), tr(cudaGetErrorString(err)) );
+  actionExit->trigger();
+  }
+	
+  END LOOP:
+
+  reconWidget->projectionNumberSpinBox->setValue(reconWidget->projectionNumberSpinBox->value()+20);
+  }
+
+  fclose(fout);
+  cudaFree(devPtr);
+  */
+}
+
+// Slot for the "Close" menu action (also called from open()).
+// Only removes the status label and clears the ready flag; the window itself stays open.
+void radialSenseAppMainWindow::close()
+{
+  this->resetPrivateData();
+}
+
+/*
+  Recalibrate from the samples in host_samples:
+   - update the slider/spin box ranges,
+   - recompute the density compensation weights (dcw),
+   - set up the convolution plan and grid all data to estimate the coil
+     sensitivity maps (csm),
+   - configure the encoding operator E, the regularization operator R and
+     the preconditioner D,
+   - mark the window as ready and reconstruct.
+  Expects host_samples to hold a loaded dataset.
+*/
+void radialSenseAppMainWindow::replan()
+{
+  // Six progress steps (0..5) are reported below, so the range has to end at 5.
+  // With a shorter range the dialog resets itself before the last step is reached.
+  QProgressDialog progress("Calibrating", "", 0, 5, this);
+  progress.setWindowModality(Qt::WindowModal);
+  progress.setValue(0);
+  progress.show();
+
+  // Set GUI elements before the plan is created to avoid triggering unnecessary reconstructions
+  unsigned int maxProjections = min(get_matrix_size().vec[0]<<2, (get_num_points_per_array_coil()/get_num_samples_per_projection())>>1);
+  reconWidget->numProjectionsScrollBar->setMaximum(maxProjections);
+  reconWidget->numProjectionsSpinBox->setMaximum(maxProjections);
+  unsigned int maxCentralProjection = get_maximum_central_projection();
+  reconWidget->projectionSelectionScrollBar->setMaximum(maxCentralProjection);
+  reconWidget->projectionNumberSpinBox->setMaximum(maxCentralProjection);
+  unsigned int minCentralProjection = get_num_projections_per_frame()>>1;
+  reconWidget->projectionSelectionScrollBar->setMinimum(minCentralProjection);
+  reconWidget->projectionNumberSpinBox->setMinimum(minCentralProjection);
+
+  progress.setValue(1);
+
+  // Pass matrix size to GLReconWidget::initializeGL
+  //	reconWidget->openglCanvas->setMatrixSize( get_matrix_size().vec[0], get_matrix_size().vec[1] );
+
+  progress.setValue(2);
+
+  const unsigned int samples_per_profile = get_num_samples_per_projection();
+  const unsigned int num_profiles = get_num_points_per_array_coil() / samples_per_profile;
+  const unsigned int profiles_per_frame = get_num_projections_per_frame();
+  const unsigned int frames_per_reconstruction = NUM_FRAMES_PER_CSM_RECON;
+  const unsigned int profiles_per_reconstruction = get_num_projections_per_frame()*frames_per_reconstruction;
+  const unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  // Density compensation weights are constant throughout all reconstructions
+  dcw  = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (float)get_matrix_size_os().vec[0]/(float)get_matrix_size().vec[0], 
+      float(1)/((float)samples_per_profile/(float)max(get_matrix_size().vec[0],get_matrix_size().vec[1])) );
+  
+  progress.setValue(3);
+
+  // Setup plan for convolution
+  plan.setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );
+  
+  // Temporary oversampled image buffer (matrix_os x frames x coils).
+  // Held by a shared_ptr so it is released on every exit path.
+  vector<unsigned int> image_os_dims = uint64d_to_vector<2>(get_matrix_size_os()); 
+  image_os_dims.push_back(frames_per_reconstruction); image_os_dims.push_back(get_num_coils());    
+  boost::shared_ptr< cuNDArray<float_complext> > image_os( new cuNDArray<float_complext>() );
+  image_os->create(&image_os_dims);
+
+  // Extract coil sensitivity maps and training data using all the data
+  // NOTE(review): if fewer than profiles_per_reconstruction profiles are available this loop
+  // never runs and image_os is summed below without having been written -- confirm
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_reconstruction; iteration++ ) {
+    
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<floatd2> > traj = compute_radial_trajectory_golden_ratio_2d<float>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, iteration*profiles_per_reconstruction );
+    
+    // Preprocess
+    plan.preprocess( traj.get(), cuNFFT_plan<float,2>::NFFT_PREP_NC2C );
+    traj.reset();
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<float_complext> > csm_data =
+      upload_data( iteration*profiles_per_reconstruction, samples_per_profile, samples_per_reconstruction,
+		   num_profiles*samples_per_profile, get_num_coils(), host_samples.get() );
+    
+    // Accumulate k-space for CSM estimation (the first pass overwrites, later passes add)
+    plan.convolve( csm_data.get(), image_os.get(), dcw.get(), cuNFFT_plan<float,2>::NFFT_CONV_NC2C, (iteration==0) ? false : true );
+    csm_data.reset();
+  }
+  
+  // We now have 'frames_per_reconstruction' k-space images of each coil. Add these up.
+  boost::shared_ptr< cuNDArray<float_complext> > acc_image_os = sum<float_complext>( image_os.get(), 2 );
+  image_os.reset();
+  
+  // Complete gridding of k-space CSM image
+  plan.fft( acc_image_os.get(), cuNFFT_plan<float,2>::NFFT_BACKWARDS );
+  plan.deapodize( acc_image_os.get() );
+  
+  // Remove oversampling
+  vector<unsigned int> image_dims = uint64d_to_vector<2>(get_matrix_size()); image_dims.push_back(get_num_coils());
+  boost::shared_ptr< cuNDArray<float_complext> > image( new cuNDArray<float_complext>() );
+  image->create(&image_dims);
+  crop<float_complext,2>( (get_matrix_size_os()-get_matrix_size())>>1, acc_image_os.get(), image.get() );
+  acc_image_os.reset();
+  
+  // Estimate CSM
+  csm = estimate_b1_map<float,2>( image.get() );
+
+  progress.setValue(4);
+
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() ); 
+  E->set_csm(csm);
+
+  // Setup regularization operator
+  image_dims = uint64d_to_vector<2>(get_matrix_size());
+  boost::shared_ptr< cuNDArray<float_complext> > reg_image( new cuNDArray<float_complext>() );
+  reg_image->create( &image_dims );
+
+  E->mult_csm_conj_sum( image.get(), reg_image.get() );
+  R->compute( reg_image.get() );
+
+  image.reset();
+  reg_image.reset();
+
+  // Define preconditioning weights
+  update_preconditioning_weights();
+    
+  progress.setValue(5);
+
+  ready = true;
+
+  // Trigger the #projections slot
+  reconWidget->numProjectionsScrollBar->setValue(reconWidget->numProjectionsScrollBar->value()+1);
+
+  // Perform reconstruction
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::update_preconditioning_weights() // Sets D's weights to 1/sqrt( sum|csm|^2 + kappa*R ), converted to complex; requires csm and R's image to exist (both are produced in replan())
+{
+  boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(),2); // sum of |csm|^2 over dimension 2 (presumably the coil dimension -- confirm)
+  boost::shared_ptr< cuNDArray<float> > R_diag = R->get();
+  *R_diag *= get_kappa(); // NOTE(review): scales the array returned by R->get() in place; if that is R's internal image rather than a copy, repeated calls compound the scaling -- confirm
+  *_precon_weights += *R_diag;
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+  D->set_weights( precon_weights );
+}
+
+void radialSenseAppMainWindow::projectionsPerFrameChanged(int) // Slot: forces an even projection count, updates the central-projection ranges and dcw, then reconstructs
+{
+  // the integer is an 'id' not the slider value!
+
+  unsigned int value = get_num_projections_per_frame();
+
+  // Enforce even values
+  if( value%2 ){
+    value--;
+    reconWidget->numProjectionsScrollBar->setValue(value); // the resulting valueChanged signal re-enters this slot with the even value
+    return;
+  }
+
+  if(!ready) return;
+
+  // Remove the Qt lag of the slider rendering
+  QApplication::processEvents();
+
+  // The range of the frames slider/spinbox has changed
+  unsigned int maxCentralProjection = get_maximum_central_projection();
+  reconWidget->projectionSelectionScrollBar->setMaximum(maxCentralProjection);
+  reconWidget->projectionNumberSpinBox->setMaximum(maxCentralProjection);
+  reconWidget->projectionSelectionScrollBar->setSingleStep(value>>2); // step by a quarter of the projections per frame
+  reconWidget->projectionNumberSpinBox->setSingleStep(value>>2);
+
+  const unsigned int samples_per_profile = get_num_samples_per_projection();
+  const unsigned int profiles_per_frame = get_num_projections_per_frame();
+  
+  // Density compensation weights depend on the number of profiles per frame, so recompute them
+  dcw  = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (float)get_matrix_size_os().vec[0]/(float)get_matrix_size().vec[0], 
+      float(1)/((float)samples_per_profile/(float)max(get_matrix_size().vec[0],get_matrix_size().vec[1])) );
+  
+  // Set density compensation weights
+  E->set_dcw(dcw);
+
+  // Reconstruct
+  reconstruct();
+}
+
+// Slot for the central projection slider: forces an even value, then reconstructs.
+// 'id' is the QSignalMapper id, not the slider position, and is not used.
+void radialSenseAppMainWindow::centralProjectionChanged(int id)
+{
+  const unsigned int current = get_central_projection();
+
+  if( current%2 ){
+    // Odd: step down to the even value below and leave
+    reconWidget->projectionSelectionScrollBar->setValue(current-1);
+    return;
+  }
+
+  if( ready ){
+    // Let Qt redraw the slider before the reconstruction starts
+    QApplication::processEvents();
+    reconstruct();
+  }
+}
+
+void radialSenseAppMainWindow::matrixSizeChanged() // Slot (editingFinished of matrixSizeSpinBox): resizes the GL canvas, keeps the oversampled size >= the matrix size, then recalibrates via replan()
+{
+  static unsigned int lastValue = MATRIX_SIZE_INITIAL_VALUE; // remembered across calls to ignore edits that did not change the value
+
+  unsigned int value = matrixSizeSpinBox->value();
+  unsigned int value_os = oversampledMatrixSizeSpinBox->value();
+
+  if( value == lastValue )
+    return;
+  else 
+    lastValue = value;
+
+  if(!ready) return;
+
+  // Pass matrix size to GLReconWidget
+  reconWidget->openglCanvas->setMatrixSize( value, value );
+	
+  if( value_os < value ){
+    oversampledMatrixSizeSpinBox->setValue(value);
+  }
+  
+  // and encoding matrix
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );  
+  
+  replan(); // replan() calls E->setup() with the same arguments again
+}
+
+void radialSenseAppMainWindow::matrixSizeOSChanged() // Slot (editingFinished of oversampledMatrixSizeSpinBox): validates the oversampled size, re-sets up E and reconstructs
+{
+  static unsigned int lastValue = MATRIX_SIZE_OS_INITIAL_VALUE; // remembered across calls to ignore edits that did not change the value
+
+  unsigned int value = matrixSizeSpinBox->value();
+  unsigned int value_os = oversampledMatrixSizeSpinBox->value();
+
+  if( value_os == lastValue )
+    return;
+  else 
+    lastValue = value_os;
+
+  if( value_os < value ){ // the oversampled size may not be smaller than the matrix size
+    oversampledMatrixSizeSpinBox->setValue(value);
+    return;
+  }
+	
+  if( value_os%2 ){ // odd sizes are rounded up to the next even value
+    value_os++;
+    oversampledMatrixSizeSpinBox->setValue(value_os);
+    return;
+  }
+
+  if(!ready) return;
+
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );  // NOTE(review): unlike matrixSizeChanged() there is no replan() here, so 'plan' and 'dcw' (both computed from the oversampled size in replan()) keep their previous values -- confirm intended
+
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::kernelWidthChanged()
+{
+  // Remember the last handled width so notifications without a change are ignored
+  static double previous_width = KERNEL_SIZE_INITIAL_VALUE;
+
+  const double width = kernelSizeSpinBox->value();
+  if( width == previous_width )
+    return;
+  previous_width = width;
+
+  if( !ready )
+    return;
+
+  // Reconfigure the encoding operator from the current GUI settings and reconstruct
+  E->setup( get_matrix_size(), get_matrix_size_os(), get_kernel_width() );
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::numIterationsChanged()
+{
+  // Remember the last handled count so notifications without a change are ignored
+  static unsigned int previous_count = NUM_ITERATIONS_INITIAL_VALUE;
+
+  const unsigned int count = numIterationsSpinBox->value();
+  if( count == previous_count )
+    return;
+  previous_count = count;
+
+  // The solver is updated even when not ready; only the reconstruction is skipped then
+  cg.set_max_iterations( get_num_iterations() );
+
+  if( !ready )
+    return;
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::regularizationWeightChanged()
+{
+  // Remember the last handled weight so notifications without a change are ignored
+  static double previous_weight = REG_WEIGHT_INITIAL_VALUE;
+
+  const double weight = regularizationWeightSpinBox->value();
+  if( weight == previous_weight )
+    return;
+  previous_weight = weight;
+
+  // D and R are updated even when not ready; only the reconstruction is skipped then
+
+  // Preconditioner D
+  update_preconditioning_weights();
+
+  // Regularization operator R
+  R->set_weight( get_kappa() );
+
+  if( !ready ) return;
+  reconstruct();
+}
+
+void radialSenseAppMainWindow::windowScaleChanged(double)
+{
+  // The argument is ignored; reconstruct() reads the scale through get_window_scale()
+  if( ready ) reconstruct();
+}
+
+/*
+  Reconstruct frame
+*/
+
+void radialSenseAppMainWindow::reconstruct()
+{
+  if(!ready) return; // nothing to do until the application has been set up
+  
+  // Check if any data has been loaded
+  if( host_samples->get_number_of_elements() == 0 )
+    return;
+  
+  // Report any CUDA error that is still pending before starting
+  cudaError_t err;
+  err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    QMessageBox::critical( this, tr("Cuda error"), tr(cudaGetErrorString(err)) );
+    actionExit->trigger(); // NOTE(review): there is no return here, the code below still runs
+  }
+
+  // Map result to OpenGL
+  reconWidget->openglCanvas->mapPBO();
+
+  // Be optimistic... NOTE(review): 'success' is never set to false in this function
+  bool success = true;
+
+  const unsigned int samples_per_profile = get_num_samples_per_projection();
+  const unsigned int num_profiles = get_num_points_per_array_coil() / samples_per_profile;
+  const unsigned int profiles_per_frame = get_num_projections_per_frame();
+  const unsigned int frames_per_reconstruction = 1; 
+  const unsigned int profiles_per_reconstruction = get_num_projections_per_frame()*frames_per_reconstruction;
+  const uint64d2 matrix_size = get_matrix_size();
+  const uint64d2 matrix_size_os = get_matrix_size_os(); // NOTE(review): not referenced again in this function
+  const unsigned int num_coils = get_num_coils();
+  const unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  // Determine the trajectory of the single frame, starting at get_first_projection()
+  boost::shared_ptr< cuNDArray<floatd2> > traj = compute_radial_trajectory_golden_ratio_2d<float>
+    ( samples_per_profile, profiles_per_frame, frames_per_reconstruction,  get_first_projection() );
+  
+  // Upload the samples of that frame (all coils) to the device
+  boost::shared_ptr< cuNDArray<float_complext> > data =
+    upload_data( get_first_projection(), samples_per_profile, samples_per_reconstruction,
+		 num_profiles*samples_per_profile, num_coils, host_samples.get() );
+    
+  // Set current trajectory and trigger NFFT preprocessing
+  E->preprocess(traj.get());
+  
+  // Form rhs (use result array to save memory)
+  vector<unsigned int> rhs_dims = uint64d_to_vector<2>(matrix_size); rhs_dims.push_back(frames_per_reconstruction);
+  cuNDArray<float_complext> rhs; rhs.create(&rhs_dims);
+  E->mult_MH( data.get(), &rhs );
+  
+  //
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<float_complext> > cgresult = cg.solve_from_rhs(&rhs);
+  
+  // Magnitude image for visualization, passed through normalize() with the GUI window scale
+  boost::shared_ptr< cuNDArray<float> > tmp_res = abs<float_complext>(cgresult.get());
+  normalize( tmp_res.get(), get_window_scale() );
+  
+  // Copy to OpenGL/pbo (device to device, prod(matrix_size) floats)
+  cudaMemcpy( reconWidget->openglCanvas->getDevPtr(),
+	      tmp_res->get_data_ptr(),
+	      prod(matrix_size)*sizeof(float), cudaMemcpyDeviceToDevice );
+  
+  // Report any errors not already caught...
+  err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    QMessageBox::critical( this, tr("Cuda error"), tr(cudaGetErrorString(err)) );
+    actionExit->trigger();
+  }
+  
+  reconWidget->openglCanvas->unmapPBO();
+  
+  if( !success ){
+    QMessageBox::critical( this, tr("Reconstruction error"), tr("Check console. Quitting.") );
+    actionExit->trigger();
+    exit(EXIT_FAILURE);
+  }
+    
+  reconWidget->openglCanvas->updateGL();
+}
+
+/*
+  "Gets..."
+*/
+
+uint64d2 radialSenseAppMainWindow::get_matrix_size()
+{
+  const int edge = matrixSizeSpinBox->value(); // one spin box value for both dimensions
+  return uint64d2( edge, edge );
+}
+
+uint64d2 radialSenseAppMainWindow::get_matrix_size_os()
+{
+  const int edge_os = oversampledMatrixSizeSpinBox->value(); // one spin box value for both dimensions
+  return uint64d2( edge_os, edge_os );
+}
+
+float radialSenseAppMainWindow::get_kernel_width()
+{
+  // The spin box holds a double; the value is narrowed to float
+  return static_cast<float>( kernelSizeSpinBox->value() );
+}
+
+float radialSenseAppMainWindow::get_window_scale()
+{
+  // The spin box holds a double; the value is narrowed to float
+  return static_cast<float>( windowScaleSpinBox->value() );
+}
+
+unsigned int radialSenseAppMainWindow::get_num_samples_per_projection()
+{
+  // Size of dimension 0 of the sample array, or 0 when the array has no dimensions
+  const bool has_dims = host_samples->get_number_of_dimensions() > 0;
+  return has_dims ? host_samples->get_size(0) : 0;
+}
+
+unsigned int radialSenseAppMainWindow::get_first_projection()
+{
+  // Value of projectionNumberSpinBox minus half the projections per frame, clamped at 0
+  int first = reconWidget->projectionNumberSpinBox->value();
+  first -= get_num_projections_per_frame()/2;
+  const int clamped = ( first<0 ) ? 0 : first;
+  return clamped;
+}
+
+unsigned int radialSenseAppMainWindow::get_central_projection()
+{
+  // Current value of the projection selection scroll bar
+  return reconWidget->projectionSelectionScrollBar->value();
+}
+
+unsigned int radialSenseAppMainWindow::get_maximum_central_projection()
+{
+  // Number of profiles minus half a frame (rounded up); 0 when no samples are loaded
+  const unsigned int samples = get_num_samples_per_projection();
+  const unsigned int half_frame = get_num_projections_per_frame()/2+get_num_projections_per_frame()%2;
+  const unsigned int num_profiles = samples ? get_num_points_per_array_coil()/samples : 0;
+  return ( num_profiles > half_frame ) ? num_profiles-half_frame : 0; // clamp: no unsigned wrap-around
+}
+
+unsigned int radialSenseAppMainWindow::get_num_projections_per_frame()
+{
+  // Current value of the projections-per-frame spin box
+  return reconWidget->numProjectionsSpinBox->value();
+}
+
+unsigned int radialSenseAppMainWindow::get_num_coils()
+{
+  // Size of dimension 2 of the sample array. Fewer than three dimensions
+  // yields 0; more than three is treated as fatal and ends the program.
+  const size_t num_dims = host_samples->get_number_of_dimensions();
+  if( num_dims < 3 )
+    return 0;
+
+  if( num_dims > 3 ){
+    printf("\nUnknown number of dimensions in dataset. Quitting.\n");
+    exit(1);
+  }
+
+  return host_samples->get_size(2);
+}
+
+unsigned int radialSenseAppMainWindow::get_num_points_per_reconstruction()
+{
+  // Samples per projection times projections per frame
+  return get_num_samples_per_projection()*get_num_projections_per_frame();
+}
+
+hoNDArray<complext<float> >* radialSenseAppMainWindow::get_sample_values_array()
+{
+  return host_samples.get(); // raw pointer to the array held by the host_samples shared_ptr
+}
+
+unsigned int radialSenseAppMainWindow::get_num_points_per_array_coil()
+{
+  // Product of the sizes of the first two dimensions of the sample array,
+  // or 0 when the array has fewer than two dimensions
+  if( host_samples->get_number_of_dimensions() >= 2 )
+    return host_samples->get_size(0)*host_samples->get_size(1);
+  return 0;
+}
+
+unsigned int radialSenseAppMainWindow::get_num_iterations()
+{
+  // Current value of the number-of-iterations spin box
+  return numIterationsSpinBox->value();
+}
+
+float radialSenseAppMainWindow::get_kappa()
+{
+  // Regularization weight from its spin box (double narrowed to float); out-of-line, so no 'inline'
+  return (float)regularizationWeightSpinBox->value();
+}
+
+// Upload samples for one reconstruction (per coil, starting 'profile_offset' profiles in) from host to device
+boost::shared_ptr< cuNDArray<float_complext> >
+radialSenseAppMainWindow::upload_data( unsigned int profile_offset, unsigned int samples_per_profile, unsigned int samples_per_reconstruction, 
+                                       unsigned int total_samples_per_coil, unsigned int num_coils,
+                                       hoNDArray<float_complext> *host_data )
+{
+  vector<unsigned int> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  // Owned by the smart pointer from the start, so nothing leaks should create() throw
+  boost::shared_ptr< cuNDArray<float_complext> > data( new cuNDArray<float_complext>() ); data->create( &dims );
+
+  const size_t first_sample = size_t(profile_offset)*samples_per_profile; // size_t: no 32-bit wrap-around
+  for( size_t i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction,
+                host_data->get_data_ptr()+i*total_samples_per_coil+first_sample,
+                samples_per_reconstruction*sizeof(float_complext), cudaMemcpyHostToDevice );
+  return data;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.h
new file mode 100644
index 0000000..64c7c18
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/radialSenseAppMainWidget.h
@@ -0,0 +1,134 @@
+#pragma once
+
+// Gadgetron includes
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray.h"
+#include "cuNDArray.h"
+#include "cuNFFT.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuImageOperator.h"
+#include "cuCgPreconditioner.h"
+#include "complext.h"
+
+#include <boost/smart_ptr.hpp>
+
+// Autogenerated header by uic
+#include "ui_radialSenseAppBaseMainWidget.h" 
+
+using namespace Gadgetron; // only because this header file is not distributed...
+
+class radialSenseAppMainWindow : public QMainWindow, public Ui::radialSenseAppBaseMainWindow
+{
+  // Macro for the Qt gui
+  Q_OBJECT
+
+public:
+
+  // Constructor
+  radialSenseAppMainWindow(QWidget *parent = 0);
+
+  // Reconstruct frame
+  void reconstruct();
+
+  // Accessors. They are defined in the .cpp file, so they must not be declared 'inline' here.
+  uint64d2 get_matrix_size(); // matrix size
+
+  // Get oversampled matrix size
+  uint64d2 get_matrix_size_os();
+
+  // Get number of coils
+  unsigned int get_num_coils();
+
+  // Get kernel width
+  float get_kernel_width();
+
+  // Get kappa (regularization weight)
+  float get_kappa();
+
+  // Get first projection
+  unsigned int get_first_projection();
+
+  // Get central projection
+  unsigned int get_central_projection();
+
+  // Get maximum central projection
+  unsigned int get_maximum_central_projection();
+
+  // Get number of projections per frame
+  unsigned int get_num_projections_per_frame();
+
+  // Number of samples per projection
+  unsigned int get_num_samples_per_projection();
+
+  // Number of points per reconstruction
+  unsigned int get_num_points_per_reconstruction();
+
+  // Get host side sample data array
+  hoNDArray<complext<float> >* get_sample_values_array();
+
+  // Get number of points per coil in data array
+  unsigned int get_num_points_per_array_coil();
+
+  // Get number of iterations
+  unsigned int get_num_iterations();
+
+  // Get window scale
+  float get_window_scale();
+  // Upload the samples for one reconstruction from host to device
+  boost::shared_ptr< cuNDArray<float_complext> >
+  upload_data( unsigned int profile_offset, unsigned int samples_per_profile, unsigned int samples_per_reconstruction, 
+               unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<float_complext> *host_data );
+
+private:
+  void resetPrivateData();
+  void replan();
+  void update_preconditioning_weights();
+
+private slots:
+  void open();
+  void close();
+  void saveImage();
+  void matrixSizeChanged();
+  void matrixSizeOSChanged();
+  void regularizationWeightChanged();
+  void projectionsPerFrameChanged(int);
+  void centralProjectionChanged(int);
+  void numIterationsChanged();
+  void kernelWidthChanged();
+  void windowScaleChanged(double);
+
+private:
+
+  // Reconstruction plan
+  cuNFFT_plan<float,2> plan;
+
+  // Define conjugate gradient solver
+  cuCgSolver<float_complext> cg;
+
+  // Define non-Cartesian Sense solver
+  boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E;
+
+  // Define preconditioner
+  boost::shared_ptr< cuCgPreconditioner<float_complext> > D;
+
+  // Define regularization image operator
+  boost::shared_ptr< cuImageOperator<float_complext> > R;
+
+  // CSM
+  boost::shared_ptr< cuNDArray<float_complext> > csm;
+
+  // Density compensation weights
+  boost::shared_ptr< cuNDArray<float> > dcw;
+
+  // Host data array
+  boost::shared_ptr< hoNDArray<float_complext> > host_samples;
+
+  // Label for the status bar
+  QLabel *statusLabel;
+
+  // Are we set up for reconstruction?
+  bool ready;
+};
+
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconBaseWidget.ui b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconBaseWidget.ui
new file mode 100644
index 0000000..ad03226
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconBaseWidget.ui
@@ -0,0 +1,303 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>ReconBaseWidget</class>
+ <widget class="QWidget" name="ReconBaseWidget">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>272</width>
+    <height>414</height>
+   </rect>
+  </property>
+  <property name="sizePolicy">
+   <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+    <horstretch>0</horstretch>
+    <verstretch>0</verstretch>
+   </sizepolicy>
+  </property>
+  <property name="minimumSize">
+   <size>
+    <width>272</width>
+    <height>414</height>
+   </size>
+  </property>
+  <property name="windowTitle">
+   <string>ReconBaseWidget</string>
+  </property>
+  <property name="locale">
+   <locale language="English" country="UnitedKingdom"/>
+  </property>
+  <widget class="QFrame" name="decorationFrame">
+   <property name="geometry">
+    <rect>
+     <x>0</x>
+     <y>0</y>
+     <width>272</width>
+     <height>381</height>
+    </rect>
+   </property>
+   <property name="frameShape">
+    <enum>QFrame::Box</enum>
+   </property>
+   <property name="frameShadow">
+    <enum>QFrame::Plain</enum>
+   </property>
+   <widget class="GLReconWidget" name="openglCanvas" native="true">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>8</y>
+      <width>256</width>
+      <height>256</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="MinimumExpanding">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="minimumSize">
+     <size>
+      <width>256</width>
+      <height>256</height>
+     </size>
+    </property>
+    <property name="locale">
+     <locale language="English" country="UnitedKingdom"/>
+    </property>
+   </widget>
+   <widget class="QScrollBar" name="projectionSelectionScrollBar">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>290</y>
+      <width>176</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="orientation">
+     <enum>Qt::Horizontal</enum>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_1">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>270</y>
+      <width>140</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="text">
+     <string>Central projection</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_2">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>320</y>
+      <width>160</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="text">
+     <string>Number of projections per frame</string>
+    </property>
+   </widget>
+   <widget class="QScrollBar" name="numProjectionsScrollBar">
+    <property name="geometry">
+     <rect>
+      <x>8</x>
+      <y>340</y>
+      <width>186</width>
+      <height>16</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="minimum">
+     <number>4</number>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="value">
+     <number>32</number>
+    </property>
+    <property name="orientation">
+     <enum>Qt::Horizontal</enum>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="projectionNumberSpinBox">
+    <property name="geometry">
+     <rect>
+      <x>191</x>
+      <y>287</y>
+      <width>71</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="alignment">
+     <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+   </widget>
+   <widget class="QSpinBox" name="numProjectionsSpinBox">
+    <property name="geometry">
+     <rect>
+      <x>201</x>
+      <y>337</y>
+      <width>61</width>
+      <height>22</height>
+     </rect>
+    </property>
+    <property name="sizePolicy">
+     <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+      <horstretch>0</horstretch>
+      <verstretch>0</verstretch>
+     </sizepolicy>
+    </property>
+    <property name="alignment">
+     <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+    </property>
+    <property name="minimum">
+     <number>4</number>
+    </property>
+    <property name="maximum">
+     <number>100</number>
+    </property>
+    <property name="singleStep">
+     <number>2</number>
+    </property>
+    <property name="value">
+     <number>32</number>
+    </property>
+   </widget>
+  </widget>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>GLReconWidget</class>
+   <extends>QWidget</extends>
+   <header>GLReconWidget.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>projectionNumberSpinBox</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>projectionSelectionScrollBar</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>235</x>
+     <y>298</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>162</x>
+     <y>298</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>numProjectionsSpinBox</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>numProjectionsScrollBar</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>231</x>
+     <y>351</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>180</x>
+     <y>353</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>projectionSelectionScrollBar</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>projectionNumberSpinBox</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>124</x>
+     <y>297</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>234</x>
+     <y>294</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>numProjectionsScrollBar</sender>
+   <signal>valueChanged(int)</signal>
+   <receiver>numProjectionsSpinBox</receiver>
+   <slot>setValue(int)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>102</x>
+     <y>340</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>253</x>
+     <y>348</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+ <slots>
+  <slot>animationSpeedChanged(int)</slot>
+  <slot>projectionsPerFrameChanged(int)</slot>
+  <slot>frameChanged(int)</slot>
+ </slots>
+</ui>
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.cpp
new file mode 100644
index 0000000..e6ea6cc
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.cpp
@@ -0,0 +1,7 @@
+#include "reconWidget.h"
+
+ReconWidget::ReconWidget(QWidget *parent) : QWidget(parent)
+{
+  this->setupUi(this);       // set up the form from the uic-generated base class on this widget
+  this->retranslateUi(this); // then apply the form's translatable texts
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.h b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.h
new file mode 100644
index 0000000..585c954
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_gui/reconWidget.h
@@ -0,0 +1,13 @@
+#pragma once
+
+// Autogenerated header by uic
+#include "ui_reconBaseWidget.h" 
+
+class ReconWidget : public QWidget, public Ui::ReconBaseWidget
+{
+  Q_OBJECT // Macro for the Qt gui
+  
+  public:
+  // Constructor: calls setupUi() and retranslateUi() on this widget (see reconWidget.cpp)
+  ReconWidget(QWidget* parent = 0);
+};
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/CMakeLists.txt
new file mode 100644
index 0000000..e780877
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(radial_ktsense  main.cpp)
+# Libraries the radial_ktsense executable is linked against, plus the CUDA libraries
+target_link_libraries(radial_ktsense gpucore gpuparallelmri gpuoperators gpunfft hostutils gpusolvers ${CUDA_LIBRARIES})
+# Install the executable into the 'bin' directory
+install(TARGETS radial_ktsense DESTINATION bin)
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/main.cpp b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/main.cpp
new file mode 100644
index 0000000..faeed04
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/2d_golden_ratio_kt/main.cpp
@@ -0,0 +1,314 @@
+// Gadgetron includes
+#include "hoNDArray_fileio.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "vector_td_utilities.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianKtSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuImageOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuCgSolver.h"
+#include "cuNDFFT.h"
+#include "b1_map.h"
+#include "parameterparser.h"
+#include "GPUTimer.h"
+
+// Std includes
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+using namespace Gadgetron;
+
+// Define desired precision
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+// Upload the samples of reconstruction number 'reconstruction' (all coils) from host to device
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils,
+             hoNDArray<_complext> *host_data )
+{
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  boost::shared_ptr< cuNDArray<_complext> > data( new cuNDArray<_complext>() ); data->create( &dims ); // owned early: no leak should create() throw
+  const size_t first_sample = size_t(reconstruction)*samples_per_reconstruction; // size_t: no 32-bit wrap-around
+  for( size_t i=0; i<num_coils; i++ )
+    cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction,
+                host_data->get_data_ptr()+i*total_samples_per_coil+first_sample,
+                samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice );
+  return data;
+}
+
+int main(int argc, char** argv)
+{
+  //
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "Sample data file name", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "result.cplx" );
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true );
+  parms.add_parameter( 'f', COMMAND_LINE_INT,    1, "Frames per reconstruction", true, "32" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "25" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Kappa", true, "0.25" );
+
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running reconstruction with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer;
+  
+  // Load sample data from disk
+  timer = new GPUTimer("\nLoading data");
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+  delete timer;
+   
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Configuration from the host data
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+  
+  // Configuration from the command line
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real kappa = parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = parms.get_parameter('f')->get_int_value();
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  cout << endl << "#samples/profile: " << samples_per_profile;
+  cout << endl << "#profiles/frame: " << profiles_per_frame;
+  cout << endl << "#profiles: " << num_profiles;
+  cout << endl << "#coils: " << num_coils;
+  cout << endl << "#frames/reconstruction " << frames_per_reconstruction;
+  cout << endl << "#profiles/reconstruction " << profiles_per_reconstruction;
+  cout << endl << "#samples/reconstruction " << samples_per_reconstruction << endl << endl;
+
+  // Density compensation weights are constant throughout all reconstructions
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os[0]/(_real)matrix_size[0], 
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size[0],matrix_size[1])) );
+  
+  // Define encoding matrix for non-Cartesian kt-SENSE
+  boost::shared_ptr< cuNonCartesianKtSenseOperator<_real,2> > E( new cuNonCartesianKtSenseOperator<_real,2>() );
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+
+  // Notify encoding operator of dcw
+  E->set_dcw(dcw);
+
+  // Use a rhs buffer to estimate the csm -- from all the data
+  //
+
+  unsigned int profiles_per_subcycle = matrix_size_os[0]<<1; // causes no alising
+  unsigned int num_subcycles = profiles_per_subcycle / profiles_per_frame;
+  unsigned int num_cycles = num_profiles / profiles_per_subcycle;
+
+  std::cout << "Buffer cycles/sybcycles: " << num_cycles << " / " << num_subcycles << std::endl;
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+
+  // The first acquired profiles are often undesired. Skip the first two cycles...
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, num_cycles-1, num_subcycles );
+  rhs_buffer->set_dcw(dcw);
+   
+  // Fill rhs buffer
+  //
+ 
+  timer = new GPUTimer("CSM estimation");
+    
+  // Go through all the data...
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_reconstruction );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+        
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+  
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+
+  E->set_csm(csm);
+
+  acc_images.reset();
+  rhs_buffer.reset();
+ 
+  delete timer;
+
+  // 
+  // Setup radial kt-SENSE reconstructions
+  //
+    
+  // Define regularization image operator
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() );
+  R->set_weight( kappa );
+
+  // Define preconditioning operator
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  boost::shared_ptr< cuNDArray<_real> > ___precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > __precon_weights = expand<_real>( ___precon_weights.get(), frames_per_reconstruction );
+  ___precon_weights.reset();
+
+  // Setup conjugate gradient solver
+  cuCgSolver< _complext> cg;
+  cg.set_encoding_operator( E );        // encoding matrix
+  cg.add_regularization_operator( R );  // regularization matrix
+  cg.set_preconditioner ( D );          // preconditioning matrix
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-6 );
+  cg.set_output_mode( cuCgSolver< _complext>::OUTPUT_VERBOSE );
+      
+  // Reconstruct all SENSE frames iteratively
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+
+  // Allocate space for result
+  vector<size_t> image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back(frames_per_reconstruction*num_reconstructions); 
+
+  cuNDArray<_complext> result = cuNDArray<_complext>(&image_dims);
+  
+  // Define shutter for training data
+  _real shutter_radius = ((_real)matrix_size_os[0]/(_real)matrix_size[0])*(_real)profiles_per_frame/(_real)M_PI;
+  shutter_radius /= _real(2);
+  std::cout << "Shutter radius: " << shutter_radius << std::endl;
+
+  vector<size_t> image_os_dims = to_std_vector(matrix_size_os); 
+  image_os_dims.push_back(frames_per_reconstruction); image_os_dims.push_back(num_coils);    
+  cuNDArray<_complext> *image_os = new cuNDArray<_complext>(&image_os_dims);
+
+  timer = new GPUTimer("Full SENSE reconstruction.");
+  
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+
+    // 
+    // Estimate training data
+    // 
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, reconstruction*profiles_per_reconstruction );
+    
+    // Preprocess
+    image_dims.pop_back(); image_dims.push_back(frames_per_reconstruction); 
+    E->set_domain_dimensions(&image_dims);
+    E->preprocess( traj.get() );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+
+    E->set_codomain_dimensions(data->get_dimensions().get());    
+
+    // Convolve to Cartesian k-space
+    E->get_plan()->convolve( data.get(), image_os, dcw.get(), cuNFFT_plan<_real,2>::NFFT_CONV_NC2C );
+
+    // Apply shutter
+    fill_border<_complext,2>( shutter_radius, image_os );
+
+    E->get_plan()->fft( image_os, cuNFFT_plan<_real,2>::NFFT_BACKWARDS );
+    E->get_plan()->deapodize( image_os );
+
+    // Remove oversampling
+    image_dims.push_back(num_coils);
+    cuNDArray<_complext> *image = new cuNDArray<_complext>(&image_dims);
+    crop<_complext,2>( (matrix_size_os-matrix_size)>>1, image_os, image );
+    image_dims.pop_back();
+
+    // Compute regularization image
+    cuNDArray<_complext> *reg_image = new cuNDArray<_complext>(&image_dims);
+
+    E->mult_csm_conj_sum( image, reg_image );
+    cuNDFFT<_real>::instance()->ifft( reg_image, 2, true );
+
+
+    R->compute( reg_image );
+
+    delete reg_image; reg_image = 0x0;
+    delete image; image = 0x0;
+    
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<_real> > _precon_weights( new cuNDArray<_real>(*__precon_weights.get()));
+    boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+    *R_diag *= kappa;
+    *_precon_weights += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(_precon_weights.get());
+    boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    
+    // Define preconditioning matrix
+    D->set_weights( precon_weights );
+    precon_weights.reset();
+      
+    //
+    // Conjugate gradient solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > cgresult;
+    {
+      GPUTimer timer("GPU Conjugate Gradient solve");
+      cgresult = cg.solve(data.get());
+    }
+
+    // Goto from x-f to x-t space
+    cuNDFFT<_real>::instance()->fft( cgresult.get(), 2 );
+    
+    // Copy cgresult to result
+    cuNDArray<_complext> tmp(&image_dims, result.get_data_ptr()+reconstruction*prod(matrix_size)*frames_per_reconstruction);    
+    tmp = *(cgresult.get());  
+  }
+  
+  delete timer;
+  delete image_os; image_os = 0x0;
+  csm.reset();
+
+  // All done, write out the result
+
+  timer = new GPUTimer("Writing out result");
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_result = result.to_host();
+  write_nd_array<_complext>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+    
+  boost::shared_ptr< hoNDArray<_real> > host_norm = abs(&result)->to_host();
+  write_nd_array<_real>( host_norm.get(), "result.real" );
+  
+  delete timer;
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/mri/sense/noncartesian/radial/CMakeLists.txt b/apps/standalone/gpu/mri/sense/noncartesian/radial/CMakeLists.txt
new file mode 100644
index 0000000..6add522
--- /dev/null
+++ b/apps/standalone/gpu/mri/sense/noncartesian/radial/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Make the pmri GPU toolbox headers visible to the examples below (same path as
+# the registration examples use). NOTE(review): confirm against the source tree.
+include_directories(${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu)
+
+add_subdirectory(2d_golden_ratio)
+add_subdirectory(2d_golden_ratio_kt)
+
+#if (QT4_FOUND AND GLEW_FOUND AND GLUT_FOUND AND OPENGL_FOUND)
+#  add_subdirectory(2d_golden_ratio_gui)
+#endif (QT4_FOUND AND GLEW_FOUND AND GLUT_FOUND AND OPENGL_FOUND)
diff --git a/apps/standalone/gpu/registration/2d/CMakeLists.txt b/apps/standalone/gpu/registration/2d/CMakeLists.txt
new file mode 100644
index 0000000..a0b8189
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/CMakeLists.txt
@@ -0,0 +1,52 @@
+# Header search path needed by the sources below (pmri GPU toolbox).
+include_directories(${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu)
+
+# One executable per example source file.
+add_executable(register_HS_2d_gpu   register_HS_2d.cpp)
+add_executable(register_CGHS_2d_gpu register_CGHS_2d.cpp)
+add_executable(register_CK_2d_gpu   register_CK_2d.cpp)
+add_executable(test_reg_sense_recon test_reg_sense_recon.cpp)
+
+# The three registration examples link against the same set of libraries.
+target_link_libraries(register_HS_2d_gpu
+  hostutils
+  gpureg
+  gpucore
+  gpuoperators
+  gpusolvers
+  ${CUDA_LIBRARIES}
+)
+
+target_link_libraries(register_CGHS_2d_gpu
+  hostutils
+  gpureg
+  gpucore
+  gpuoperators
+  gpusolvers
+  ${CUDA_LIBRARIES}
+)
+
+target_link_libraries(register_CK_2d_gpu
+  hostutils
+  gpureg
+  gpucore
+  gpuoperators
+  gpusolvers
+  ${CUDA_LIBRARIES}
+)
+
+# The reconstruction test additionally links gpunfft and gpuparallelmri.
+target_link_libraries(test_reg_sense_recon
+  hostutils
+  gpureg
+  gpucore
+  gpuoperators
+  gpusolvers
+  gpunfft
+  gpuparallelmri
+  ${CUDA_LIBRARIES}
+)
+
+# Only the three registration examples are installed; the test program is not.
+install(TARGETS register_HS_2d_gpu register_CGHS_2d_gpu register_CK_2d_gpu
+  DESTINATION bin)
diff --git a/apps/standalone/gpu/registration/2d/register_CGHS_2d.cpp b/apps/standalone/gpu/registration/2d/register_CGHS_2d.cpp
new file mode 100644
index 0000000..f6bf8a8
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/register_CGHS_2d.cpp
@@ -0,0 +1,134 @@
+/*
+  An example of how to register two 2d images using Horn-Schunck optical flow (cuCGHSOFSolver variant)
+*/
+
+// Gadgetron includes
+#include "cuHSOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+#include "parameterparser.h"
+#include "cuCGHSOFSolver.h"
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Registers a moving image to a fixed image with cuCGHSOFSolver and writes the
+  // solver result and the deformed moving image to disk.
+  // Returns 0 on success and 1 on a usage, input or solver error.
+  //
+  // Parse command line
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.1" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  // Load sample data from disk
+  // (a null pointer from read_nd_array is reported as a missing image below)
+  
+  boost::shared_ptr< hoNDArray<_real> > host_fixed = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_moving = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !host_fixed.get() || !host_moving.get() ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  unsigned int num_fixed_dims = host_fixed->get_number_of_dimensions();
+  unsigned int num_moving_dims = host_moving->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 2 || num_fixed_dims == 3)  ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 2 || num_moving_dims == 3)  ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  // If only one image is three-dimensional, expand the other to the same get_size(2)
+  if( num_fixed_dims < num_moving_dims  ){
+    host_fixed = expand( host_fixed.get(), host_moving->get_size(2) );
+    num_fixed_dims = host_fixed->get_number_of_dimensions();
+  }
+
+  if( num_moving_dims < num_fixed_dims  ){
+    host_moving = expand( host_moving.get(), host_fixed->get_size(2) );
+    num_moving_dims = host_moving->get_number_of_dimensions();
+  }
+
+  // Upload host data to device
+  //
+
+  cuNDArray<_real> fixed_image(host_fixed.get());
+  cuNDArray<_real> moving_image(host_moving.get());
+  
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use bilinear interpolation for resampling
+  //
+
+  boost::shared_ptr< cuLinearResampleOperator<_real,2> > R( new cuLinearResampleOperator<_real,2>() );
+
+  // Setup solver
+  // (the inner solver from get_solver() is limited to 100 iterations)
+  
+  cuCGHSOFSolver<_real,2> HS;
+  HS.set_interpolator( R );
+  HS.set_output_mode( cuCGHSOFSolver<_real,2>::OUTPUT_VERBOSE );
+  HS.get_solver()->set_max_iterations( 100 );
+  HS.get_solver()->set_output_mode(cuCgSolver<_real>::OUTPUT_VERBOSE);
+  HS.set_num_multires_levels( multires_levels );
+  HS.set_alpha(alpha);
+  
+
+  // Run registration
+  // (a null result is treated as a solver failure)
+
+  boost::shared_ptr< cuNDArray<_real> > result = HS.solve( &fixed_image, &moving_image );
+
+  if( !result.get() ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = HS.deform( &moving_image, result );
+  
+  // All done, write out the result
+  // (solver result to the file given by 'r', deformed image to "def_moving.real")
+
+  boost::shared_ptr< hoNDArray<_real> > host_result = result->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  host_result = deformed_moving->to_host();
+  write_nd_array<_real>(host_result.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/2d/register_CK_2d.cpp b/apps/standalone/gpu/registration/2d/register_CK_2d.cpp
new file mode 100644
index 0000000..5baaa81
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/register_CK_2d.cpp
@@ -0,0 +1,129 @@
+/*
+  An example of how to register two 2d images using Cornelius-Kanade optical flow
+*/
+
+// Gadgetron includes
+#include "cuCKOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+#include "GadgetronTimer.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Example driver: registers a moving image to a fixed image with
+  // cuCKOpticalFlowSolver, then writes the solver result and the deformed
+  // moving image to disk.
+  // The exit status is 1 on any failure and 0 otherwise.
+
+  // Declare the command line options
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+
+  // Parse what was given on the command line
+  parms.parse_parameter_list(argc, argv);
+
+  // Without all required options there is nothing to do but print the usage
+  if( !parms.all_required_parameters_set() ){
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  cout << " Running registration with the following parameters: " << endl;
+  parms.print_parameter_list();
+
+  // Read the fixed and the moving image from disk
+  boost::shared_ptr< hoNDArray<_real> > fixed_on_host =
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > moving_on_host =
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+
+  // A null pointer is taken to mean that an image could not be read
+  if( fixed_on_host.get() == 0x0 || moving_on_host.get() == 0x0 ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Only two- and three-dimensional images are accepted
+  unsigned int fixed_rank = fixed_on_host->get_number_of_dimensions();
+  unsigned int moving_rank = moving_on_host->get_number_of_dimensions();
+
+  if( fixed_rank != 2 && fixed_rank != 3 ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( moving_rank != 2 && moving_rank != 3 ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Copy both images to the device
+  cuNDArray<_real> fixed_image(fixed_on_host.get());
+  cuNDArray<_real> moving_image(moving_on_host.get());
+
+  // Solver settings given on the command line
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Resampling uses bilinear interpolation
+  boost::shared_ptr< cuLinearResampleOperator<_real,2> > interpolator( new cuLinearResampleOperator<_real,2>() );
+
+  // Configure the optical flow solver
+  // (verbose output, at most 500 iterations per multiresolution level)
+  typedef cuCKOpticalFlowSolver<_real,2> solver_type;
+  solver_type solver;
+  solver.set_interpolator( interpolator );
+  solver.set_output_mode( solver_type::OUTPUT_VERBOSE );
+  solver.set_max_num_iterations_per_level( 500 );
+  solver.set_num_multires_levels( multires_levels );
+  solver.set_alpha(alpha);
+  solver.set_beta(beta);
+  solver.set_limit(0.01f);
+
+  // Run the registration; the GadgetronTimer only lives for the duration of the solve
+  boost::shared_ptr< cuNDArray<_real> > displacements;
+  {
+    GadgetronTimer timer("Running registration");
+    displacements = solver.solve( &fixed_image, &moving_image );
+  }
+
+  // A null result signals failure
+  if( displacements.get() == 0x0 ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Deform the moving image with the solver result
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = solver.deform( &moving_image, displacements );
+
+  // Write the solver result to the file named by option 'r' ...
+  boost::shared_ptr< hoNDArray<_real> > host_copy = displacements->to_host();
+  write_nd_array<_real>(host_copy.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  // ... and the deformed moving image to "def_moving.real"
+  host_copy = deformed_moving->to_host();
+  write_nd_array<_real>(host_copy.get(), "def_moving.real" );
+
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/2d/register_HS_2d.cpp b/apps/standalone/gpu/registration/2d/register_HS_2d.cpp
new file mode 100644
index 0000000..fbf881f
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/register_HS_2d.cpp
@@ -0,0 +1,122 @@
+/*
+  An example of how to register two 2d images using Horn-Schunck optical flow
+*/
+
+// Gadgetron includes
+#include "cuHSOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Example driver: registers a moving image to a fixed image with
+  // cuHSOpticalFlowSolver, then writes the solver result and the deformed
+  // moving image to disk.
+  // The exit status is 1 on any failure and 0 otherwise.
+
+  // Declare the command line options
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.1" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+
+  // Parse what was given on the command line
+  parms.parse_parameter_list(argc, argv);
+
+  // Without all required options there is nothing to do but print the usage
+  if( !parms.all_required_parameters_set() ){
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+
+  cout << " Running registration with the following parameters: " << endl;
+  parms.print_parameter_list();
+
+  // Read the fixed and the moving image from disk
+  boost::shared_ptr< hoNDArray<_real> > fixed_on_host =
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > moving_on_host =
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+
+  // A null pointer is taken to mean that an image could not be read
+  if( fixed_on_host.get() == 0x0 || moving_on_host.get() == 0x0 ){
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Only two- and three-dimensional images are accepted
+  unsigned int fixed_rank = fixed_on_host->get_number_of_dimensions();
+  unsigned int moving_rank = moving_on_host->get_number_of_dimensions();
+
+  if( fixed_rank != 2 && fixed_rank != 3 ){
+    cout << endl << "The fixed image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  if( moving_rank != 2 && moving_rank != 3 ){
+    cout << endl << "The moving image is not two- or three-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Copy both images to the device
+  cuNDArray<_real> fixed_image(fixed_on_host.get());
+  cuNDArray<_real> moving_image(moving_on_host.get());
+
+  // Solver settings given on the command line
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Resampling uses bilinear interpolation
+  boost::shared_ptr< cuLinearResampleOperator<_real,2> > interpolator( new cuLinearResampleOperator<_real,2>() );
+
+  // Configure the optical flow solver
+  // (verbose output, at most 500 iterations per multiresolution level)
+  typedef cuHSOpticalFlowSolver<_real,2> solver_type;
+  solver_type solver;
+  solver.set_interpolator( interpolator );
+  solver.set_output_mode( solver_type::OUTPUT_VERBOSE );
+  solver.set_max_num_iterations_per_level( 500 );
+  solver.set_num_multires_levels( multires_levels );
+  solver.set_alpha(alpha);
+  solver.set_limit(0.01f);
+
+  // Run the registration
+  boost::shared_ptr< cuNDArray<_real> > displacements = solver.solve( &fixed_image, &moving_image );
+
+  // A null result signals failure
+  if( displacements.get() == 0x0 ){
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Deform the moving image with the solver result
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = solver.deform( &moving_image, displacements );
+
+  // Write the solver result to the file named by option 'r' ...
+  boost::shared_ptr< hoNDArray<_real> > host_copy = displacements->to_host();
+  write_nd_array<_real>(host_copy.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  // ... and the deformed moving image to "def_moving.real"
+  host_copy = deformed_moving->to_host();
+  write_nd_array<_real>(host_copy.get(), "def_moving.real" );
+
+  // All done
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/2d/test_reg_sense_recon.cpp b/apps/standalone/gpu/registration/2d/test_reg_sense_recon.cpp
new file mode 100644
index 0000000..1c12ae3
--- /dev/null
+++ b/apps/standalone/gpu/registration/2d/test_reg_sense_recon.cpp
@@ -0,0 +1,568 @@
+#define PAD_Z
+
+/*
+
+  This is an example of how to use optical flow image registration 
+  and the image resampling operator for image reconstruction.
+  
+  This example uses golden ratio Sense MRI for demonstration. 
+  It was tested with a free-breathing cardiac acquisition.
+
+  !!! Note !!!
+  ------------
+  No cardiac phase binning is performed.
+  And since the registration has trouble handling large, 
+  non-rigid deformations such as the heart contraction
+  it serves only for demonstration purposes. 
+
+  An actual application should bin the cardiac phases and use the 
+  registration to correct for respiratory motion only.
+*/
+
+#include "cuCKOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+#include "radial_utilities.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuSenseBuffer.h"
+#include "cuImageOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuCgSolver.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+//
+
+typedef float _real; 
+typedef complext<_real> _complext;
+typedef reald<_real,2>::Type _reald2;
+
+//
+// Define matrix operator for "registration reconstruction" using non-Cartesian Sense
+// For simplicity we assume that the respective operators have been setup from outside
+//
+
+template<class REAL, unsigned int D> class registrationReconOperator
+  : public linearOperator< cuNDArray< complext<REAL> > >
+{
+public:
+  // Chains a resampling operator R and a non-Cartesian Sense operator E: mult_M applies
+  // E*R, mult_MH applies R^H*E^H and mult_MH_M applies R^H*(E^H*E)*R (^H denotes the adjoint).
+  registrationReconOperator() : linearOperator< cuNDArray< complext< REAL> > >() {}
+  virtual ~registrationReconOperator() {}
+  // Set the encoding operator E; it is expected to be fully set up by the caller
+  inline void set_encoding_operator( boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D> > E ){
+    E_ = E;
+  }
+
+  // Set the resampling operator R; it must hold a displacement field before mult_* is called
+  inline void set_resampling_operator( boost::shared_ptr< cuLinearResampleOperator<complext<REAL>,D> > R ){
+    R_ = R;
+  }
+
+  // Computes E*R*in into 'out' (added to 'out' if 'accumulate'); throws cuda_error on missing input/operators
+  virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false)
+  {
+    if( !in || !out || !E_.get() || !R_.get() || !R_->get_displacement_field() ){
+      throw cuda_error("registrationReconOperator::mult_M failed (1)");
+    }
+    // Intermediate image: dimensions of the displacement field without its last dimension
+    std::vector<size_t> tmp_dims = *R_->get_displacement_field()->get_dimensions(); tmp_dims.pop_back();
+    cuNDArray< complext<REAL> > tmp_in_out(&tmp_dims);
+
+    // Deform the input image into multiple frames by applying the registration vector field
+    R_->mult_M( in, &tmp_in_out );
+
+    // Apply non-Cartesian Sense encoding; only this last step honours 'accumulate'
+    E_->mult_M( &tmp_in_out, out, accumulate );
+  }
+
+  // Computes R^H*E^H*in into 'out' (added to 'out' if 'accumulate'); R must also be preprocessed
+  virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false )
+  {
+    if( !in || !out || !E_.get() || !R_.get() || !R_->get_displacement_field() || !R_->is_preprocessed() ){
+      throw cuda_error("registrationReconOperator::mult_MH failed (1)");
+    }
+    // Intermediate image: dimensions of the displacement field without its last dimension
+    std::vector<size_t> tmp_dims = *R_->get_displacement_field()->get_dimensions(); tmp_dims.pop_back();
+    cuNDArray< complext<REAL> > tmp_in_out(&tmp_dims); 
+
+    // Apply adjoint non-Cartesian Sense encoding
+    E_->mult_MH( in, &tmp_in_out);
+
+    // Apply adjoint registration; only this last step honours 'accumulate'
+    R_->mult_MH( &tmp_in_out, out, accumulate );
+  }
+
+  // Computes R^H*(E^H*E)*R*in into 'out' (added to 'out' if 'accumulate'); throws cuda_error on missing input/operators
+  virtual void mult_MH_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false )
+  {
+    if( !in || !out || !E_.get() || !R_.get() || !R_->get_displacement_field() ){
+      throw cuda_error("registrationReconOperator::mult_MH_M failed (1)");
+    }
+    // Intermediate images: dimensions of the displacement field without its last dimension
+    std::vector<size_t> tmp_dims = *R_->get_displacement_field()->get_dimensions(); tmp_dims.pop_back();
+    cuNDArray< complext<REAL> > tmp_in_out1(&tmp_dims), tmp_in_out2(&tmp_dims); 
+
+    // Deform the input image into multiple frames by applying the registration vector field
+    R_->mult_M( in, &tmp_in_out1 );
+
+    // Apply non-Cartesian Sense encoding _iteration_
+    E_->mult_MH_M( &tmp_in_out1, &tmp_in_out2 );
+
+    // Apply adjoint registration; only this last step honours 'accumulate'
+    R_->mult_MH( &tmp_in_out2, out, accumulate );
+  }
+
+  virtual boost::shared_ptr< linearOperator< cuNDArray< complext<REAL> > > > clone() {
+    return linearOperator< cuNDArray<complext< REAL > > >::clone(this);
+  }
+
+private:
+  boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D> > E_;
+  boost::shared_ptr< cuLinearResampleOperator<complext<REAL>,D> > R_;
+};
+
+
+// Upload the samples of one reconstruction (all coils) from host to device.
+// Coil i is read at host offset i*total_samples_per_coil +
+// reconstruction*samples_per_reconstruction + offset. Throws cuda_error on failure.
+boost::shared_ptr< cuNDArray<_complext> > 
+upload_data( unsigned int reconstruction, unsigned int samples_per_reconstruction, unsigned int total_samples_per_coil, unsigned int num_coils, hoNDArray<_complext> *host_data, unsigned int offset = 0 )
+{
+  if( !host_data ) throw cuda_error("upload_data: no host data provided");
+  vector<size_t> dims; dims.push_back(samples_per_reconstruction); dims.push_back(num_coils);
+  boost::shared_ptr< cuNDArray<_complext> > data( new cuNDArray<_complext>(&dims) ); // owned at once: no leak if a copy fails
+  const size_t first = size_t(reconstruction)*samples_per_reconstruction+offset; // size_t avoids 32-bit wrap-around
+  for( size_t i=0; i<num_coils; i++ )
+    if( cudaMemcpy( data->get_data_ptr()+i*samples_per_reconstruction, host_data->get_data_ptr()+i*total_samples_per_coil+first,
+		    samples_per_reconstruction*sizeof(_complext), cudaMemcpyHostToDevice ) != cudaSuccess )
+      throw cuda_error("upload_data: cudaMemcpy failed");
+  return data;
+}
+
+int main(int argc, char** argv)
+{
+
+  // Parse command line
+  //
+
+  ParameterParser parms;
+  parms.add_parameter( 'd', COMMAND_LINE_STRING, 1, "MRI sample data file name", true, "fb_data.cplx" );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Reconstruction result file name", true, "result.real" );
+
+  // Parameters for the initial Sense reconstruction
+  //
+
+  parms.add_parameter( 'm', COMMAND_LINE_INT,    1, "Matrix size", true, "256" );
+  parms.add_parameter( 'o', COMMAND_LINE_INT,    1, "Oversampled matrix size", true, "384" );
+  parms.add_parameter( 'p', COMMAND_LINE_INT,    1, "Profiles per frame", true, "16" );
+  parms.add_parameter( 'i', COMMAND_LINE_INT,    1, "Number of iterations", true, "15" );
+  parms.add_parameter( 'k', COMMAND_LINE_FLOAT,  1, "Kernel width", true, "5.5" );
+  parms.add_parameter( 'K', COMMAND_LINE_FLOAT,  1, "Kappa", true, "0.1" );
+
+  // Parameters for the registration
+  //
+
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Registration regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Registration regularization weight (beta)", true, "1.0" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1;
+  }
+  
+  GPUTimer *timer = new GPUTimer("\nPerforming Sense reconstruction");
+
+  //
+  // First perform the Sense reconstruction, 
+  // resulting in aliased presumably...
+  //
+  
+  // Load sample data from disk
+  //
+  
+  boost::shared_ptr< hoNDArray<_complext> > host_data = read_nd_array<_complext>((char*)parms.get_parameter('d')->get_string_value());
+   
+  if( !(host_data->get_number_of_dimensions() == 3) ){
+    cout << endl << "Input data is not three-dimensional (#samples/profile x #profiles x #coils). Quitting!" << endl;
+    return 1;
+  }
+  
+  // Configuration from the host data
+  //
+
+  unsigned int samples_per_profile = host_data->get_size(0);
+  unsigned int num_profiles = host_data->get_size(1);
+  unsigned int num_coils = host_data->get_size(2);
+  
+  // Configuration from the command line
+  //
+
+  uint64d2 matrix_size = uint64d2(parms.get_parameter('m')->get_int_value(), parms.get_parameter('m')->get_int_value());
+  uint64d2 matrix_size_os = uint64d2(parms.get_parameter('o')->get_int_value(), parms.get_parameter('o')->get_int_value());
+  _real kernel_width = parms.get_parameter('k')->get_float_value();
+  _real kappa = parms.get_parameter('K')->get_float_value();
+  unsigned int num_iterations = parms.get_parameter('i')->get_int_value();
+  unsigned int profiles_per_frame = parms.get_parameter('p')->get_int_value();
+  unsigned int frames_per_reconstruction = 1;
+
+  // Silent correction of invalid command line parameters (clamp to valid range)
+  //
+
+  if( profiles_per_frame > num_profiles ) profiles_per_frame = num_profiles;
+  if( frames_per_reconstruction < 0 ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  if( frames_per_reconstruction*profiles_per_frame > num_profiles ) frames_per_reconstruction = num_profiles / profiles_per_frame;
+  
+  unsigned int profiles_per_reconstruction = frames_per_reconstruction*profiles_per_frame;
+  unsigned int samples_per_frame = profiles_per_frame*samples_per_profile;
+  unsigned int samples_per_reconstruction = profiles_per_reconstruction*samples_per_profile;
+
+  // Set density compensation weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > dcw = compute_radial_dcw_golden_ratio_2d
+    ( samples_per_profile, profiles_per_frame, (_real)matrix_size_os.vec[0]/(_real)matrix_size.vec[0], 
+      _real(1)/((_real)samples_per_profile/(_real)max(matrix_size.vec[0],matrix_size.vec[1])) );
+
+  // Define encoding matrix for non-Cartesian SENSE
+  //
+
+  boost::shared_ptr< cuNonCartesianSenseOperator<_real,2> > E( new cuNonCartesianSenseOperator<_real,2>() );  
+  E->setup( matrix_size, matrix_size_os, kernel_width );
+  
+  std::vector<size_t> tmp_vec = to_std_vector(matrix_size);
+  tmp_vec.push_back(frames_per_reconstruction);
+  E->set_domain_dimensions( &tmp_vec );
+
+  // Notify encoding operator of dcw
+  //
+  
+  E->set_dcw(dcw);
+  
+  // Define rhs buffer
+  //
+
+  boost::shared_ptr< cuSenseBuffer<_real,2> > rhs_buffer( new cuSenseBuffer<_real,2>() );
+  rhs_buffer->setup( matrix_size, matrix_size_os, kernel_width, num_coils, 8, 16 );
+  rhs_buffer->set_dcw(dcw);
+  
+  // Fill rhs buffer (go through all the data...)
+  //
+    
+  for( unsigned int iteration = 0; iteration < num_profiles/profiles_per_frame; iteration++ ) {
+
+    // Define trajectories
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, 1, iteration*profiles_per_frame );
+    
+    // Upload data
+    boost::shared_ptr< cuNDArray<_complext> > csm_data = upload_data
+      ( iteration, samples_per_frame, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Add frame to rhs buffer
+    rhs_buffer->add_frame_data( csm_data.get(), traj.get() );
+  }
+
+  // Estimate CSM
+  //
+
+  boost::shared_ptr< cuNDArray<_complext> > acc_images = rhs_buffer->get_accumulated_coil_images();
+  boost::shared_ptr< cuNDArray<_complext> > csm = estimate_b1_map<_real,2>( acc_images.get() );
+
+  E->set_csm(csm);
+
+  // Define regularization image operator 
+  //
+
+  std::vector<size_t> image_dims = to_std_vector(matrix_size);
+  cuNDArray<_complext> *regul_image = new cuNDArray<_complext>(&image_dims);
+  
+  E->mult_csm_conj_sum( acc_images.get(), regul_image );
+  acc_images.reset();
+
+  boost::shared_ptr< cuImageOperator<_complext> > R( new cuImageOperator<_complext>() ); 
+  R->set_weight( kappa );
+  R->compute( regul_image ); 
+  delete regul_image; regul_image = 0x0;
+
+  // Define preconditioning weights
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > _precon_weights = sum(abs_square(csm.get()).get(),2);
+  boost::shared_ptr< cuNDArray<_real> > R_diag = R->get();
+  *R_diag *= kappa;
+  *_precon_weights += *R_diag;
+  R_diag.reset();
+  reciprocal_sqrt_inplace(_precon_weights.get());
+  boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+  _precon_weights.reset();
+
+  // Define preconditioning matrix
+  //
+
+  boost::shared_ptr< cuCgPreconditioner<_complext> > D( new cuCgPreconditioner<_complext>() );
+  D->set_weights( precon_weights );
+  precon_weights.reset();
+  csm.reset();
+  
+  // Setup radial SENSE reconstructions (conjugate gradient solver)
+  //
+      
+  cuCgSolver<_complext> *cg = new cuCgSolver<_complext>;
+  cg->set_encoding_operator( E );  // encoding matrix
+  cg->add_regularization_operator( R );  // regularization matrix
+  cg->set_preconditioner ( D );  // preconditioning matrix
+  cg->set_max_iterations( num_iterations );
+  cg->set_tc_tolerance( 1e-6 );
+  cg->set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  // To save memory we allow only a certain number of frames
+  unsigned int max_num_frames = 25;
+  unsigned int reconstruction_offset = 100; // To find some respiratory movement in the test dataset
+  unsigned int num_reconstructions = num_profiles / profiles_per_reconstruction;
+  if( num_reconstructions<(max_num_frames+reconstruction_offset) ) reconstruction_offset = 0;
+  if( num_reconstructions > max_num_frames ) num_reconstructions = max_num_frames;
+  
+  // Allocate space for aliased reconstruction
+  //
+  
+  image_dims = to_std_vector(matrix_size); 
+  image_dims.push_back(frames_per_reconstruction*num_reconstructions); 
+  cuNDArray<_complext> *sense_result_cplx = new cuNDArray<_complext>; 
+  std::cout << std::endl << matrix_size[0] << " " << matrix_size[1] << " " << frames_per_reconstruction << " " << num_reconstructions;
+
+  sense_result_cplx->create(&image_dims);
+  
+  // Loop and reconstruct 
+  // 
+
+  for( unsigned int reconstruction = 0; reconstruction<num_reconstructions; reconstruction++ ){
+    
+    // Determine trajectories
+    //
+
+    boost::shared_ptr< cuNDArray<_reald2> > traj = compute_radial_trajectory_golden_ratio_2d<_real>
+      ( samples_per_profile, profiles_per_frame, frames_per_reconstruction, (reconstruction+reconstruction_offset)*profiles_per_reconstruction );
+    
+    // Upload data
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+      ( reconstruction+reconstruction_offset, samples_per_reconstruction, num_profiles*samples_per_profile, num_coils, host_data.get() );
+    
+    // Set current trajectory and trigger NFFT preprocessing
+    //
+    
+    E->preprocess(traj.get());
+    
+    // Form rhs (use sense_result_cplx array to save memory)
+    //
+    
+    vector<size_t> rhs_dims = to_std_vector(matrix_size); 
+    rhs_dims.push_back(frames_per_reconstruction);
+    cuNDArray<_complext> rhs; 
+
+    rhs.create( &rhs_dims, sense_result_cplx->get_data_ptr()+
+		reconstruction*prod(matrix_size)*frames_per_reconstruction );
+
+    E->mult_MH( data.get(), &rhs );
+    
+    // Conjugate gradient solver
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > cgresult = cg->solve(data.get());
+    rhs = *(cgresult.get());
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > sense_result = abs(sense_result_cplx);
+  write_nd_array<_complext>(sense_result_cplx->to_host().get(), "_images_all.cplx");
+
+  // We need all our device memory for the registration. Clean up after Sense.
+  // E.reset(); D.reset(); R.reset();   -- we will reuse these below 
+
+  rhs_buffer.reset();
+  delete cg; delete sense_result_cplx;
+  delete timer;
+
+  // Determine fixed/moving image dimensions and create arrays
+  //
+
+#ifdef PAD_Z
+  std::vector<size_t> _3d_dims = *(sense_result->get_dimensions());
+  unsigned int last_dim = _3d_dims.back();
+  _3d_dims.pop_back(); _3d_dims.push_back(1); _3d_dims.push_back(last_dim);
+  sense_result->reshape( &_3d_dims );
+#endif
+  
+  vector<size_t> multi_dims = *sense_result->get_dimensions();
+  multi_dims.pop_back();
+#ifdef PAD_Z
+  multi_dims.push_back(sense_result->get_size(3)-1);
+#else
+  multi_dims.push_back(sense_result->get_size(2)-1);
+#endif
+  vector<size_t> single_dims = *sense_result->get_dimensions();
+  single_dims.pop_back();
+  
+  cuNDArray<_real> 
+    *multi_image = new cuNDArray<_real>, 
+    *single_image = new cuNDArray<_real>;
+  
+  single_image->create( &single_dims, sense_result->get_data_ptr());
+  multi_image->create( &multi_dims, sense_result->get_data_ptr()+prod(matrix_size));
+  
+  write_nd_array<_real>(multi_image->to_host().get(), "_images_multi.real");
+  write_nd_array<_real>(single_image->to_host().get(), "_image_single.real");
+
+  // Setup registration solver
+  //
+#ifdef PAD_Z
+  cuCKOpticalFlowSolver<_real,3> *CK = new cuCKOpticalFlowSolver<_real,3>;
+#else
+  cuCKOpticalFlowSolver<_real,2> *CK = new cuCKOpticalFlowSolver<_real,2>;
+#endif
+
+  //CK->set_output_mode( cuCKOpticalFlowSolver<_real,2>::OUTPUT_VERBOSE );  
+  CK->set_num_multires_levels( 1 );
+  CK->set_max_num_iterations_per_level( 500 );
+  CK->set_alpha((_real) parms.get_parameter('a')->get_float_value());
+  CK->set_beta((_real) parms.get_parameter('b')->get_float_value());
+  CK->set_limit(0.01f);
+  
+  // 
+  // Perform "averaging by registration" type reconstruction
+  //
+
+  timer = new GPUTimer("\nReconstruction by optical flow averaging");
+
+  // Run registration:
+  // - multi_image -> single_image (many to one registration)
+  // 
+
+  // All to one
+  boost::shared_ptr< cuNDArray<_real> > reg_result = CK->solve( single_image, multi_image );
+  
+  write_nd_array<_real>(reg_result->to_host().get(), "_reg1.real");
+
+  // Deform the multi_image according to the deformation field and average
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > regis_image = CK->deform( multi_image, reg_result );
+#ifdef PAD_Z
+  boost::shared_ptr< cuNDArray<_real> > regis_image_avg = sum<_real>( regis_image.get(), 3); 
+#else
+  boost::shared_ptr< cuNDArray<_real> > regis_image_avg = sum<_real>( regis_image.get(), 2); 
+#endif
+  write_nd_array<_real>(regis_image->to_host().get(), "_reg_avg.real");
+  write_nd_array<_real>(regis_image_avg->to_host().get(), "_avg_recon.real");
+
+  regis_image.reset(); regis_image_avg.reset(); reg_result.reset();
+
+  delete timer;
+
+  //
+  // Perform "registration in cost function" type reconstruction
+  //
+
+  timer = new GPUTimer("\nRunning registration recon");
+
+  // One to all
+  reg_result = CK->solve( multi_image, single_image );
+  
+  write_nd_array<_real>(reg_result->to_host().get(), "_reg2.real");
+
+  regis_image = CK->deform( single_image, reg_result );
+  write_nd_array<_real>(regis_image->to_host().get(), "_multi_def.real");
+  regis_image.reset(); 
+
+  // Test iteration
+  cuNDArray<_real> out; out.create(multi_image->get_dimensions().get());
+  cuNDArray<_real> in; in.create(single_image->get_dimensions().get());
+  
+  // Release memory
+  delete CK;
+  exit(1);
+  // Setup solver
+  //
+
+  // The non-Cartesian Sense operator is already setup, 
+  // but the trajectories must be recomputed and preprocessed
+
+  boost::shared_ptr< cuNDArray<_reald2> >traj = compute_radial_trajectory_golden_ratio_2d<_real>
+    ( samples_per_profile, profiles_per_frame, frames_per_reconstruction*(num_reconstructions-1), 
+      (1+reconstruction_offset)*profiles_per_reconstruction );
+  
+  E->preprocess(traj.get());
+
+  // Define and preprocess resampling operator
+  
+  boost::shared_ptr< cuLinearResampleOperator<_complext,2> > resampler
+    ( new cuLinearResampleOperator<_complext,2> );
+
+  resampler->set_displacement_field(reg_result);
+  resampler->mult_MH_preprocess();
+
+  // Define registrationReconstruction encoding operator
+
+  boost::shared_ptr< registrationReconOperator<_real,2> > 
+    RR( new registrationReconOperator<_real,2>() );  
+
+  std::vector<size_t> rhs_dims = to_std_vector(matrix_size); 
+  RR->set_domain_dimensions( &rhs_dims );
+
+  RR->set_encoding_operator( E );
+  RR->set_resampling_operator( resampler );
+
+  cg = new cuCgSolver<_complext>;
+  cg->set_encoding_operator( RR );
+  cg->add_regularization_operator( R );
+  cg->set_preconditioner ( D ); 
+  cg->set_max_iterations( num_iterations );
+  cg->set_tc_tolerance( 1e-6 );
+  cg->set_output_mode( cuCgSolver<_complext>::OUTPUT_VERBOSE );
+
+  // Form rhs
+  
+  boost::shared_ptr< cuNDArray<_complext> > data = upload_data
+    ( 0, samples_per_reconstruction*(num_reconstructions-1), 
+      num_profiles*samples_per_profile, num_coils, host_data.get(), 
+      (reconstruction_offset+1)*samples_per_reconstruction );
+  
+  cuNDArray<_complext> rhs(&rhs_dims); 
+  RR->mult_MH( data.get(), &rhs );
+  
+  write_nd_array<_complext>(rhs.to_host().get(), "_rhs.cplx" );
+  write_nd_array<_real>(abs(&rhs)->to_host().get(), "_rhs.real" );
+ 
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<_complext> > cgresult = cg->solve(data.get());
+
+  boost::shared_ptr< hoNDArray<_real> > host_image = abs(cgresult.get())->to_host();
+  write_nd_array<_real>(host_image.get(), "_reg_frame.real" );
+  
+  delete timer;
+   
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/3d/CMakeLists.txt b/apps/standalone/gpu/registration/3d/CMakeLists.txt
new file mode 100644
index 0000000..78abd8c
--- /dev/null
+++ b/apps/standalone/gpu/registration/3d/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_executable(register_CK_3d_gpu register_CK_3d.cpp)  # standalone 3D registration example, single source file
+
+target_link_libraries(register_CK_3d_gpu  # libraries named below, followed by whatever ${CUDA_LIBRARIES} expands to
+  hostutils 
+  gpureg 
+  gpucore 
+  gpuoperators
+  gpusolvers 
+  ${CUDA_LIBRARIES}
+  )
+
+install(TARGETS register_CK_3d_gpu DESTINATION bin)  # executable goes to <install prefix>/bin
diff --git a/apps/standalone/gpu/registration/3d/register_CK_3d.cpp b/apps/standalone/gpu/registration/3d/register_CK_3d.cpp
new file mode 100644
index 0000000..6700724
--- /dev/null
+++ b/apps/standalone/gpu/registration/3d/register_CK_3d.cpp
@@ -0,0 +1,124 @@
+/*
+  An example of how to register two 3d volumes using Cornelius-Kanade optical flow
+*/
+
+// Gadgetron includes
+#include "cuCKOpticalFlowSolver.h"
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "parameterparser.h"
+
+// Std includes
+#include <iostream>
+
+using namespace Gadgetron;
+using namespace std;
+
+// Define desired precision
+typedef float _real; 
+
+int main(int argc, char** argv)
+{
+  // Registers a moving volume to a fixed volume with the CK optical flow solver and writes the results to disk (returns 0 on success, 1 on any failure).
+  // Parse command line
+  // (-f fixed image, -m moving image, -r result file, -a alpha, -b beta, -l number of multiresolution levels)
+
+  ParameterParser parms;
+  parms.add_parameter( 'f', COMMAND_LINE_STRING, 1, "Fixed image file name (.real)", true );
+  parms.add_parameter( 'm', COMMAND_LINE_STRING, 1, "Moving image file name (.real)", true );
+  parms.add_parameter( 'r', COMMAND_LINE_STRING, 1, "Result file name", true, "displacement_field.real" );
+  parms.add_parameter( 'a', COMMAND_LINE_FLOAT,  1, "Regularization weight (alpha)", true, "0.05" );
+  parms.add_parameter( 'b', COMMAND_LINE_FLOAT,  1, "Regularization weight (beta)", true, "1.0" );
+  parms.add_parameter( 'l', COMMAND_LINE_INT,    1, "Number of multiresolution levels", true, "3" );
+  
+  parms.parse_parameter_list(argc, argv);
+  if( parms.all_required_parameters_set() ){
+    cout << " Running registration with the following parameters: " << endl;
+    parms.print_parameter_list();
+  }
+  else{
+    cout << " Some required parameters are missing: " << endl;
+    parms.print_parameter_list();
+    parms.print_usage();
+    return 1; // required arguments missing: show usage and give up
+  }
+  
+  // Load sample data from disk
+  // (file names are taken from the -f and -m parameters)
+  
+  boost::shared_ptr< hoNDArray<_real> > host_fixed = 
+    read_nd_array<_real>((char*)parms.get_parameter('f')->get_string_value());
+
+  boost::shared_ptr< hoNDArray<_real> > host_moving = 
+    read_nd_array<_real>((char*)parms.get_parameter('m')->get_string_value());
+  
+  if( !host_fixed.get() || !host_moving.get() ){ // a null pointer from read_nd_array is treated as "file not found"
+    cout << endl << "One of the input images is not found. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  unsigned int num_fixed_dims = host_fixed->get_number_of_dimensions();
+  unsigned int num_moving_dims = host_moving->get_number_of_dimensions();
+
+  if( !(num_fixed_dims == 3 || num_fixed_dims == 4)  ){ // only 3- or 4-dimensional inputs are accepted
+    cout << endl << "The fixed image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  if( !(num_moving_dims == 3 || num_moving_dims == 4)  ){ // same dimensionality check for the moving image
+    cout << endl << "The moving image is not three- or four-dimensional. Quitting!\n" << endl;
+    return 1;
+  }
+
+  // Upload host data to device
+  // (the cuNDArray objects are constructed directly from the host arrays)
+
+  cuNDArray<_real> fixed_image(host_fixed.get());
+  cuNDArray<_real> moving_image(host_moving.get());
+  
+  _real alpha = (_real) parms.get_parameter('a')->get_float_value();
+  _real beta = (_real) parms.get_parameter('b')->get_float_value();
+
+  unsigned int multires_levels = parms.get_parameter('l')->get_int_value();
+
+  // Use trilinear interpolation for resampling
+  // (a 3D cuLinearResampleOperator is handed to the solver as its interpolator below)
+
+  boost::shared_ptr< cuLinearResampleOperator<_real,3> > R( new cuLinearResampleOperator<_real,3>() );
+
+  // Setup solver
+  // (verbose output, at most 500 iterations per level, limit 0.01; alpha, beta and level count come from the command line)
+  
+  cuCKOpticalFlowSolver<_real,3> CK;
+  CK.set_interpolator( R );
+  CK.set_output_mode( cuCKOpticalFlowSolver<_real,3>::OUTPUT_VERBOSE );  
+  CK.set_max_num_iterations_per_level( 500 );
+  CK.set_num_multires_levels( multires_levels );
+  CK.set_alpha(alpha);
+  CK.set_beta(beta);
+  CK.set_limit(0.01f);
+  
+  // Run registration
+  // (solve() receives the fixed image first and the moving image second)
+
+  boost::shared_ptr< cuNDArray<_real> > result = CK.solve( &fixed_image, &moving_image/*, true*/ );
+
+  if( !result.get() ){ // a null result is reported as a solver failure
+    cout << endl << "Registration solver failed. Quitting!\n" << endl;
+    return 1;
+  }
+  
+  boost::shared_ptr< cuNDArray<_real> > deformed_moving = CK.deform( &moving_image, result ); // presumably warps the moving image with the solver result -- confirm against cuCKOpticalFlowSolver
+  
+  // All done, write out the result
+  // (solver result goes to the -r file name; the deformed image always goes to "def_moving.real")
+
+  boost::shared_ptr< hoNDArray<_real> > host_result = result->to_host();
+  write_nd_array<_real>(host_result.get(), (char*)parms.get_parameter('r')->get_string_value());
+
+  host_result = deformed_moving->to_host(); // host_result is reused for the second output
+  write_nd_array<_real>(host_result.get(), "def_moving.real" );
+  
+  return 0;
+}
diff --git a/apps/standalone/gpu/registration/CMakeLists.txt b/apps/standalone/gpu/registration/CMakeLists.txt
new file mode 100644
index 0000000..4b458f1
--- /dev/null
+++ b/apps/standalone/gpu/registration/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories(  # header search paths for the optical_flow toolbox and its gpu subdirectory
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/gpu
+  )
+
+add_subdirectory(2d)  # build the targets defined in the 2d subdirectory
+add_subdirectory(3d)  # build the targets defined in the 3d subdirectory
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
new file mode 100644
index 0000000..dc63e8b
--- /dev/null
+++ b/cmake/CMakeLists.txt
@@ -0,0 +1,15 @@
+install(FILES 	# copy the listed Find*.cmake modules into the installation
+FindACE.cmake
+FindCULA.cmake
+FindFFTW3.cmake
+FindGLEW.cmake
+FindNumPy.cmake
+FindTinyXML.cmake
+FindGadgetron.cmake
+FindXSD.cmake
+FindXercesC.cmake
+FindXalanC.cmake
+FindIsmrmrd.cmake
+FindGMatlab.cmake
+FindDCMTK.cmake
+DESTINATION cmake)  # i.e. <install prefix>/cmake
diff --git a/cmake/FindACE.cmake b/cmake/FindACE.cmake
new file mode 100644
index 0000000..3eed289
--- /dev/null
+++ b/cmake/FindACE.cmake
@@ -0,0 +1,90 @@
+#
+# Find the ACE client includes and library
+#
+
+# This module defines
+# ACE_INCLUDE_DIR, where to find ace.h
+# ACE_LIBRARIES, the libraries to link against ... !! NOT header is old !! ...
+# ACE_FOUND, if false, you cannot build anything that requires ACE
+
+# This is the new header...
+
+######################################################################## 
+## check pkg-config for ace information, if available 
+ 
+SET(ACE_INCLUDE_DIR_GUESS)  # start from empty values on every run
+SET(ACE_LIBRARY_DIR_GUESS) 
+SET(ACE_LINK_FLAGS) 
+IF(PKGCONFIG_EXECUTABLE)  # PKGCONFIG_EXECUTABLE and the PKGCONFIG macro are not defined in this file; they must come from the including project
+	PKGCONFIG(ace ACE_INCLUDE_DIR_GUESS ACE_LIBRARY_DIR_GUESS ACE_LINK_FLAGS ACE_C_FLAGS) 
+	IF (NOT ACE_LINK_FLAGS)  # retry with the upper-case package name when "ace" yielded no link flags
+		PKGCONFIG(ACE ACE_INCLUDE_DIR_GUESS ACE_LIBRARY_DIR_GUESS ACE_LINK_FLAGS ACE_C_FLAGS) 
+	ENDIF (NOT ACE_LINK_FLAGS) 
+	ADD_DEFINITIONS(${ACE_C_FLAGS})  # side effect: compile flags reported by pkg-config are added for the current directory
+ENDIF(PKGCONFIG_EXECUTABLE) 
+ 
+SET(ACE_LINK_FLAGS "${ACE_LINK_FLAGS}" CACHE INTERNAL "ace link flags") 
+ 
+######################################################################## 
+##  general find 
+# NOTE(review): ACE_INCLUDE_DIR_GUESS / ACE_LIBRARY_DIR_GUESS from the pkg-config step are not passed to the searches below
+FIND_PATH(ACE_INCLUDE_DIR ace/ACE.h ${CMAKE_SOURCE_DIR}/../ACE_wrappers/ /usr/include /usr/local/include $ENV{ACE_ROOT} $ENV{ACE_ROOT}/include DOC "directory containing ace/*.h for ACE library") 
+ 
+# in YARP1, config was in another directory 
+SET(ACE_INCLUDE_CONFIG_DIR "" CACHE STRING "location of ace/config.h") 
+MARK_AS_ADVANCED(ACE_INCLUDE_CONFIG_DIR) 
+ 
+FIND_LIBRARY(ACE_LIBRARY NAMES ACE ace PATHS ${CMAKE_SOURCE_DIR}/../ACE_wrappers/lib/ /usr/lib /usr/local/lib $ENV{ACE_ROOT}/lib $ENV{ACE_ROOT} DOC "ACE library file") 
+ 
+IF (WIN32 AND NOT CYGWIN) 
+	SET(CMAKE_DEBUG_POSTFIX "d")  # NOTE(review): CMAKE_DEBUG_POSTFIX is a CMake-wide variable, so this likely affects targets created afterwards too -- confirm this is intended
+	FIND_LIBRARY(ACE_DEBUG_LIBRARY NAMES ACE${CMAKE_DEBUG_POSTFIX} ace${CMAKE_DEBUG_POSTFIX} PATHS ${CMAKE_SOURCE_DIR}/../ACE_wrappers/lib/ /usr/lib /usr/local/lib $ENV{ACE_ROOT}/lib $ENV{ACE_ROOT} DOC "ACE library file (debug version)") 
+ENDIF (WIN32 AND NOT CYGWIN) 
+ 
+ 
+######################################################################## 
+## OS-specific extra linkage 
+ 
+# Solaris needs some extra libraries that may not have been found already 
+IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") 
+  MESSAGE(STATUS "need to link solaris-specific libraries") 
+  #  LINK_LIBRARIES(socket rt) 
+  SET(ACE_LIBRARY ${ACE_LIBRARY} socket rt nsl)  # ACE_LIBRARY becomes a list: the found library plus the system libraries
+ENDIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") 
+ 
+# Windows needs some extra libraries 
+IF (WIN32 AND NOT CYGWIN) 
+  MESSAGE(STATUS "need to link windows-specific libraries") 
+  #LINK_LIBRARIES(winmm) 
+  SET(ACE_LIBRARY ${ACE_LIBRARY} winmm)  # same pattern as above: winmm is appended to the found library
+ENDIF (WIN32 AND NOT CYGWIN) 
+ 
+ 
+######################################################################## 
+## finished - now just set up flags and complain to user if necessary 
+ 
+IF (ACE_INCLUDE_DIR AND ACE_LIBRARY)  # both the header directory and the library are needed for ACE_FOUND
+	SET(ACE_FOUND TRUE) 
+ELSE (ACE_INCLUDE_DIR AND ACE_LIBRARY) 
+	SET(ACE_FOUND FALSE) 
+ENDIF (ACE_INCLUDE_DIR AND ACE_LIBRARY) 
+ 
+IF (ACE_DEBUG_LIBRARY) 
+	SET(ACE_DEBUG_FOUND TRUE) 
+ELSE (ACE_DEBUG_LIBRARY)
+  SET(ACE_DEBUG_LIBRARY ${ACE_LIBRARY})  # no separate debug build found: reuse the regular library (ACE_DEBUG_FOUND is not set in this case)
+ENDIF (ACE_DEBUG_LIBRARY) 
+ 
+IF (ACE_FOUND)  # report the outcome; honours QUIET/REQUIRED for both find_package(ACE) and find_package(Ace)
+	IF (NOT Ace_FIND_QUIETLY AND NOT ACE_FIND_QUIETLY)  # find_package sets <name>_FIND_QUIETLY using the name exactly as the caller spelled it
+		MESSAGE(STATUS "Found ACE library: ${ACE_LIBRARY}") 
+		MESSAGE(STATUS "Found ACE include: ${ACE_INCLUDE_DIR}") 
+	ENDIF (NOT Ace_FIND_QUIETLY AND NOT ACE_FIND_QUIETLY) 
+ELSE (ACE_FOUND) 
+	IF (Ace_FIND_REQUIRED OR ACE_FIND_REQUIRED)  # previously only the "Ace" spelling was checked, so find_package(ACE REQUIRED) never failed
+		MESSAGE(FATAL_ERROR "Could not find ACE") 
+	ENDIF (Ace_FIND_REQUIRED OR ACE_FIND_REQUIRED) 
+ENDIF (ACE_FOUND) 
+
+# TSS: backwards compatibility
+SET(ACE_LIBRARIES ${ACE_LIBRARY})  # plural alias for the value held in ACE_LIBRARY
diff --git a/cmake/FindCULA.cmake b/cmake/FindCULA.cmake
new file mode 100644
index 0000000..8b5d0f8
--- /dev/null
+++ b/cmake/FindCULA.cmake
@@ -0,0 +1,63 @@
+# - Find CULA
+# Find the native CULA includes and library
+#
+#   CULA_FOUND       - True if CULA found.
+#   CULA_INCLUDE_DIR - where to find cula.h, etc.
+#   CULA_LIBRARIES   - List of libraries when using CULA.
+#
+
+IF( CULA_INCLUDE_DIR )
+    # Already in cache, be silent
+    SET( CULA_FIND_QUIETLY TRUE )
+ENDIF( CULA_INCLUDE_DIR )
+
+FIND_PATH( CULA_INCLUDE_DIR "cula.h"  # header lookup, also trying a cula/include suffix under each search prefix
+           PATH_SUFFIXES "cula/include" )
+
+MESSAGE("CULA_INCLUDE_DIR = ${CULA_INCLUDE_DIR}")  # printed unconditionally; not guarded by CULA_FIND_QUIETLY
+
+# Four independent library lookups follow; each one that succeeds is appended to CULA_LIBRARIES further down.
+FIND_LIBRARY( CULA_LIBRARY
+              NAMES "cula"
+              PATH_SUFFIXES "cula/lib64" )
+
+FIND_LIBRARY( CULA_LAPACK_LIBRARY
+              NAMES "cula_lapack"
+              PATH_SUFFIXES "cula/lib64" )
+
+FIND_LIBRARY( CULA_CORE_LIBRARY
+              NAMES "cula_core"
+              PATH_SUFFIXES "cula/lib64" )
+
+FIND_LIBRARY( CULA_LAPACK_BASIC_LIBRARY
+              NAMES "cula_lapack_basic"
+              PATH_SUFFIXES "cula/lib64" )
+
+#This is version 12 of CULA
+if (CULA_LIBRARY)
+  list(APPEND CULA_LIBRARIES ${CULA_LIBRARY})
+endif (CULA_LIBRARY)
+
+#This is version 13 of CULA
+if (CULA_LAPACK_LIBRARY)
+  list(APPEND CULA_LIBRARIES ${CULA_LAPACK_LIBRARY})
+endif (CULA_LAPACK_LIBRARY)
+
+#This is version 13 of CULA
+if (CULA_CORE_LIBRARY)
+  list(APPEND CULA_LIBRARIES ${CULA_CORE_LIBRARY})
+endif (CULA_CORE_LIBRARY)
+
+#This is version 17 of CULA
+if (CULA_LAPACK_BASIC_LIBRARY)
+  list(APPEND CULA_LIBRARIES ${CULA_LAPACK_BASIC_LIBRARY})
+endif (CULA_LAPACK_BASIC_LIBRARY)
+
+MESSAGE("CULA_LIBRARIES = ${CULA_LIBRARIES}")  # printed unconditionally as well
+
+# handle the QUIETLY and REQUIRED arguments and set CULA_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE( "FindPackageHandleStandardArgs" )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( "CULA" DEFAULT_MSG CULA_INCLUDE_DIR CULA_LIBRARIES )  # found = header directory plus at least one library
+
+MARK_AS_ADVANCED( CULA_INCLUDE_DIR CULA_LIBRARIES )  # NOTE(review): CULA_LIBRARIES is assembled with list(APPEND) above rather than by a find_* call
diff --git a/cmake/FindDCMTK.cmake b/cmake/FindDCMTK.cmake
new file mode 100644
index 0000000..f8e4aff
--- /dev/null
+++ b/cmake/FindDCMTK.cmake
@@ -0,0 +1,175 @@
+# - find DCMTK libraries and applications
+#
+
+#  DCMTK_INCLUDE_DIRS   - Directories to include to use DCMTK
+#  DCMTK_LIBRARIES     - Files to link against to use DCMTK
+#  DCMTK_FOUND         - If false, don't try to use DCMTK
+#  DCMTK_DIR           - (optional) Source directory for DCMTK
+#  DCMTK_HOME          - install path for dcmtk binaries/headers/libs
+#
+# DCMTK_DIR can be used to make it simpler to find the various include
+# directories and compiled libraries if you've just compiled it in the
+# source tree. Just set it to the root of the tree where you extracted
+# the source (default to /usr/include/dcmtk/)
+
+#=============================================================================
+# Copyright 2004-2009 Kitware, Inc.
+# Copyright 2009-2010 Mathieu Malaterre <mathieu.malaterre at gmail.com>
+# Copyright 2010 Thomas Sondergaard <ts at medical-insight.com>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+#
+# Written for VXL by Amitha Perera.
+# Upgraded for GDCM by Mathieu Malaterre.
+# Modified for EasyViz by Thomas Sondergaard.
+# Updated to work with non-standard install paths by Joseph Naegele
+#
+
+
+# Allow for non-standard dcmtk installation locations
+# Set the DCMTK_HOME environment variable to make life easier!!
+find_path(DCMTK_HOME include/dcmtk  # DCMTK_HOME = a directory that contains include/dcmtk
+    HINTS $ENV{DCMTK_HOME} $ENV{DCMTK_DIR}
+    PATHS /usr /usr/local
+    PATH_SUFFIXES dcmtk)
+mark_as_advanced(DCMTK_HOME)
+
+if(NOT DCMTK_FOUND AND NOT DCMTK_DIR)  # default DCMTK_DIR to the detected home unless the caller already provided one
+    set(DCMTK_DIR ${DCMTK_HOME})
+    mark_as_advanced(DCMTK_DIR)
+endif()
+
+foreach(lib  # one find_library call per DCMTK component library
+    dcmdata
+    dcmimage
+    dcmimgle
+    dcmjpeg
+    dcmnet
+    dcmpstat
+    dcmqrdb
+    dcmsign
+    dcmsr
+    dcmtls
+    ijg12
+    ijg16
+    ijg8
+    oflog
+    ofstd)
+
+    find_library(DCMTK_${lib}_LIBRARY
+        ${lib}
+        HINTS  # install layout first (lib/), then several in-source build layouts
+        ${DCMTK_DIR}/lib
+        ${DCMTK_DIR}/${lib}/libsrc
+        ${DCMTK_DIR}/${lib}/libsrc/Release
+        ${DCMTK_DIR}/${lib}/libsrc/Debug
+        ${DCMTK_DIR}/${lib}/Release
+        ${DCMTK_DIR}/${lib}/Debug)
+
+    mark_as_advanced(DCMTK_${lib}_LIBRARY)
+
+    if(DCMTK_${lib}_LIBRARY)  # libraries that were not found are skipped without an error here
+        list(APPEND DCMTK_LIBRARIES ${DCMTK_${lib}_LIBRARY})
+    endif()
+
+endforeach()
+
+
+set(DCMTK_config_TEST_HEADER osconfig.h)  # one header per component; the loop below searches for it to locate that component's include directory
+set(DCMTK_dcmdata_TEST_HEADER dctypes.h)
+set(DCMTK_dcmimage_TEST_HEADER dicoimg.h)
+set(DCMTK_dcmimgle_TEST_HEADER dcmimage.h)
+set(DCMTK_dcmjpeg_TEST_HEADER djdecode.h)
+set(DCMTK_dcmnet_TEST_HEADER assoc.h)
+set(DCMTK_dcmpstat_TEST_HEADER dcmpstat.h)
+set(DCMTK_dcmqrdb_TEST_HEADER dcmqrdba.h)
+set(DCMTK_dcmsign_TEST_HEADER sicert.h)
+set(DCMTK_dcmsr_TEST_HEADER dsrtree.h)
+set(DCMTK_dcmtls_TEST_HEADER tlslayer.h)
+set(DCMTK_oflog_TEST_HEADER oflog.h)
+set(DCMTK_ofstd_TEST_HEADER ofstdinc.h)
+
+foreach(dir  # same components as the library list plus "config", minus the ijg* libraries
+    config
+    dcmdata
+    dcmimage
+    dcmimgle
+    dcmjpeg
+    dcmnet
+    dcmpstat
+    dcmqrdb
+    dcmsign
+    dcmsr
+    dcmtls
+    oflog
+    ofstd)
+
+    find_path(DCMTK_${dir}_INCLUDE_DIR
+        ${DCMTK_${dir}_TEST_HEADER}
+        HINTS  # covers installed (include/dcmtk/<dir>) and in-source layouts
+        ${DCMTK_DIR}/include/dcmtk/${dir}
+        ${DCMTK_DIR}/${dir}/include
+        ${DCMTK_DIR}/${dir}
+        ${DCMTK_DIR}/include/${dir}
+        ${DCMTK_DIR}/${dir}/include/dcmtk/${dir}
+    )
+    mark_as_advanced(DCMTK_${dir}_INCLUDE_DIR)
+
+    if(DCMTK_${dir}_INCLUDE_DIR)  # only directories that were actually found end up in DCMTK_INCLUDE_DIRS
+        list(APPEND DCMTK_INCLUDE_DIRS ${DCMTK_${dir}_INCLUDE_DIR})
+    endif()
+endforeach()
+
+if(WIN32)
+    list(APPEND DCMTK_LIBRARIES netapi32 wsock32)  # extra system libraries appended on Windows
+endif()
+
+if(DCMTK_ofstd_INCLUDE_DIR)
+    get_filename_component(  # parent directory of the ofstd include directory
+        DCMTK_dcmtk_INCLUDE_DIR
+        ${DCMTK_ofstd_INCLUDE_DIR}
+        PATH
+        CACHE)
+    list(APPEND DCMTK_INCLUDE_DIRS ${DCMTK_dcmtk_INCLUDE_DIR})
+    mark_as_advanced(DCMTK_dcmtk_INCLUDE_DIR)
+endif()
+
+if(DCMTK_dcmtk_INCLUDE_DIR)
+    get_filename_component(  # one more level up from the directory computed above
+        DCMTK_root_INCLUDE_DIR
+        ${DCMTK_dcmtk_INCLUDE_DIR}
+        PATH
+        CACHE)
+    list(APPEND DCMTK_INCLUDE_DIRS ${DCMTK_root_INCLUDE_DIR})
+    mark_as_advanced(DCMTK_root_INCLUDE_DIR)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(DCMTK DEFAULT_MSG  # only the variables listed here are required for DCMTK_FOUND; the other components are optional
+  DCMTK_config_INCLUDE_DIR
+  DCMTK_ofstd_INCLUDE_DIR
+  DCMTK_ofstd_LIBRARY
+  DCMTK_oflog_INCLUDE_DIR
+  DCMTK_oflog_LIBRARY
+  DCMTK_dcmdata_INCLUDE_DIR
+  DCMTK_dcmdata_LIBRARY
+  DCMTK_dcmimgle_INCLUDE_DIR
+  DCMTK_dcmimgle_LIBRARY)
+
+# Compatibility: This variable is deprecated
+set(DCMTK_INCLUDE_DIR ${DCMTK_INCLUDE_DIRS})
+
+foreach(executable dcmdump dcmdjpeg dcmdrle)  # locate the command-line tools; results land in DCMTK_<UPPERCASE NAME>_EXECUTABLE
+  string(TOUPPER ${executable} EXECUTABLE)
+  find_program(DCMTK_${EXECUTABLE}_EXECUTABLE ${executable} ${DCMTK_DIR}/bin)
+  mark_as_advanced(DCMTK_${EXECUTABLE}_EXECUTABLE)
+endforeach()
diff --git a/cmake/FindFFTW3.cmake b/cmake/FindFFTW3.cmake
new file mode 100644
index 0000000..eccfe9f
--- /dev/null
+++ b/cmake/FindFFTW3.cmake
@@ -0,0 +1,93 @@
+# - Try to find FFTW3.
+# Usage: find_package(FFTW3 [COMPONENTS [single double long-double threads]])
+#
+# Variables used by this module:
+#  FFTW3_ROOT_DIR             - FFTW3 root directory
+# Variables defined by this module:
+#  FFTW3_FOUND                - system has FFTW3
+#  FFTW3_INCLUDE_DIR          - the FFTW3 include directory (cached)
+#  FFTW3_INCLUDE_DIRS         - the FFTW3 include directories
+#                               (identical to FFTW3_INCLUDE_DIR)
+#  FFTW3[FL]?_LIBRARY         - the FFTW3 library - double, single(F), 
+#                               long-double(L) precision (cached)
+#  FFTW3[FL]?_THREADS_LIBRARY - the threaded FFTW3 library - double, single(F), 
+#                               long-double(L) precision (cached)
+#  FFTW3_LIBRARIES            - list of all FFTW3 libraries found
+
+# Copyright (C) 2009-2010
+# ASTRON (Netherlands Institute for Radio Astronomy)
+# P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+#
+# This file is part of the LOFAR software suite.
+# The LOFAR software suite is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# The LOFAR software suite is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
+#
+# $Id: FindFFTW3.cmake 15918 2010-06-25 11:12:42Z loose $
+
+# Use double precision by default.
+if(FFTW3_FIND_COMPONENTS MATCHES "^$")  # true when the caller requested no components
+  set(_components double)
+else()
+  set(_components ${FFTW3_FIND_COMPONENTS})
+endif()
+
+# Loop over each component.
+set(_libraries)
+foreach(_comp ${_components})  # map each precision component to its library base name
+  if(_comp STREQUAL "single")
+    list(APPEND _libraries fftw3f)
+  elseif(_comp STREQUAL "double")
+    list(APPEND _libraries fftw3)
+  elseif(_comp STREQUAL "long-double")
+    list(APPEND _libraries fftw3l)
+  elseif(_comp STREQUAL "threads")
+    set(_use_threads ON)  # "threads" adds no library by itself; it is handled after the loop
+  else(_comp STREQUAL "single")
+    message(FATAL_ERROR "FindFFTW3: unknown component `${_comp}' specified. "
+      "Valid components are `single', `double', `long-double', and `threads'.")
+  endif(_comp STREQUAL "single")
+endforeach(_comp ${_components})
+
+# If using threads, we need to link against threaded libraries as well.
+if(_use_threads)
+  set(_thread_libs)
+  foreach(_lib ${_libraries})
+    list(APPEND _thread_libs ${_lib}_threads)
+  endforeach(_lib ${_libraries})
+  set(_libraries ${_thread_libs} ${_libraries})  # threaded variants are listed before the plain libraries
+endif(_use_threads)
+
+# Keep a list of variable names that we need to pass on to
+# find_package_handle_standard_args().
+set(_check_list)
+
+# Search for all requested libraries.
+foreach(_lib ${_libraries})
+  string(TOUPPER ${_lib} _LIB)  # e.g. fftw3f -> FFTW3F, giving the cache variable FFTW3F_LIBRARY
+  find_library(${_LIB}_LIBRARY ${_lib}
+    HINTS ${FFTW3_ROOT_DIR} PATH_SUFFIXES lib)
+  mark_as_advanced(${_LIB}_LIBRARY)
+  list(APPEND FFTW3_LIBRARIES ${${_LIB}_LIBRARY})  # appended whether or not the library was found; the check list below catches failures
+  list(APPEND _check_list ${_LIB}_LIBRARY)
+endforeach(_lib ${_libraries})
+
+# Search for the header file.
+find_path(FFTW3_INCLUDE_DIR fftw3.h 
+  HINTS ${FFTW3_ROOT_DIR} PATH_SUFFIXES include)
+mark_as_advanced(FFTW3_INCLUDE_DIR)
+list(APPEND _check_list FFTW3_INCLUDE_DIR)
+set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})  # documented in the module header as identical to FFTW3_INCLUDE_DIR, but was never set
+# Handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if
+# all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW3 DEFAULT_MSG ${_check_list})
diff --git a/cmake/FindGLEW.cmake b/cmake/FindGLEW.cmake
new file mode 100644
index 0000000..d9b35e0
--- /dev/null
+++ b/cmake/FindGLEW.cmake
@@ -0,0 +1,53 @@
+# - Try to find GLEW
+# Once done this will define
+#  
+#  GLEW_FOUND        - system has GLEW
+#  GLEW_INCLUDE_DIR  - the GLEW include directory
+#  GLEW_LIBRARY_DIR  - where the libraries are
+#  GLEW_LIBRARY      - Link these to use GLEW
+#   
+
+IF (GLEW_INCLUDE_DIR)
+  # Already in cache, be silent
+  SET(GLEW_FIND_QUIETLY TRUE)  # NOTE(review): nothing else in this module reads GLEW_FIND_QUIETLY
+ENDIF (GLEW_INCLUDE_DIR)
+
+if( WIN32 )
+   if( MSVC80 )
+       set( COMPILER_PATH "C:/Program\ Files/Microsoft\ Visual\ Studio\ 8/VC" )
+   endif( MSVC80 )
+   if( MSVC71 )
+       set( COMPILER_PATH "C:/Program\ Files/Microsoft\ Visual\ Studio\ .NET\ 2003/Vc7" )
+   endif( MSVC71 )
+   FIND_PATH( GLEW_INCLUDE_DIR gl/glew.h gl/wglew.h  # NOTE(review): without the NAMES keyword the second file name is presumably treated as a search path, not an alternative name -- confirm
+              PATHS c:/glew/include ${COMPILER_PATH}/PlatformSDK/Include )
+   SET( GLEW_NAMES glew32 )
+   FIND_LIBRARY( GLEW_LIBRARY
+                 NAMES ${GLEW_NAMES}
+                 PATHS c:/glew/lib ${COMPILER_PATH}/PlatformSDK/Lib )
+else( WIN32 )
+   FIND_PATH( GLEW_INCLUDE_DIR glew.h wglew.h  # same short-hand signature as in the Windows branch
+              PATHS /usr/local/include /usr/include
+              PATH_SUFFIXES gl/ GL/ )
+   SET( GLEW_NAMES glew GLEW )
+   FIND_LIBRARY( GLEW_LIBRARY
+                 NAMES ${GLEW_NAMES}
+                 PATHS /usr/lib /usr/local/lib )
+endif( WIN32 )
+
+GET_FILENAME_COMPONENT( GLEW_LIBRARY_DIR ${GLEW_LIBRARY} PATH )  # directory part of the library path found earlier in this module
+
+IF (GLEW_INCLUDE_DIR AND GLEW_LIBRARY)
+   SET(GLEW_FOUND TRUE)
+    # GLEW_LIBRARY_DIR keeps the directory computed above; it used to be overwritten here with the library file path itself.
+    MESSAGE("GLEW FOUND")
+ELSE (GLEW_INCLUDE_DIR AND GLEW_LIBRARY)
+   SET( GLEW_FOUND FALSE )
+   SET( GLEW_LIBRARY_DIR )  # clear the directory when GLEW is not usable
+    MESSAGE("GLEW NOT FOUND")
+ENDIF (GLEW_INCLUDE_DIR AND GLEW_LIBRARY)
+
+MARK_AS_ADVANCED(
+  GLEW_LIBRARY
+  GLEW_INCLUDE_DIR
+)
diff --git a/cmake/FindGMatlab.cmake b/cmake/FindGMatlab.cmake
new file mode 100644
index 0000000..e8a8147
--- /dev/null
+++ b/cmake/FindGMatlab.cmake
@@ -0,0 +1,115 @@
+# - this module looks for Matlab
+# Defines:
+#  MATLAB_INCLUDE_DIR:  include path for mex.h, engine.h
+#  MATLAB_LIBRARIES:    required libraries: libmex, etc
+#  MATLAB_JARS:         optional java jars: jmi.jar, util.jar, etc
+#  MATLAB_MEX_LIBRARY:  path to libmex.lib
+#  MATLAB_MX_LIBRARY:   path to libmx.lib
+#  MATLAB_ENG_LIBRARY:  path to libeng.lib
+
+#=============================================================================
+# Copyright 2005-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+#
+#  Modified April 2013 - Joseph Naegele
+#   - Updated to work on OS X 10._
+#   - Added Matlab's Java Jars as MATLAB_JARS
+
+set(MATLAB_FOUND 0)
+
+if(WIN32)
+    set(MATLAB_ROOT "[HKEY_LOCAL_MACHINE\\SOFTWARE\\MathWorks\\MATLAB\\7.0;MATLABROOT]")
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio 6")
+        set(MATLAB_LIB_ROOT "${MATLAB_ROOT}/extern/lib/win32/microsoft/msvc60")
+    else()
+        if(${CMAKE_GENERATOR} MATCHES "Visual Studio 7")
+            # Assume people are generally using 7.1,
+            # if using 7.0 need to link to: ../extern/lib/win32/microsoft/msvc70
+            set(MATLAB_LIB_ROOT "${MATLAB_ROOT}/extern/lib/win32/microsoft/msvc71")
+        else()
+            if(${CMAKE_GENERATOR} MATCHES "Borland")
+                # Same here, there are also: bcc50 and bcc51 directories
+                set(MATLAB_LIB_ROOT "${MATLAB_ROOT}/extern/lib/win32/microsoft/bcc54")
+            else()
+                if(MATLAB_FIND_REQUIRED)
+                    message(FATAL_ERROR "Generator not compatible: ${CMAKE_GENERATOR}")
+                endif()
+            endif()
+        endif()
+    endif()
+    find_path(
+        MATLAB_INCLUDE_DIR
+        "mex.h"
+        HINTS ${MATLAB_ROOT}
+        PATH_SUFFIXES extern/include
+    )
+else(WIN32)
+    find_path(
+        MATLAB_ROOT extern/include/mex.h
+        HINTS $ENV{MATLAB_HOME} $ENV{MATLAB_ROOT}
+        PATHS /usr /usr/local /opt
+        PATH_SUFFIXES MATLAB
+    )
+    find_path(
+        MATLAB_INCLUDE_DIR
+        "mex.h"
+        HINTS ${MATLAB_ROOT}
+        PATH_SUFFIXES extern/include
+    )
+endif(WIN32)
+
+# find each library
+# give it its own cmake variable
+# add it to the list of libraries (bin/<arch> suffixes: maci64, glnxa64, and glnx86 for 32-bit Linux)
+foreach(lib mat ut mex mx eng)
+    string(TOUPPER ${lib} LIB)
+    find_library(
+        MATLAB_${LIB}_LIBRARY
+        ${lib}
+        HINTS ${MATLAB_ROOT} ${MATLAB_LIB_ROOT}
+        PATH_SUFFIXES lib bin bin/maci64 bin/glnxa64 bin/glnx86
+    )
+    if(MATLAB_${LIB}_LIBRARY)
+        list(APPEND MATLAB_LIBRARIES "${MATLAB_${LIB}_LIBRARY}")
+    endif()
+    mark_as_advanced(MATLAB_${LIB}_LIBRARY)
+endforeach()
+
+foreach(jar jmi util)
+    string(TOUPPER ${jar} LIB)
+    find_file(
+        MATLAB_${LIB}_JAR
+        "${jar}.jar"
+        HINTS ${MATLAB_ROOT}
+        PATH_SUFFIXES java jar java/jar
+    )
+    if(MATLAB_${LIB}_JAR)
+        list(APPEND MATLAB_JARS "${MATLAB_${LIB}_JAR}")
+    endif()
+    mark_as_advanced(MATLAB_${LIB}_JAR)
+endforeach()
+
+if(MATLAB_INCLUDE_DIR AND MATLAB_LIBRARIES)
+    set(MATLAB_FOUND 1)
+endif()
+
+include("FindPackageHandleStandardArgs")
+FIND_PACKAGE_HANDLE_STANDARD_ARGS("Matlab" DEFAULT_MSG MATLAB_ROOT MATLAB_INCLUDE_DIR MATLAB_LIBRARIES)
+
+mark_as_advanced(
+    MATLAB_JARS
+    MATLAB_LIBRARIES
+    MATLAB_INCLUDE_DIR
+    MATLAB_FOUND
+    MATLAB_ROOT
+)
+
diff --git a/cmake/FindGadgetron.cmake b/cmake/FindGadgetron.cmake
new file mode 100644
index 0000000..1ae4a1c
--- /dev/null
+++ b/cmake/FindGadgetron.cmake
@@ -0,0 +1,40 @@
+#
+# Find the Gadgetron Installation
+#
+
+# This module defines
+# GADGETRON_INCLUDE_DIR, where to find Gadget.h
+# GADGETRON_HOME, Gadgetron Root Dir
+# GADGETRON_LIB_DIR, This is where all the installed gadgetron libraries live
+# GADGETRON_FOUND, if false, you cannot build anything that requires Gadgetron
+
+# Keep a list of variable names that we need to pass on to
+# find_package_handle_standard_args().
+set(_check_list)
+
+# Search for the header file.
+find_path(GADGETRON_HOME include/Gadget.h 
+  HINTS $ENV{GADGETRON_HOME} /usr/local/gadgetron /usr/gadgetron)
+mark_as_advanced(GADGETRON_HOME)
+list(APPEND _check_list GADGETRON_HOME)
+
+SET(GADGETRON_INCLUDE_DIR ${GADGETRON_HOME}/include)
+mark_as_advanced(GADGETRON_INCLUDE_DIR)
+list(APPEND _check_list GADGETRON_INCLUDE_DIR)
+
+SET(GADGETRON_LIB_DIR ${GADGETRON_HOME}/lib)
+mark_as_advanced(GADGETRON_LIB_DIR)
+list(APPEND _check_list GADGETRON_LIB_DIR)
+
+# Handle the QUIETLY and REQUIRED arguments and set GADGETRON_FOUND to TRUE if
+# all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Gadgetron DEFAULT_MSG ${_check_list})
+
+# If Cuda is detected on the system some header files will be needed
+# -- whether Cuda is used or not --
+
+find_package(CUDA)
+if (CUDA_FOUND)
+  include_directories( ${CUDA_INCLUDE_DIRS} )
+endif (CUDA_FOUND)
diff --git a/cmake/FindIsmrmrd.cmake b/cmake/FindIsmrmrd.cmake
new file mode 100644
index 0000000..07a3d3a
--- /dev/null
+++ b/cmake/FindIsmrmrd.cmake
@@ -0,0 +1,29 @@
+# - Find ISMRMRD
+#   ISMRMRD_FOUND            - true if an ISMRMRD installation is found.
+#   ISMRMRD_INCLUDE_DIR      - where to find ismrmrd.h, etc.
+#   ISMRMRD_LIBRARIES        - libismrmrd.so.
+#   ISMRMRD_XSD_INCLUDE_DIR  - folder containing ismrmrd.hxx (autogenerated from xsd schema)
+#   ISMRMRD_XSD_SOURCE       - full path to ismrmrd.cxx (autogenerated from xsd schema)
+#   ISMRMRD_SCHEMA_DIR       - where to find ismrmrd.xsd       
+
+FIND_PATH( ISMRMRD_INCLUDE_DIR ismrmrd.h 
+HINTS $ENV{ISMRMRD_HOME} PATHS /usr/local /usr PATH_SUFFIXES include ismrmrd ismrmrd/include)
+
+FIND_PATH( ISMRMRD_XSD_INCLUDE_DIR ismrmrd.hxx
+HINTS $ENV{ISMRMRD_HOME} PATHS /usr/local /usr PATH_SUFFIXES schema ismrmrd ismrmrd/schema)
+
+FIND_PATH( ISMRMRD_SCHEMA_DIR ismrmrd.xsd 
+HINTS $ENV{ISMRMRD_HOME} PATHS /usr/local /usr PATH_SUFFIXES schema ismrmrd ismrmrd/schema)
+
+FIND_LIBRARY( ISMRMRD_LIBRARIES
+              NAMES "ismrmrd"
+              PATHS  /usr/local/lib ${ISMRMRD_INCLUDE_DIR}/../lib /usr/lib )
+
+FIND_FILE( ISMRMRD_XSD_SOURCE
+           NAMES "ismrmrd.cxx"
+           HINTS $ENV{ISMRMRD_HOME} PATHS /usr/local /usr PATH_SUFFIXES schema ismrmrd ismrmrd/schema)
+
+INCLUDE( "FindPackageHandleStandardArgs" )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( "Ismrmrd" DEFAULT_MSG ISMRMRD_INCLUDE_DIR ISMRMRD_LIBRARIES ISMRMRD_SCHEMA_DIR)
+
+MARK_AS_ADVANCED( ISMRMRD_INCLUDE_DIR ISMRMRD_LIBRARIES ISMRMRD_SCHEMA_DIR)
diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake
new file mode 100644
index 0000000..c73317d
--- /dev/null
+++ b/cmake/FindMKL.cmake
@@ -0,0 +1,99 @@
+# - Find the MKL libraries
+# Modified from Armadillo's ARMA_FindMKL.cmake
+# This module defines
+#  MKL_INCLUDE_DIR, the directory for the MKL headers
+#  MKL_LIB_DIR, the directory for the MKL library files
+#  MKL_COMPILER_LIB_DIR, the directory for the MKL compiler library files
+#  MKL_LIBRARIES, the libraries needed to use Intel's implementation of BLAS & LAPACK.
+#  MKL_FOUND, If false, do not try to use MKL; if true, the macro definition USE_MKL is added.
+
+# Set the include path
+# TODO: what if MKL is not installed in /opt/intel/mkl?
+# try to find at /opt/intel/mkl
+# in windows, try to find MKL at C:/Program Files (x86)/Intel/Composer XE/mkl
+
+if ( WIN32 )
+    set(MKLROOT_PATH "C:/Program Files (x86)/Intel/Composer XE" CACHE PATH "Where the MKL are stored")
+else ( WIN32 )
+    set(MKLROOT_PATH "/opt/intel" CACHE PATH "Where the MKL are stored")
+endif ( WIN32 )
+
+if (EXISTS ${MKLROOT_PATH}/mkl)
+    SET(MKL_FOUND TRUE)
+    message("MKL is found at ${MKLROOT_PATH}/mkl")
+    IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+        set( USE_MKL_64BIT On )
+        if ( ARMADILLO_FOUND )
+            if ( ARMADILLO_BLAS_LONG_LONG )
+                set( USE_MKL_64BIT_LIB On )
+                ADD_DEFINITIONS(-DMKL_ILP64)
+                message("MKL is linked against ILP64 interface ... ")
+            endif ( ARMADILLO_BLAS_LONG_LONG )
+        endif ( ARMADILLO_FOUND )
+    ELSE(CMAKE_SIZEOF_VOID_P EQUAL 8)
+        set( USE_MKL_64BIT Off )
+    ENDIF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+else (EXISTS ${MKLROOT_PATH}/mkl)
+    SET(MKL_FOUND FALSE)
+    message("MKL is NOT found ... ")
+endif (EXISTS ${MKLROOT_PATH}/mkl)
+
+if (MKL_FOUND)
+    set(MKL_INCLUDE_DIR "${MKLROOT_PATH}/mkl/include")
+    ADD_DEFINITIONS(-DUSE_MKL)
+    if ( USE_MKL_64BIT )
+        set(MKL_LIB_DIR "${MKLROOT_PATH}/mkl/lib/intel64")
+        set(MKL_COMPILER_LIB_DIR "${MKLROOT_PATH}/compiler/lib/intel64")
+        set(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB_DIR} "${MKLROOT_PATH}/lib/intel64")
+        if ( USE_MKL_64BIT_LIB )
+            if ( WIN32 )
+                set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_ilp64)
+            else ( WIN32 )
+                set(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_intel_ilp64.a)
+            endif ( WIN32 )
+        else ( USE_MKL_64BIT_LIB )
+            if ( WIN32 )
+                set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_lp64)
+            else ( WIN32 )
+                set(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_intel_lp64.a)
+            endif ( WIN32 )
+        endif ( USE_MKL_64BIT_LIB )
+    else ( USE_MKL_64BIT )
+        set(MKL_LIB_DIR "${MKLROOT_PATH}/mkl/lib/ia32")
+        set(MKL_COMPILER_LIB_DIR "${MKLROOT_PATH}/compiler/lib/ia32")
+        set(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB_DIR} "${MKLROOT_PATH}/lib/ia32")
+        if ( WIN32 )
+            set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_c)
+        else ( WIN32 )
+            set(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_intel.a)
+        endif ( WIN32 )
+    endif ( USE_MKL_64BIT )
+
+    if ( WIN32 )
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_thread)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_core)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} libiomp5md)
+    else ( WIN32 )
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_intel_thread.a)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} libmkl_core.a)
+        SET(MKL_LIBRARIES ${MKL_LIBRARIES} iomp5)
+    endif ( WIN32 )
+endif (MKL_FOUND)
+
+IF (MKL_FOUND)
+    IF (NOT MKL_FIND_QUIETLY)
+        MESSAGE(STATUS "Found MKL libraries: ${MKL_LIBRARIES}")
+        MESSAGE(STATUS "MKL_INCLUDE_DIR: ${MKL_INCLUDE_DIR}")
+        MESSAGE(STATUS "MKL_LIB_DIR: ${MKL_LIB_DIR}")
+        MESSAGE(STATUS "MKL_COMPILER_LIB_DIR: ${MKL_COMPILER_LIB_DIR}")
+    ENDIF (NOT MKL_FIND_QUIETLY)
+
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+ELSE (MKL_FOUND)
+    IF (MKL_FIND_REQUIRED)
+        MESSAGE(FATAL_ERROR "Could not find MKL libraries")
+    ENDIF (MKL_FIND_REQUIRED)
+ENDIF (MKL_FOUND)
+
+# MARK_AS_ADVANCED(MKL_LIBRARY)
diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake
new file mode 100644
index 0000000..e2aa5c4
--- /dev/null
+++ b/cmake/FindNumPy.cmake
@@ -0,0 +1,102 @@
+# - Find the NumPy libraries
+# This module finds if NumPy is installed, and sets the following variables
+# indicating where it is.
+#
+# TODO: Update to provide the libraries and paths for linking npymath lib.
+#
+#  NUMPY_FOUND               - was NumPy found
+#  NUMPY_VERSION             - the version of NumPy found as a string
+#  NUMPY_VERSION_MAJOR       - the major version number of NumPy
+#  NUMPY_VERSION_MINOR       - the minor version number of NumPy
+#  NUMPY_VERSION_PATCH       - the patch version number of NumPy
+#  NUMPY_VERSION_DECIMAL     - e.g. version 1.6.1 is 10601
+#  NUMPY_INCLUDE_DIRS        - path to the NumPy include files
+
+
+#============================================================================
+# Copyright 2012 Continuum Analytics, Inc.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+#============================================================================
+
+# Finding NumPy involves calling the Python interpreter
+if(NumPy_FIND_REQUIRED)
+    find_package(PythonInterp REQUIRED)
+else()
+    find_package(PythonInterp)
+endif()
+
+if(NOT PYTHONINTERP_FOUND)
+    set(NUMPY_FOUND FALSE)
+    return()
+endif()
+
+execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+    "import numpy as n; print(n.__version__); print(n.get_include());"
+    RESULT_VARIABLE _NUMPY_SEARCH_SUCCESS
+    OUTPUT_VARIABLE _NUMPY_VALUES_OUTPUT
+    ERROR_VARIABLE _NUMPY_ERROR_VALUE
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+if(NOT _NUMPY_SEARCH_SUCCESS EQUAL 0)  # exact exit-status test; the regex form "MATCHES 0" also accepted 10, 100, ...
+    if(NumPy_FIND_REQUIRED)
+        message(FATAL_ERROR
+            "NumPy import failure:\n${_NUMPY_ERROR_VALUE}")
+    endif()
+    set(NUMPY_FOUND FALSE)
+    return()
+endif()
+
+# Convert the process output into a list: escape literal ';' first, then turn line breaks into separators
+string(REGEX REPLACE ";" "\\\\;" _NUMPY_VALUES "${_NUMPY_VALUES_OUTPUT}")
+string(REGEX REPLACE "\r?\n" ";" _NUMPY_VALUES "${_NUMPY_VALUES}")
+list(GET _NUMPY_VALUES 0 NUMPY_VERSION)
+list(GET _NUMPY_VALUES 1 NUMPY_INCLUDE_DIRS)
+
+string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" _VER_CHECK "${NUMPY_VERSION}")
+if("${_VER_CHECK}" STREQUAL "")
+    # The output from Python was unexpected. Raise an error always
+    # here, because we found NumPy, but it appears to be corrupted somehow.
+    message(FATAL_ERROR
+        "Requested version and include path from NumPy, got instead:\n${_NUMPY_VALUES_OUTPUT}\n")
+    return()
+endif()
+
+# Make sure all directory separators are '/' (input quoted so the path is passed as one argument)
+string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIRS "${NUMPY_INCLUDE_DIRS}")
+
+# Get the major and minor version numbers
+string(REGEX REPLACE "\\." ";" _NUMPY_VERSION_LIST ${NUMPY_VERSION})
+list(GET _NUMPY_VERSION_LIST 0 NUMPY_VERSION_MAJOR)
+list(GET _NUMPY_VERSION_LIST 1 NUMPY_VERSION_MINOR)
+list(GET _NUMPY_VERSION_LIST 2 NUMPY_VERSION_PATCH)
+string(REGEX MATCH "[0-9]*" NUMPY_VERSION_PATCH ${NUMPY_VERSION_PATCH})
+math(EXPR NUMPY_VERSION_DECIMAL
+    "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
+
+find_package_message(NUMPY
+    "Found NumPy: version \"${NUMPY_VERSION}\" ${NUMPY_INCLUDE_DIRS}"
+    "${NUMPY_INCLUDE_DIRS}${NUMPY_VERSION}")
+
+set(NUMPY_FOUND TRUE)
+
diff --git a/cmake/FindOctave.cmake b/cmake/FindOctave.cmake
new file mode 100644
index 0000000..52beab9
--- /dev/null
+++ b/cmake/FindOctave.cmake
@@ -0,0 +1,84 @@
+# Try to find the build flags to compile octave shared objects (oct and mex files)
+# Once done this will define
+#
+# OCTAVE_FOUND - if OCTAVE is found
+# OCTAVE_CXXFLAGS - extra flags
+# OCTAVE_INCLUDE_DIRS - include directories
+# OCTAVE_LINK_DIRS - link directories
+# OCTAVE_LIBRARY_RELEASE - the release version
+# OCTAVE_LIBRARY_DEBUG - the debug version
+# OCTAVE_LIBRARY - a default library, with priority debug.
+
+# use mkoctfile (located on the default program search paths)
+set(MKOCTFILE_EXECUTABLE MKOCTFILE_EXECUTABLE-NOTFOUND)
+find_program(MKOCTFILE_EXECUTABLE NAMES mkoctfile)  # the keyword is NAMES; NAME is not a find_program keyword
+mark_as_advanced(MKOCTFILE_EXECUTABLE)
+
+if(MKOCTFILE_EXECUTABLE)
+  set(OCTAVE_FOUND 1)
+
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p ALL_CXXFLAGS
+    OUTPUT_VARIABLE _mkoctfile_cppflags
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_cppflags "${_mkoctfile_cppflags}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p INCFLAGS
+    OUTPUT_VARIABLE _mkoctfile_includedir
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_includedir "${_mkoctfile_includedir}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p ALL_LDFLAGS
+    OUTPUT_VARIABLE _mkoctfile_ldflags
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_ldflags "${_mkoctfile_ldflags}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p LFLAGS
+    OUTPUT_VARIABLE _mkoctfile_lflags
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_lflags "${_mkoctfile_lflags}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p LIBS
+    OUTPUT_VARIABLE _mkoctfile_libs
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_libs "${_mkoctfile_libs}")
+  execute_process(
+    COMMAND ${MKOCTFILE_EXECUTABLE} -p OCTAVE_LIBS
+    OUTPUT_VARIABLE _mkoctfile_octlibs
+    RESULT_VARIABLE _mkoctfile_failed)
+  string(REGEX REPLACE "[\r\n]" " " _mkoctfile_octlibs "${_mkoctfile_octlibs}")
+  set(_mkoctfile_libs "${_mkoctfile_libs} ${_mkoctfile_octlibs}")
+
+  string(REGEX MATCHALL "(^| )-l([./+-_\\a-zA-Z]*)" _mkoctfile_libs "${_mkoctfile_libs}")
+  string(REGEX REPLACE "(^| )-l" "" _mkoctfile_libs "${_mkoctfile_libs}")
+
+  string(REGEX MATCHALL "(^| )-L([./+-_\\a-zA-Z]*)" _mkoctfile_ldirs "${_mkoctfile_lflags}")
+  string(REGEX REPLACE "(^| )-L" "" _mkoctfile_ldirs "${_mkoctfile_ldirs}")
+
+  string(REGEX MATCHALL "(^| )-I([./+-_\\a-zA-Z]*)" _mkoctfile_includedir "${_mkoctfile_includedir}")
+  string(REGEX REPLACE "(^| )-I" "" _mkoctfile_includedir "${_mkoctfile_includedir}")
+
+  string(REGEX REPLACE "(^| )-l([./+-_\\a-zA-Z]*)" " " _mkoctfile_ldflags "${_mkoctfile_ldflags}")
+  string(REGEX REPLACE "(^| )-L([./+-_\\a-zA-Z]*)" " " _mkoctfile_ldflags "${_mkoctfile_ldflags}")
+
+  separate_arguments(_mkoctfile_includedir)
+
+  set( OCTAVE_CXXFLAGS "${_mkoctfile_cppflags}" )
+  set( OCTAVE_LINK_FLAGS "${_mkoctfile_ldflags}" )
+  set( OCTAVE_INCLUDE_DIRS ${_mkoctfile_includedir})
+  set( OCTAVE_LINK_DIRS ${_mkoctfile_ldirs})
+  set( OCTAVE_LIBRARY ${_mkoctfile_libs})
+  set( OCTAVE_LIBRARY_RELEASE ${OCTAVE_LIBRARY})
+  set( OCTAVE_LIBRARY_DEBUG ${OCTAVE_LIBRARY})
+endif(MKOCTFILE_EXECUTABLE)
+
+MARK_AS_ADVANCED(
+    OCTAVE_LIBRARY_FOUND
+    OCTAVE_CXXFLAGS
+    OCTAVE_LINK_FLAGS
+    OCTAVE_INCLUDE_DIRS
+    OCTAVE_LINK_DIRS
+    OCTAVE_LIBRARY
+    OCTAVE_LIBRARY_RELEASE
+    OCTAVE_LIBRARY_DEBUG
+)
diff --git a/cmake/FindTinyXML.cmake b/cmake/FindTinyXML.cmake
new file mode 100644
index 0000000..2f3bc3c
--- /dev/null
+++ b/cmake/FindTinyXML.cmake
@@ -0,0 +1,26 @@
+# - Find TinyXML
+# Find the native TinyXML includes and library
+#
+#   TINYXML_FOUND       - True if TinyXML found.
+#   TINYXML_INCLUDE_DIR - where to find tinyxml.h, etc.
+#   TINYXML_LIBRARIES   - List of libraries when using TinyXML.
+#
+
+if(TINYXML_INCLUDE_DIR)
+    # TINYXML_INCLUDE_DIR is already set (e.g. cached), so stay quiet
+    set(TinyXML_FIND_QUIETLY TRUE)
+endif()
+
+find_path(TINYXML_INCLUDE_DIR
+          NAMES tinyxml.h PATH_SUFFIXES tinyxml)
+
+find_library(TINYXML_LIBRARIES
+             NAMES tinyxml
+             PATH_SUFFIXES tinyxml)
+
+# Let the standard helper process the QUIETLY and REQUIRED arguments and set
+# TINYXML_FOUND to TRUE when every variable listed below is TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(TinyXML DEFAULT_MSG TINYXML_INCLUDE_DIR TINYXML_LIBRARIES)
+
+mark_as_advanced(TINYXML_INCLUDE_DIR TINYXML_LIBRARIES)
diff --git a/cmake/FindXSD.cmake b/cmake/FindXSD.cmake
new file mode 100644
index 0000000..59a8834
--- /dev/null
+++ b/cmake/FindXSD.cmake
@@ -0,0 +1,68 @@
+# - Find CodeSynthesis XSD
+# This module can be used to find XSD and its include path
+# Variables:
+#	XSD_FOUND - System has XSD
+#	XSD_EXECUTABLE - XSD binary executable
+#	XSD_INCLUDE_DIR - XSD include directory
+#
+# Functions:
+#       WRAP_XSD - Generates C++ bindings in the given output directory for a given schema file
+
+if(NOT DEFINED XSD_DIR AND DEFINED ENV{XSD_DIR})
+    set(XSD_DIR $ENV{XSD_DIR})
+endif(NOT DEFINED XSD_DIR AND DEFINED ENV{XSD_DIR})
+
+find_program(XSD_EXECUTABLE NAMES xsd xsdcxx xsd.exe
+    PATHS ${XSD_DIR} /usr /usr/local
+    PATH_SUFFIXES bin
+)
+
+find_path(XSD_INCLUDE_DIR NAMES xsd/cxx/pre.hxx
+    PATHS ${XSD_DIR} /usr /usr/local
+    PATH_SUFFIXES include
+)
+
+FUNCTION(XSD_EXTRACT_OPTIONS _xsd_files _xsd_options)  # splits the extra arguments at the OPTIONS keyword
+	foreach(current_arg ${ARGN})
+		IF(current_arg STREQUAL "OPTIONS")  # compare via the variable name so its value is not dereferenced again
+			SET(_XSD_DOING_OPTIONS TRUE)
+		else(current_arg STREQUAL "OPTIONS")
+			if(_XSD_DOING_OPTIONS)
+				SET(_xsd_options_p ${_xsd_options_p} ${current_arg})
+			else(_XSD_DOING_OPTIONS)
+				SET(_xsd_files_p ${_xsd_files_p} ${current_arg})
+			endif(_XSD_DOING_OPTIONS)
+		endif(current_arg STREQUAL "OPTIONS")
+	endforeach(current_arg)
+	SET(${_xsd_files} ${_xsd_files_p} PARENT_SCOPE)  # arguments seen before OPTIONS
+	SET(${_xsd_options} ${_xsd_options_p} PARENT_SCOPE)  # arguments seen after OPTIONS
+ENDFUNCTION(XSD_EXTRACT_OPTIONS)
+
+
+FUNCTION(WRAP_XSD XSD_SRCS XSD_INCLUDES OUT_PATH)  # extra args: schema files [OPTIONS <xsd command-line options>...]
+	SET(OUTPUT_DIR  ${CMAKE_CURRENT_BINARY_DIR}/src/xsd)
+	FILE(MAKE_DIRECTORY ${OUTPUT_DIR})
+	SET(${XSD_INCLUDES} ${OUTPUT_DIR} PARENT_SCOPE)  # caller receives the directory of the generated headers
+	XSD_EXTRACT_OPTIONS(xsd_files xsd_options ${ARGN})
+	FOREACH(it ${xsd_files})
+		STRING(REGEX REPLACE ".*/" "" BARE_XSD "${it}" )
+		STRING(REGEX REPLACE "\\.xsd$" ".cxx" SOURCE "${BARE_XSD}" )  # replace only the literal trailing ".xsd"
+		STRING(REGEX REPLACE "\\.xsd$" ".hxx" HEADER "${BARE_XSD}" )
+		CONFIGURE_FILE(${it} ${OUT_PATH}/${BARE_XSD} COPYONLY)  # COPYONLY: copy the schema without variable substitution
+		SET(SOURCE ${OUTPUT_DIR}/${SOURCE})
+		SET(HEADER ${OUTPUT_DIR}/${HEADER})
+		ADD_CUSTOM_COMMAND(OUTPUT ${SOURCE} ${HEADER}
+				COMMAND ${XSD_EXECUTABLE} ${xsd_options} "--output-dir" ${OUTPUT_DIR} ${OUT_PATH}/${BARE_XSD}
+				DEPENDS ${it}
+				VERBATIM
+		)
+		set_source_files_properties(${HEADER} PROPERTIES GENERATED TRUE)
+		set_source_files_properties(${SOURCE} PROPERTIES GENERATED TRUE)
+		SET(_XSD_SRCS ${_XSD_SRCS} ${SOURCE} ${HEADER})
+	ENDFOREACH(it)
+	SET(${XSD_SRCS} ${_XSD_SRCS} PARENT_SCOPE)  # caller receives every generated .cxx/.hxx path
+ENDFUNCTION(WRAP_XSD)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(XSD DEFAULT_MSG XSD_INCLUDE_DIR XSD_EXECUTABLE)
+mark_as_advanced(XSD_INCLUDE_DIR XSD_EXECUTABLE)
diff --git a/cmake/FindXalanC.cmake b/cmake/FindXalanC.cmake
new file mode 100644
index 0000000..096feaa
--- /dev/null
+++ b/cmake/FindXalanC.cmake
@@ -0,0 +1,35 @@
+# - Try to find XalanC
+# Once done this will define
+#
+#  XALANC_FOUND - System has XalanC
+#  XALANC_INCLUDE_DIR - The XalanC include directory
+#  XALANC_LIBRARY_DIR - The XalanC library dir (not currently set by this module)
+#  XALANC_LIBRARIES - The libraries needed to use XalanC
+#  XALANC_DEFINITIONS - Compiler switches required for using XalanC (not currently set by this module)
+
+# Copyright (c) 2009, Helio Chissini de Castro, <helio at kde.org>
+#
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+
+IF (XALANC_INCLUDE_DIR AND XALANC_LIBRARIES)
+   # in cache already
+   SET(XalanC_FIND_QUIETLY TRUE)
+ENDIF (XALANC_INCLUDE_DIR AND XALANC_LIBRARIES)
+
+
+FIND_PATH(XALANC_INCLUDE_DIR DOMSupport/DOMServices.hpp
+	PATHS
+	/usr/local/include/xalanc
+	/usr/include/xalanc
+	PATH_SUFFIXES
+	xalanc
+	)
+
+FIND_LIBRARY(XALANC_LIBRARIES NAMES xalan-c xalanMsg)
+
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(XalanC DEFAULT_MSG XALANC_LIBRARIES XALANC_INCLUDE_DIR)
+
+MARK_AS_ADVANCED(XALANC_INCLUDE_DIR XALANC_LIBRARIES XALANC_LIBRARY_DIR)
diff --git a/cmake/FindXercesC.cmake b/cmake/FindXercesC.cmake
new file mode 100644
index 0000000..3bb9b81
--- /dev/null
+++ b/cmake/FindXercesC.cmake
@@ -0,0 +1,37 @@
+# This module defines
+# XERCESC_INCLUDE_DIR, where to find xercesc/dom/DOM.hpp, etc.
+# XERCESC_LIBRARIES, the libraries to link against to use Xerces-C.
+# XERCESC_FOUND, If false, don't try to use Xerces-C.
+
+FIND_PATH(XERCESC_INCLUDE_DIR xercesc/dom/DOM.hpp
+  "[HKEY_CURRENT_USER\\software\\xerces-c\\src]"
+  "[HKEY_CURRENT_USER\\xerces-c\\src]"
+  $ENV{XERCESCROOT}/src/
+  /usr/local/include
+  /usr/include
+)
+
+FIND_LIBRARY(XERCESC_LIBRARIES
+  NAMES 
+    xerces-c
+  PATHS
+    "[HKEY_CURRENT_USER\\software\\xerces-c\\lib]"
+    "[HKEY_CURRENT_USER\\xerces-c\\lib]"
+    $ENV{XERCESCROOT}/lib
+    /usr/local/lib
+    /usr/lib
+)
+
+# if the include and the library are found then we have it
+IF(XERCESC_INCLUDE_DIR)
+  IF(XERCESC_LIBRARIES)
+    SET( XERCESC_FOUND "YES" )
+  ENDIF(XERCESC_LIBRARIES)
+ENDIF(XERCESC_INCLUDE_DIR)
+
+
+
+MARK_AS_ADVANCED(
+  XERCESC_INCLUDE_DIR
+  XERCESC_LIBRARIES
+) 
\ No newline at end of file
diff --git a/doc/.gitignore b/doc/.gitignore
new file mode 100644
index 0000000..c8c6066
--- /dev/null
+++ b/doc/.gitignore
@@ -0,0 +1,7 @@
+death_row/figures/*.pdf
+*.gz
+*.log
+*.toc
+*.bbl
+*.blg
+*.aux
\ No newline at end of file
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
new file mode 100644
index 0000000..76e79ff
--- /dev/null
+++ b/doc/CMakeLists.txt
@@ -0,0 +1,30 @@
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+	add_subdirectory(doxygen)
+else(DOXYGEN_FOUND)
+	MESSAGE("Doxygen not found. Will not be able to build documentation")
+endif(DOXYGEN_FOUND)
+
+find_program(XSLTPROCEXE xsltproc  # xsltproc is an executable, so locate it with find_program
+		HINTS /usr/bin
+		/usr/local/bin)
+
+find_path(DOCBOOK_XSL_DIR html/docbook.xsl
+  PATHS /usr/local/share/xml/xls/docbook-xsl-ns
+  		/usr/share/xml/xls/docbook-xsl-ns
+  		/usr/share/xml/docbook 
+ 		/usr/share/xml/docbook/stylesheet/docbook-xsl-ns/
+  		/usr/share/sgml/docbook/xsl-ns-stylesheets
+       $ENV{DOCBOOKDIR}
+  NO_DEFAULT_PATH)
+
+if (NOT DOCBOOK_XSL_DIR)
+  message("Could not find HTML docbook.xsl, try to set DOCBOOKDIR")
+endif (NOT DOCBOOK_XSL_DIR)
+
+if (XSLTPROCEXE AND DOCBOOK_XSL_DIR)
+	MESSAGE("Docbook tools found, building manual XSLTPROCEXE: ${XSLTPROCEXE}")
+	add_subdirectory(manual)
+else (XSLTPROCEXE AND DOCBOOK_XSL_DIR)
+    MESSAGE("Docbook tools not found")
+endif (XSLTPROCEXE AND DOCBOOK_XSL_DIR)
diff --git a/doc/doxygen/CMakeLists.txt b/doc/doxygen/CMakeLists.txt
new file mode 100644
index 0000000..caa1691
--- /dev/null
+++ b/doc/doxygen/CMakeLists.txt
@@ -0,0 +1,8 @@
+if(NOT DOXYGEN_FOUND)
+	message("Doxygen not found. Will not be able to build documentation")
+else()
+	# Configure Doxyfile.in into the build tree (@ONLY), then add the "apidoc" target that runs Doxygen on it.
+	configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+	add_custom_target(apidoc ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Generating API documentation with Doxygen" VERBATIM)
+endif()
\ No newline at end of file
diff --git a/doc/doxygen/Doxyfile.in b/doc/doxygen/Doxyfile.in
new file mode 100644
index 0000000..ced4993
--- /dev/null
+++ b/doc/doxygen/Doxyfile.in
@@ -0,0 +1,1757 @@
+# Doxyfile 1.7.5.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = Gadgetron
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = A Streaming Framework for Medical Image Reconstruction
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@/../../apps/ @CMAKE_CURRENT_SOURCE_DIR@/../../toolboxes/ @CMAKE_CURRENT_SOURCE_DIR@/../../gadgets/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to directory from which doxygen is run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES Doxygen will
+# generate man pages. The default value is NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie farther from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/doc/manual/CMakeLists.txt b/doc/manual/CMakeLists.txt
new file mode 100644
index 0000000..2603493
--- /dev/null
+++ b/doc/manual/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Build rules for the DocBook manual: "htmlmanual" is always defined, "pdfmanual" only when the fop executable is found.
+# XSLTPROCEXE and DOCBOOK_XSL_DIR are not set in this file; they are expected to be provided by the including project.
+#configure_file(${CMAKE_CURRENT_SOURCE_DIR}/figs ${CMAKE_CURRENT_BINARY_DIR}/ COPYONLY)
+
+#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/figs DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+# xsltproc options shared by the HTML and FO transformations below.
+set(XSLTPROC_PARAMS --xinclude --stringparam use.role.for.mediaobject 1 --stringparam section.autolabel 1 --stringparam section.label.includes.component.label 1)
+# HTML manual: apply the DocBook HTML stylesheet to gadgetron_manual.xml once the figures have been copied.
+add_custom_target(htmlmanual ${XSLTPROCEXE} ${XSLTPROC_PARAMS} -o ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_manual.html
+${DOCBOOK_XSL_DIR}/html/docbook.xsl ${CMAKE_CURRENT_SOURCE_DIR}/gadgetron_manual.xml  
+DEPENDS manualfigs 
+WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+COMMENT "Generating HTML docbook manual" VERBATIM)
+# Copy the figs directory into the build tree. No file named "manualfigs" is written, so the copy re-runs whenever a dependent target is built.
+add_custom_command(OUTPUT manualfigs COMMAND ${CMAKE_COMMAND} ARGS -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/figs ${CMAKE_CURRENT_BINARY_DIR}/figs
+	COMMENT  "Copying figure files" VERBATIM)
+# fop is an executable, so locate it with find_program (searches PATH and platform executable suffixes) rather than find_file.
+find_program(FOPEXE fop
+		HINTS /usr/bin
+		/usr/local/fop
+		/usr/local/bin)
+
+if (FOPEXE)
+	MESSAGE("FOP found, Building PDF Manual, FOPEXE: ${FOPEXE}")
+	# PDF manual: render the intermediate FO file to PDF with fop.
+	add_custom_target(pdfmanual ${FOPEXE} ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_manual.fo ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_manual.pdf  DEPENDS fomanual manualfigs
+	COMMENT "Generating PDF docbook manual" VERBATIM)
+	# FO file: apply the DocBook FO stylesheet to gadgetron_manual.xml. No file named "fomanual" is written, so it is regenerated for each pdfmanual build.
+	add_custom_command(OUTPUT fomanual COMMAND ${XSLTPROCEXE} ${XSLTPROC_PARAMS} -o ${CMAKE_CURRENT_BINARY_DIR}/gadgetron_manual.fo
+	${DOCBOOK_XSL_DIR}/fo/docbook.xsl ${CMAKE_CURRENT_SOURCE_DIR}/gadgetron_manual.xml  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+	COMMENT "Generating FO file for PDF docbook manual" VERBATIM)
+	
+	
+else (FOPEXE)
+	MESSAGE("FOP executable not found, PDF manual cannot be built")
+endif(FOPEXE)
\ No newline at end of file
diff --git a/doc/manual/figs/Gadgetron.png b/doc/manual/figs/Gadgetron.png
new file mode 100644
index 0000000..19d3f07
Binary files /dev/null and b/doc/manual/figs/Gadgetron.png differ
diff --git a/doc/manual/figs/Gadgetron.svg b/doc/manual/figs/Gadgetron.svg
new file mode 100644
index 0000000..82bbf65
--- /dev/null
+++ b/doc/manual/figs/Gadgetron.svg
@@ -0,0 +1,1736 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 15.0.2, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 width="595.28px" height="841.89px" viewBox="0 0 595.28 841.89" enable-background="new 0 0 595.28 841.89" xml:space="preserve">
+<g>
+	<g>
+		<g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M218.69,432.847l-1.143,3.861c1.968-0.576,3.913-2.162,4.833-4.514
+						c-0.895-0.346-1.805-0.65-2.721-0.934C219.465,431.891,219.113,432.419,218.69,432.847z"/>
+					<path d="M219.659,431.261c-0.174,0.564-0.511,1.036-0.919,1.419l-0.103,0.348C219.078,432.552,219.444,431.959,219.659,431.261
+						z"/>
+					<path fill="#988A38" d="M219.659,431.261c0.916,0.283,1.826,0.588,2.721,0.934c0.919-2.346,0.336-4.944-1.525-6.658
+						l-1.143,3.861C219.855,429.995,219.853,430.631,219.659,431.261z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M222.335,432.178c0.157-0.41-0.041-0.877-0.449-1.02c-0.408-0.144-0.854,0.079-0.994,0.498
+								c-0.505,1.473-1.582,2.54-2.779,3.137l-0.55,1.859C219.508,436.073,221.43,434.5,222.335,432.178z"/>
+							<path fill="#988A38" d="M222.176,432.387c0.147-0.364-0.027-0.782-0.386-0.911c-0.359-0.132-0.752,0.063-0.887,0.434
+								c-0.535,1.436-1.638,2.462-2.833,3.034l-0.491,1.658C219.435,436.048,221.283,434.571,222.176,432.387z"/>
+							<path fill="#988A38" d="M222.016,432.597c0.136-0.317-0.011-0.689-0.324-0.806c-0.31-0.119-0.653,0.05-0.78,0.371
+								c-0.566,1.4-1.696,2.384-2.887,2.932l-0.432,1.46C219.362,436.019,221.133,434.644,222.016,432.597z"/>
+							<path fill="#988A38" d="M221.86,432.804c0.122-0.27,0-0.592-0.265-0.697c-0.263-0.107-0.557,0.034-0.676,0.308
+								c-0.6,1.368-1.75,2.306-2.938,2.831l-0.373,1.258C219.289,435.992,220.983,434.716,221.86,432.804z"/>
+							<path fill="#988A38" d="M221.702,433.014c0.11-0.224,0.014-0.497-0.205-0.59c-0.218-0.096-0.465,0.019-0.571,0.243
+								c-0.641,1.333-1.804,2.227-2.991,2.729l-0.312,1.056C219.216,435.96,220.825,434.791,221.702,433.014z"/>
+							<path fill="#988A38" d="M221.545,433.223c0.095-0.177,0.027-0.402-0.146-0.485s-0.374,0.003-0.468,0.181
+								c-0.687,1.298-1.857,2.15-3.041,2.63l-0.253,0.854C219.143,435.931,220.662,434.865,221.545,433.223z"/>
+							<path fill="#FEE676" d="M221.391,433.434c0.079-0.131,0.038-0.309-0.089-0.38c-0.127-0.072-0.285-0.019-0.365,0.116
+								c-0.745,1.252-1.909,2.073-3.092,2.53l-0.193,0.651C219.071,435.898,220.487,434.934,221.391,433.434z"/>
+							<path fill="#FEE676" d="M221.236,433.646c0.063-0.091,0.049-0.214-0.033-0.275c-0.083-0.062-0.2-0.038-0.262,0.051
+								c-0.828,1.188-1.96,1.997-3.14,2.429l-0.134,0.453C218.997,435.87,220.289,434.981,221.236,433.646z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M219.754,431.215c-0.096,0.312,0.066,0.641,0.362,0.74c0.294,0.096,0.621-0.079,0.726-0.39
+								c0.483-1.417,0.254-2.914-0.564-4.086l-0.5,1.688C219.966,429.819,219.969,430.522,219.754,431.215z"/>
+							<path fill="#988A38" d="M219.825,431.08c-0.08,0.28,0.065,0.57,0.33,0.657c0.265,0.08,0.555-0.077,0.644-0.354
+								c0.424-1.32,0.195-2.704-0.552-3.798l-0.446,1.507C219.992,429.725,220.018,430.408,219.825,431.08z"/>
+							<path fill="#988A38" d="M219.896,430.946c-0.064,0.246,0.064,0.499,0.298,0.571c0.233,0.068,0.487-0.073,0.56-0.318
+								c0.365-1.219,0.136-2.49-0.539-3.507l-0.393,1.326C220.019,429.628,220.068,430.291,219.896,430.946z"/>
+							<path fill="#988A38" d="M219.967,430.812c-0.049,0.212,0.062,0.428,0.266,0.485c0.203,0.057,0.42-0.068,0.478-0.281
+								c0.302-1.123,0.079-2.281-0.526-3.217l-0.338,1.144C220.045,429.534,220.116,430.175,219.967,430.812z"/>
+							<path fill="#988A38" d="M220.038,430.679c-0.035,0.178,0.062,0.356,0.234,0.398c0.17,0.044,0.351-0.065,0.393-0.243
+								c0.241-1.024,0.023-2.072-0.512-2.927l-0.285,0.963C220.071,429.438,220.164,430.059,220.038,430.679z"/>
+							<path fill="#988A38" d="M220.109,430.546c-0.022,0.143,0.059,0.284,0.201,0.312c0.138,0.03,0.281-0.062,0.309-0.206
+								c0.177-0.929-0.032-1.86-0.498-2.639l-0.231,0.783C220.097,429.346,220.207,429.939,220.109,430.546z"/>
+							<path fill="#FEE676" d="M220.182,430.41c-0.012,0.109,0.056,0.215,0.164,0.23c0.108,0.014,0.211-0.063,0.226-0.175
+								c0.106-0.832-0.087-1.646-0.482-2.346l-0.178,0.602C220.123,429.248,220.245,429.82,220.182,430.41z"/>
+							<path fill="#FEE676" d="M220.252,430.276c-0.002,0.077,0.057,0.139,0.131,0.144c0.076,0.001,0.138-0.061,0.14-0.136
+								c0.022-0.738-0.14-1.435-0.466-2.058l-0.125,0.42C220.149,429.153,220.266,429.701,220.252,430.276z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M222.379,432.194c0.96-2.453,0.446-5.135-1.455-6.896l-0.142,0.48C222.603,427.447,223.255,429.963,222.379,432.194z"
+							/>
+					</g>
+					<path d="M222.787,430.4c0.089-0.818,0.031-1.622-0.193-2.39c-0.105,1.247-1.66,1.084-2.672,0.675l-0.205,0.69
+						C220.752,429.676,221.774,430.025,222.787,430.4z"/>
+					<path fill="#988A38" d="M232.737,440.408c0.497,0.528,1.378,0.584,2.042,0.072c0.649-0.5,0.754-1.476,0.163-2.106
+						c-2.071-2.237-4.537-3.999-7.133-5.42c-2.604-1.417-5.353-2.505-8.162-3.339l-0.852,2.877
+						C224.027,434.056,229.126,436.544,232.737,440.408z"/>
+					<path d="M234.758,438.543c0.488,0.518,0.563,1.521,0.02,1.938c0.753-0.582,1.04-1.53,0.347-2.276
+						c-2.095-2.263-4.583-4.039-7.197-5.47c-2.623-1.428-5.387-2.521-8.211-3.358l-0.142,0.479c2.794,0.829,5.527,1.911,8.113,3.319
+						C230.267,434.584,232.711,436.332,234.758,438.543z"/>
+					<g>
+						<path fill="#988A38" d="M233.001,440.401c0.27,0.288,0.743,0.309,1.084,0.027c0.34-0.28,0.397-0.784,0.102-1.101
+							c-3.967-4.308-9.383-6.908-14.881-8.562l-0.454,1.533C224.163,433.889,229.351,436.422,233.001,440.401z"/>
+						<path fill="#988A38" d="M232.769,440.058c0.237,0.261,0.67,0.263,0.967,0.004s0.329-0.701,0.07-0.985
+							c-3.931-4.086-9.183-6.594-14.532-8.201l-0.405,1.369C224.055,433.795,229.109,436.243,232.769,440.058z"/>
+						<path fill="#988A38" d="M232.521,439.724c0.221,0.221,0.603,0.213,0.855-0.02c0.259-0.238,0.278-0.627,0.041-0.863
+							c-3.878-3.885-8.976-6.298-14.175-7.858l-0.357,1.206C223.944,433.702,228.874,436.062,232.521,439.724z"/>
+						<path fill="#988A38" d="M232.271,439.387c0.193,0.192,0.519,0.181,0.735-0.028c0.215-0.209,0.227-0.546,0.022-0.75
+							c-3.833-3.676-8.765-6.009-13.818-7.519l-0.308,1.041C223.838,433.602,228.629,435.89,232.271,439.387z"/>
+						<path fill="#988A38" d="M232.008,439.059c0.167,0.157,0.447,0.14,0.624-0.041s0.177-0.462,0.001-0.627
+							c-3.767-3.492-8.55-5.729-13.455-7.193l-0.26,0.877C223.727,433.506,228.397,435.706,232.008,439.059z"/>
+						<path fill="#FEE676" d="M231.743,438.73c0.139,0.126,0.362,0.11,0.502-0.039c0.139-0.149,0.14-0.382-0.005-0.513
+							c-3.71-3.299-8.335-5.457-13.094-6.873l-0.21,0.712C223.617,433.409,228.153,435.536,231.743,438.73z"/>
+						<path fill="#FEE676" d="M231.466,438.412c0.108,0.094,0.282,0.077,0.387-0.041s0.099-0.294-0.013-0.391
+							c-3.639-3.127-8.117-5.193-12.726-6.567l-0.162,0.548C223.504,433.316,227.916,435.36,231.466,438.412z"/>
+						<path fill="#FEE676" d="M231.185,438.091c0.079,0.064,0.198,0.053,0.27-0.032c0.071-0.083,0.067-0.209-0.014-0.274
+							c-3.566-2.957-7.899-4.931-12.359-6.263l-0.113,0.383C223.39,433.225,227.678,435.182,231.185,438.091z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M215.965,430.212c-0.176,0.813,0.292,1.598,1.035,1.789c0.603,0.149,1.2,0.315,1.795,0.491l0.852-2.877
+						c-0.634-0.187-1.271-0.364-1.912-0.523C216.936,428.887,216.139,429.407,215.965,430.212z"/>
+					<g>
+						<path fill="#988A38" d="M216.253,430.792c-0.099,0.434,0.149,0.853,0.548,0.953c0.687,0.171,1.371,0.354,2.049,0.556
+							l0.454-1.535c-0.702-0.209-1.409-0.398-2.12-0.575C216.77,430.086,216.351,430.363,216.253,430.792z"/>
+						<path fill="#988A38" d="M216.549,430.888c-0.09,0.383,0.129,0.762,0.486,0.854c0.615,0.152,1.226,0.322,1.833,0.501
+							l0.405-1.368c-0.626-0.185-1.255-0.359-1.889-0.517C217.015,430.262,216.64,430.503,216.549,430.888z"/>
+						<path fill="#988A38" d="M216.846,430.984c-0.081,0.337,0.108,0.677,0.423,0.757c0.542,0.136,1.08,0.288,1.616,0.446
+							l0.357-1.206c-0.551-0.162-1.103-0.318-1.66-0.458C217.257,430.44,216.928,430.645,216.846,430.984z"/>
+						<path fill="#988A38" d="M217.139,431.086c-0.072,0.292,0.091,0.586,0.364,0.655c0.467,0.125,0.935,0.251,1.398,0.391
+							l0.308-1.042c-0.474-0.144-0.953-0.271-1.431-0.4C217.499,430.619,217.212,430.794,217.139,431.086z"/>
+						<path fill="#988A38" d="M217.434,431.187c-0.063,0.247,0.074,0.493,0.303,0.555l1.181,0.334l0.26-0.878l-1.204-0.341
+							C217.739,430.793,217.497,430.942,217.434,431.187z"/>
+						<path fill="#FEE676" d="M217.728,431.29c-0.052,0.197,0.057,0.401,0.244,0.454l0.963,0.273l0.211-0.713l-0.979-0.278
+							C217.976,430.974,217.78,431.089,217.728,431.29z"/>
+						<path fill="#FEE676" d="M218.019,431.396c-0.042,0.156,0.042,0.312,0.187,0.353l0.746,0.214l0.162-0.548l-0.754-0.216
+							C218.212,431.157,218.061,431.241,218.019,431.396z"/>
+						<path fill="#FEE676" d="M218.311,431.504c-0.03,0.108,0.029,0.219,0.131,0.247l0.527,0.153l0.114-0.384l-0.532-0.154
+							C218.447,431.338,218.34,431.396,218.311,431.504z"/>
+					</g>
+					<path d="M215.965,430.212c0.145-0.67,1.048-1.051,1.708-0.878c0.637,0.158,1.271,0.335,1.902,0.521l0.142-0.479
+						c-0.637-0.188-1.277-0.366-1.921-0.525C216.859,428.612,216.169,429.273,215.965,430.212z"/>
+					<path fill="#988A38" d="M207.598,422.53c-1.971-0.097-3.775,0.661-4.977,1.902c-1.212,1.239-1.817,2.957-1.675,4.767
+						c0.144,1.823,0.95,3.403,2.029,4.561c1.088,1.154,2.452,1.881,3.974,1.955c2.662,0.128,5.321,0.496,7.897,1.087
+						c0.91,0.205,1.817,0.158,2.701-0.093l1.143-3.862c-0.86,0.859-2.025,1.292-3.172,1.031c-2.755-0.632-5.585-1.022-8.421-1.159
+						c-1.747-0.084-3.216-1.671-3.275-3.65c-0.06-1.987,1.573-3.64,3.629-3.542c3.255,0.158,6.492,0.603,9.68,1.333
+						c1.336,0.305,2.293,1.333,2.58,2.539l1.144-3.862c-0.816-0.756-1.862-1.331-3.052-1.601
+						C214.434,423.164,211.027,422.697,207.598,422.53z"/>
+					<path fill="#988A38" d="M207.516,424.185c-2.859-0.141-5.075,2.167-4.916,4.85c0.014,0.328,0.277,0.581,0.588,0.571
+						c0.311-0.007,0.561-0.279,0.555-0.606c-0.065-2.031,1.605-3.726,3.715-3.625c3.264,0.158,6.509,0.604,9.707,1.337
+						c1.317,0.3,2.283,1.285,2.614,2.455l0.499-1.686c-0.638-0.926-1.627-1.653-2.847-1.93
+						C214.163,424.802,210.85,424.347,207.516,424.185z"/>
+					<path fill="#988A38" d="M207.513,424.273c-2.756-0.134-4.915,2.028-4.82,4.596c0.007,0.293,0.241,0.52,0.52,0.518
+						c0.278-0.005,0.499-0.245,0.499-0.537c-0.017-1.986,1.655-3.614,3.749-3.515c3.266,0.159,6.514,0.605,9.712,1.338
+						c1.316,0.3,2.28,1.266,2.628,2.421l0.446-1.507c-0.621-0.936-1.607-1.672-2.835-1.95
+						C214.148,424.89,210.841,424.436,207.513,424.273z"/>
+					<path fill="#988A38" d="M207.507,424.362c-2.65-0.131-4.747,1.895-4.72,4.341c-0.001,0.258,0.205,0.458,0.449,0.459
+						c0.244,0.002,0.438-0.206,0.445-0.459c0.033-1.946,1.705-3.504,3.78-3.404c3.269,0.158,6.518,0.605,9.72,1.339
+						c1.316,0.299,2.277,1.242,2.642,2.381l0.392-1.325c-0.605-0.945-1.585-1.689-2.824-1.97
+						C214.133,424.978,210.831,424.523,207.507,424.362z"/>
+					<path fill="#988A38" d="M207.503,424.452c-2.546-0.126-4.577,1.761-4.622,4.087c-0.006,0.221,0.168,0.396,0.379,0.402
+						s0.379-0.17,0.39-0.388c0.085-1.903,1.754-3.393,3.813-3.294c3.271,0.159,6.522,0.606,9.727,1.34
+						c1.315,0.3,2.272,1.225,2.654,2.346l0.339-1.146c-0.588-0.953-1.565-1.704-2.812-1.987
+						C214.119,425.066,210.821,424.613,207.503,424.452z"/>
+					<path fill="#988A38" d="M207.499,424.543c-2.444-0.121-4.399,1.626-4.522,3.832c-0.013,0.184,0.132,0.336,0.308,0.346
+						c0.177,0.007,0.319-0.135,0.336-0.313c0.146-1.867,1.802-3.281,3.845-3.184c3.273,0.159,6.527,0.606,9.733,1.341
+						c1.314,0.299,2.27,1.201,2.668,2.305l0.285-0.963c-0.57-0.964-1.544-1.722-2.801-2.007
+						C214.104,425.156,210.811,424.704,207.499,424.543z"/>
+					<path fill="#988A38" d="M207.494,424.632c-2.341-0.116-4.212,1.491-4.421,3.579c-0.017,0.146,0.096,0.275,0.237,0.288
+						c0.143,0.013,0.262-0.099,0.279-0.241c0.217-1.826,1.854-3.17,3.878-3.073c3.275,0.159,6.531,0.606,9.739,1.342
+						c1.314,0.3,2.266,1.182,2.682,2.27l0.232-0.783c-0.554-0.975-1.524-1.739-2.79-2.026
+						C214.089,425.244,210.802,424.792,207.494,424.632z"/>
+					<path fill="#FEE676" d="M207.49,424.721c-2.239-0.109-3.999,1.359-4.32,3.326c-0.02,0.111,0.059,0.215,0.165,0.23
+						c0.108,0.016,0.204-0.061,0.223-0.169c0.311-1.781,1.904-3.058,3.911-2.963c3.278,0.16,6.536,0.607,9.747,1.343
+						c1.314,0.3,2.262,1.161,2.695,2.232l0.178-0.602c-0.538-0.982-1.503-1.756-2.778-2.046
+						C214.075,425.332,210.792,424.881,207.49,424.721z"/>
+					<path fill="#FEE676" d="M207.485,424.811c-2.136-0.103-3.761,1.236-4.22,3.074c-0.021,0.076,0.022,0.15,0.094,0.171
+						c0.073,0.02,0.146-0.024,0.166-0.095c0.441-1.728,1.955-2.945,3.945-2.853c3.28,0.159,6.54,0.607,9.753,1.344
+						c1.313,0.299,2.26,1.14,2.709,2.193l0.124-0.418c-0.52-0.995-1.482-1.774-2.766-2.066
+						C214.06,425.421,210.783,424.971,207.485,424.811z"/>
+					<path d="M219.186,428.247c0.242,0.342,0.419,0.717,0.519,1.125l0.012,0.003l0.204-0.689c-0.309-0.133-0.573-0.284-0.726-0.449
+						L219.186,428.247z"/>
+					<path fill="#988A38" d="M207.03,434.066c-2.325-0.119-4.339-2.232-4.503-4.959c-0.018-0.438-0.383-0.789-0.814-0.76
+						c-0.431,0.028-0.753,0.408-0.719,0.848c0.14,1.807,0.941,3.376,2.012,4.525c1.081,1.146,2.436,1.869,3.946,1.942
+						c2.665,0.129,5.327,0.497,7.906,1.088c0.911,0.206,1.82,0.159,2.705-0.098l0.551-1.861c-0.916,0.449-1.907,0.625-2.898,0.401
+						C212.541,434.58,209.788,434.199,207.03,434.066z"/>
+					<path fill="#988A38" d="M207.024,434.194c-2.302-0.12-4.348-2.16-4.578-4.847c-0.025-0.392-0.349-0.701-0.73-0.668
+						c-0.381,0.028-0.666,0.37-0.627,0.763c0.167,1.733,0.964,3.234,2.018,4.328c1.063,1.094,2.387,1.78,3.847,1.852
+						c2.667,0.129,5.332,0.497,7.913,1.089c0.911,0.205,1.822,0.155,2.711-0.108l0.491-1.66c-0.916,0.434-1.9,0.597-2.883,0.375
+						C212.52,434.706,209.775,434.327,207.024,434.194z"/>
+					<path fill="#988A38" d="M207.018,434.323c-2.278-0.12-4.352-2.087-4.651-4.735c-0.032-0.343-0.314-0.609-0.645-0.577
+						c-0.332,0.03-0.581,0.335-0.539,0.679c0.196,1.66,0.991,3.093,2.027,4.129c1.044,1.042,2.336,1.691,3.747,1.761
+						c2.67,0.128,5.337,0.497,7.921,1.09c0.91,0.205,1.825,0.15,2.716-0.117l0.432-1.458c-0.916,0.416-1.893,0.569-2.867,0.35
+						C212.499,434.834,209.76,434.455,207.018,434.323z"/>
+					<path fill="#988A38" d="M207.011,434.451c-2.255-0.119-4.349-2.011-4.723-4.625c-0.037-0.296-0.279-0.517-0.562-0.484
+						c-0.283,0.033-0.494,0.299-0.453,0.594c0.229,1.59,1.021,2.952,2.039,3.935c1.026,0.984,2.285,1.599,3.646,1.666
+						c2.672,0.128,5.342,0.497,7.928,1.091c0.911,0.205,1.827,0.146,2.722-0.125l0.372-1.257c-0.915,0.397-1.887,0.541-2.851,0.323
+						C212.478,434.961,209.747,434.583,207.011,434.451z"/>
+					<path fill="#988A38" d="M207.005,434.58c-2.233-0.119-4.339-1.933-4.793-4.515c-0.04-0.248-0.245-0.427-0.481-0.394
+						c-0.236,0.033-0.407,0.265-0.365,0.51c0.264,1.521,1.052,2.812,2.052,3.742c1.006,0.927,2.231,1.507,3.542,1.57
+						c2.674,0.129,5.347,0.498,7.935,1.092c0.911,0.205,1.83,0.144,2.728-0.135l0.312-1.055c-0.915,0.38-1.879,0.513-2.834,0.298
+						C212.457,435.088,209.732,434.711,207.005,434.58z"/>
+					<path fill="#988A38" d="M206.999,434.708c-2.21-0.119-4.313-1.858-4.864-4.401c-0.04-0.201-0.21-0.337-0.398-0.302
+						c-0.19,0.034-0.321,0.225-0.279,0.425c0.656,2.895,3.021,4.887,5.504,5.021c2.677,0.129,5.352,0.499,7.943,1.093
+						c0.911,0.206,1.832,0.141,2.732-0.142l0.253-0.854c-0.916,0.365-1.873,0.485-2.818,0.271
+						C212.436,435.214,209.719,434.839,206.999,434.708z"/>
+					<path fill="#FEE676" d="M206.992,434.836c-2.188-0.117-4.263-1.792-4.934-4.29c-0.037-0.148-0.175-0.247-0.317-0.211
+						c-0.143,0.037-0.233,0.189-0.193,0.341c0.76,2.752,3.028,4.603,5.416,4.731c2.68,0.129,5.357,0.499,7.951,1.094
+						c0.91,0.205,1.834,0.136,2.737-0.148l0.193-0.654c-0.915,0.347-1.865,0.457-2.802,0.245
+						C212.415,435.341,209.705,434.967,206.992,434.836z"/>
+					<path fill="#FEE676" d="M206.986,434.964c-2.167-0.113-4.159-1.748-5.002-4.178c-0.035-0.104-0.141-0.157-0.238-0.119
+						c-0.096,0.038-0.145,0.151-0.109,0.256c0.919,2.587,3.036,4.32,5.33,4.441c2.682,0.129,5.361,0.5,7.958,1.095
+						c0.911,0.205,1.836,0.133,2.743-0.158l0.134-0.452c-0.915,0.331-1.859,0.43-2.787,0.221
+						C212.394,435.468,209.691,435.095,206.986,434.964z"/>
+					<path d="M215.574,433.633c-2.771-0.636-5.614-1.028-8.465-1.166c-1.634-0.081-3.231-1.559-3.288-3.398
+						c0.061,2.122,1.404,3.812,3.263,3.898c2.822,0.136,5.637,0.524,8.378,1.153c1.197,0.273,2.324-0.187,3.174-1.094l0.103-0.348
+						C217.87,433.487,216.665,433.879,215.574,433.633z"/>
+					<path d="M204.298,427.239c0.208-0.359,0.48-0.673,0.805-0.938l-0.005-0.013c-0.815,0.464-3.499,0.43-3.182-1.161
+						c-0.427,0.675-0.706,1.436-0.854,2.239C202.141,427.285,203.22,427.253,204.298,427.239z"/>
+					<path d="M207.61,422.28c-2.051-0.1-3.863,0.692-5.048,1.98c-1.195,1.285-1.763,3.062-1.616,4.938
+						c-0.137-1.738,0.508-3.395,1.738-4.59c1.219-1.196,3.017-1.923,4.902-1.83c3.415,0.167,6.808,0.632,10.161,1.4
+						c1.163,0.263,2.211,0.84,3.035,1.6l0.143-0.482c-0.811-0.752-1.854-1.33-3.066-1.604
+						C214.475,422.917,211.054,422.448,207.61,422.28z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M190.138,433.308l1.412,3.771c1.142-1.616,1.797-4.04,1.149-6.485c-0.923,0.26-1.842,0.535-2.742,0.864
+						C190.18,432.08,190.213,432.706,190.138,433.308z"/>
+					<path d="M189.958,431.458c0.201,0.56,0.209,1.131,0.119,1.686l0.127,0.34C190.264,432.833,190.205,432.147,189.958,431.458z"/>
+					<path fill="#988A38" d="M189.958,431.458c0.9-0.329,1.819-0.604,2.742-0.864c-0.641-2.428-2.636-4.207-5.235-4.427l1.413,3.772
+						C189.354,430.32,189.736,430.841,189.958,431.458z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M192.654,430.605c-0.116-0.427-0.547-0.675-0.964-0.554c-0.417,0.116-0.639,0.575-0.504,0.992
+								c0.464,1.479,0.235,2.982-0.335,4.164l0.681,1.817C192.653,435.417,193.297,433.021,192.654,430.605z"/>
+							<path fill="#988A38" d="M192.648,430.869c-0.097-0.384-0.48-0.607-0.849-0.505c-0.371,0.098-0.567,0.501-0.457,0.876
+								c0.42,1.471,0.143,2.954-0.438,4.114l0.607,1.622C192.582,435.435,193.221,433.169,192.648,430.869z"/>
+							<path fill="#988A38" d="M192.643,431.133c-0.078-0.337-0.413-0.539-0.736-0.456c-0.323,0.081-0.497,0.43-0.41,0.762
+								c0.375,1.462,0.053,2.925-0.538,4.063l0.533,1.425C192.509,435.451,193.143,433.316,192.643,431.133z"/>
+							<path fill="#988A38" d="M192.641,431.396c-0.062-0.294-0.348-0.474-0.625-0.408c-0.278,0.064-0.428,0.357-0.361,0.648
+								c0.327,1.457-0.038,2.894-0.638,4.014l0.459,1.228C192.439,435.473,193.064,433.465,192.641,431.396z"/>
+							<path fill="#988A38" d="M192.637,431.657c-0.044-0.246-0.28-0.404-0.513-0.355c-0.233,0.048-0.365,0.284-0.316,0.531
+								c0.275,1.457-0.125,2.865-0.737,3.965l0.386,1.03C192.369,435.497,192.981,433.617,192.637,431.657z"/>
+							<path fill="#988A38" d="M192.635,431.92c-0.029-0.201-0.215-0.337-0.404-0.306c-0.187,0.033-0.301,0.217-0.27,0.418
+								c0.217,1.455-0.213,2.835-0.835,3.913l0.312,0.834C192.298,435.519,192.893,433.77,192.635,431.92z"/>
+							<path fill="#FEE676" d="M192.634,432.183c-0.015-0.155-0.151-0.27-0.295-0.257c-0.145,0.016-0.24,0.151-0.225,0.307
+								c0.143,1.451-0.3,2.804-0.933,3.86l0.239,0.64C192.227,435.539,192.791,433.921,192.634,432.183z"/>
+							<path fill="#FEE676" d="M192.634,432.444c-0.002-0.11-0.086-0.202-0.188-0.204c-0.103-0.003-0.183,0.085-0.18,0.192
+								c0.036,1.446-0.385,2.772-1.03,3.807l0.166,0.442C192.156,435.563,192.659,434.071,192.634,432.444z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M190.008,431.365c0.111,0.309,0.438,0.478,0.73,0.374c0.292-0.099,0.452-0.426,0.352-0.739
+								c-0.454-1.435-1.525-2.496-2.914-2.934l0.617,1.646C189.337,430.112,189.766,430.685,190.008,431.365z"/>
+							<path fill="#988A38" d="M189.984,431.215c0.104,0.274,0.393,0.421,0.655,0.327c0.258-0.095,0.4-0.386,0.307-0.664
+								c-0.444-1.318-1.451-2.29-2.731-2.707l0.551,1.471C189.302,430.021,189.736,430.563,189.984,431.215z"/>
+							<path fill="#988A38" d="M189.961,431.067c0.095,0.236,0.35,0.362,0.579,0.275c0.226-0.089,0.349-0.345,0.261-0.588
+								c-0.434-1.202-1.373-2.08-2.547-2.48l0.484,1.294C189.266,429.93,189.707,430.442,189.961,431.067z"/>
+							<path fill="#988A38" d="M189.937,430.916c0.088,0.202,0.307,0.308,0.503,0.229c0.194-0.08,0.296-0.306,0.215-0.511
+								c-0.425-1.086-1.294-1.875-2.363-2.256l0.419,1.119C189.229,429.838,189.676,430.318,189.937,430.916z"/>
+							<path fill="#988A38" d="M189.915,430.769c0.079,0.164,0.261,0.247,0.425,0.177c0.162-0.069,0.241-0.266,0.169-0.432
+								c-0.416-0.967-1.217-1.673-2.178-2.031l0.352,0.94C189.193,429.745,189.643,430.197,189.915,430.769z"/>
+							<path fill="#988A38" d="M189.891,430.618c0.069,0.131,0.218,0.191,0.348,0.129c0.129-0.061,0.188-0.222,0.124-0.354
+								c-0.408-0.846-1.136-1.47-1.992-1.807l0.287,0.766C189.157,429.654,189.608,430.077,189.891,430.618z"/>
+							<path fill="#FEE676" d="M189.868,430.467c0.058,0.097,0.174,0.137,0.271,0.083c0.096-0.053,0.131-0.178,0.077-0.275
+								c-0.409-0.725-1.056-1.269-1.806-1.584l0.22,0.587C189.121,429.562,189.567,429.958,189.868,430.467z"/>
+							<path fill="#FEE676" d="M189.846,430.317c0.045,0.063,0.129,0.076,0.191,0.034c0.062-0.046,0.076-0.13,0.031-0.194
+								c-0.419-0.595-0.975-1.071-1.619-1.361l0.153,0.409C189.085,429.471,189.512,429.85,189.846,430.317z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M192.7,430.594c-0.671-2.54-2.658-4.427-5.322-4.659l0.176,0.47C190.088,426.613,192.091,428.285,192.7,430.594z"/>
+					</g>
+					<path d="M191.972,428.906c-0.411-0.707-0.936-1.326-1.579-1.812c0.657,1.064-0.702,1.865-1.777,2.149l0.252,0.674
+						C189.892,429.539,190.926,429.195,191.972,428.906z"/>
+					<path fill="#988A38" d="M206.178,430.509c0.768,0.026,1.433-0.613,1.483-1.437c0.051-0.83-0.574-1.534-1.396-1.562
+						c-5.832-0.158-11.748,0.574-17.309,2.642l1.052,2.81C195.106,431.062,200.674,430.355,206.178,430.509z"/>
+					<path d="M206.258,427.76c0.683,0.022,1.447,0.62,1.404,1.312c0.058-0.966-0.427-1.779-1.389-1.812
+						c-5.859-0.159-11.804,0.575-17.404,2.657l0.176,0.469C194.566,428.332,200.454,427.602,206.258,427.76z"/>
+					<g>
+						<path fill="#988A38" d="M206.366,430.315c0.412,0.017,0.764-0.327,0.786-0.77c0.022-0.441-0.307-0.813-0.734-0.831
+							c-5.762-0.176-11.606,0.537-17.041,2.561l0.562,1.499C195.123,430.842,200.78,430.143,206.366,430.315z"/>
+						<path fill="#988A38" d="M205.94,430.244c0.369,0.005,0.681-0.298,0.696-0.693c0.016-0.392-0.281-0.728-0.662-0.733
+							c-5.605-0.126-11.278,0.601-16.557,2.564l0.5,1.337C194.979,430.833,200.488,430.12,205.94,430.244z"/>
+						<path fill="#988A38" d="M205.513,430.178c0.325,0.006,0.597-0.271,0.607-0.617c0.01-0.348-0.253-0.634-0.588-0.641
+							c-5.448-0.075-10.953,0.66-16.077,2.566l0.441,1.177C194.835,430.825,200.197,430.103,205.513,430.178z"/>
+						<path fill="#988A38" d="M205.084,430.113c0.282,0,0.514-0.235,0.52-0.536c0.006-0.299-0.224-0.551-0.512-0.55
+							c-5.292-0.031-10.626,0.718-15.597,2.564l0.381,1.018C194.69,430.819,199.904,430.082,205.084,430.113z"/>
+						<path fill="#988A38" d="M204.656,430.055c0.236,0,0.432-0.206,0.434-0.458c0.002-0.253-0.193-0.458-0.435-0.458
+							c-5.135,0.009-10.303,0.768-15.12,2.558l0.321,0.858C194.545,430.812,199.612,430.062,204.656,430.055z"/>
+						<path fill="#FEE676" d="M204.227,429.994c0.192,0,0.349-0.165,0.348-0.371c0-0.206-0.16-0.372-0.355-0.372
+							c-4.979,0.058-9.98,0.812-14.645,2.551l0.261,0.695C194.398,430.796,199.32,430.05,204.227,429.994z"/>
+						<path fill="#FEE676" d="M203.795,429.944c0.148-0.003,0.269-0.134,0.267-0.292c-0.002-0.156-0.126-0.281-0.277-0.278
+							c-4.822,0.092-9.657,0.854-14.17,2.534l0.2,0.534C194.251,430.79,199.027,430.035,203.795,429.944z"/>
+						<path fill="#FEE676" d="M203.363,429.894c0.104-0.002,0.187-0.093,0.185-0.203s-0.09-0.198-0.195-0.196
+							c-4.666,0.128-9.337,0.895-13.699,2.519l0.14,0.374C194.104,430.782,198.734,430.021,203.363,429.894z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M186.4,432.894c0.375,0.737,1.223,1.063,1.909,0.759c0.56-0.244,1.125-0.477,1.699-0.691l-1.052-2.809
+						c-0.625,0.233-1.245,0.487-1.859,0.756C186.33,431.248,186.024,432.155,186.4,432.894z"/>
+					<g>
+						<path fill="#988A38" d="M186.985,433.162c0.192,0.395,0.646,0.578,1.016,0.412c0.634-0.29,1.283-0.552,1.938-0.8l-0.562-1.499
+							c-0.687,0.259-1.367,0.534-2.035,0.84C186.95,432.292,186.791,432.762,186.985,433.162z"/>
+						<path fill="#988A38" d="M187.275,433.054c0.169,0.356,0.574,0.523,0.904,0.373c0.572-0.251,1.151-0.488,1.739-0.708
+							l-0.5-1.337c-0.612,0.229-1.217,0.478-1.816,0.74C187.253,432.28,187.106,432.697,187.275,433.054z"/>
+						<path fill="#988A38" d="M187.567,432.949c0.146,0.315,0.499,0.462,0.792,0.333c0.508-0.217,1.019-0.426,1.537-0.618
+							l-0.44-1.178c-0.539,0.2-1.069,0.417-1.597,0.643C187.551,432.264,187.421,432.634,187.567,432.949z"/>
+						<path fill="#988A38" d="M187.857,432.847c0.123,0.273,0.428,0.4,0.686,0.296c0.439-0.189,0.885-0.365,1.333-0.533
+							l-0.381-1.018c-0.463,0.174-0.924,0.355-1.378,0.552C187.85,432.252,187.734,432.571,187.857,432.847z"/>
+						<path fill="#988A38" d="M188.149,432.744c0.102,0.233,0.36,0.349,0.577,0.259c0.371-0.162,0.751-0.303,1.129-0.448
+							l-0.321-0.858c-0.389,0.149-0.779,0.295-1.161,0.461C188.149,432.251,188.048,432.512,188.149,432.744z"/>
+						<path fill="#FEE676" d="M188.444,432.65c0.081,0.189,0.289,0.284,0.467,0.21c0.304-0.131,0.617-0.24,0.924-0.362l-0.261-0.696
+							c-0.315,0.124-0.634,0.236-0.945,0.37C188.446,432.248,188.363,432.46,188.444,432.65z"/>
+						<path fill="#FEE676" d="M188.737,432.555c0.062,0.149,0.222,0.222,0.359,0.164l0.718-0.275l-0.2-0.534l-0.73,0.28
+							C188.744,432.248,188.676,432.407,188.737,432.555z"/>
+						<path fill="#FEE676" d="M189.031,432.462c0.042,0.104,0.156,0.158,0.254,0.12l0.508-0.195l-0.14-0.374l-0.514,0.197
+							C189.039,432.248,188.989,432.357,189.031,432.462z"/>
+					</g>
+					<path d="M186.4,432.894c-0.315-0.617,0.163-1.478,0.798-1.758c0.609-0.266,1.224-0.519,1.846-0.75l-0.175-0.469
+						c-0.63,0.235-1.254,0.491-1.873,0.762C186.096,431.08,185.965,432.034,186.4,432.894z"/>
+					<path fill="#988A38" d="M174.619,433.471c-1.589,1.618-2.243,3.866-1.888,5.497c0.29,1.726,1.335,3.212,2.896,4.146
+						c1.564,0.938,3.324,1.198,4.88,1.015c1.624-0.275,2.771-0.636,3.627-1.51c1.472-1.542,3.358-2.847,5.509-3.887
+						c0.762-0.362,1.385-0.938,1.906-1.652l-1.413-3.772c-0.162,1.204-0.793,2.239-1.797,2.723c-2.417,1.169-4.608,2.669-6.369,4.51
+						c-0.552,0.567-1.307,0.852-2.161,0.893c-0.842,0.036-1.805-0.167-2.617-0.73c-0.817-0.568-1.331-1.401-1.425-2.361
+						c-0.096-0.947,0.241-1.997,1.011-2.791c2.45-2.557,5.373-4.526,8.43-6.005c1.287-0.619,2.708-0.392,3.666,0.395l-1.413-3.771
+						c-1.142-0.104-2.379,0.112-3.557,0.676C180.583,428.45,177.356,430.615,174.619,433.471z"/>
+					<path fill="#988A38" d="M175.813,434.619c-1.094,1.131-1.566,2.64-1.404,3.934c0.194,1.263,0.92,2.384,2.044,3.119
+						c0.275,0.18,0.634,0.114,0.808-0.143c0.176-0.255,0.101-0.616-0.167-0.805c-0.836-0.58-1.369-1.429-1.462-2.413
+						c-0.101-0.973,0.248-2.051,1.04-2.868c2.464-2.571,5.402-4.552,8.474-6.036c1.271-0.611,2.672-0.425,3.647,0.306l-0.617-1.646
+						c-1.086-0.349-2.358-0.301-3.548,0.269C181.451,429.871,178.392,431.929,175.813,434.619z"/>
+					<path fill="#988A38" d="M175.878,434.68c-1.054,1.092-1.522,2.528-1.396,3.787c0.163,1.222,0.833,2.307,1.893,3.034
+						c0.242,0.165,0.562,0.107,0.723-0.119c0.159-0.228,0.093-0.547-0.143-0.72c-0.804-0.585-1.307-1.43-1.378-2.405
+						c-0.078-0.965,0.281-2.028,1.068-2.841c2.468-2.575,5.41-4.558,8.484-6.043c1.27-0.611,2.658-0.439,3.637,0.269l-0.551-1.471
+						c-1.078-0.368-2.352-0.329-3.548,0.244C181.498,429.947,178.448,431.998,175.878,434.68z"/>
+					<path fill="#988A38" d="M175.941,434.742c-1.011,1.051-1.48,2.417-1.381,3.635c0.123,1.187,0.745,2.229,1.734,2.95
+						c0.209,0.151,0.491,0.101,0.635-0.096c0.145-0.196,0.089-0.477-0.111-0.63c-0.773-0.594-1.246-1.434-1.295-2.4
+						c-0.056-0.955,0.313-2.004,1.094-2.81c2.472-2.579,5.417-4.564,8.495-6.052c1.27-0.61,2.643-0.456,3.625,0.229l-0.484-1.294
+						c-1.07-0.385-2.343-0.356-3.548,0.221C181.543,430.023,178.502,432.069,175.941,434.742z"/>
+					<path fill="#988A38" d="M176.006,434.805c-0.968,1.01-1.44,2.307-1.367,3.48c0.083,1.152,0.658,2.152,1.577,2.87
+						c0.175,0.133,0.42,0.095,0.548-0.072c0.127-0.168,0.083-0.407-0.086-0.544c-0.742-0.602-1.183-1.437-1.211-2.395
+						c-0.034-0.946,0.345-1.979,1.122-2.779c2.475-2.584,5.425-4.57,8.506-6.061c1.27-0.61,2.628-0.469,3.615,0.192l-0.419-1.119
+						c-1.061-0.402-2.334-0.383-3.548,0.199C181.591,430.101,178.559,432.141,176.006,434.805z"/>
+					<path fill="#988A38" d="M176.071,434.867c-0.926,0.971-1.398,2.195-1.35,3.325c0.041,1.115,0.574,2.075,1.418,2.789
+						c0.141,0.117,0.35,0.089,0.46-0.048c0.109-0.14,0.078-0.338-0.057-0.456c-0.71-0.614-1.12-1.445-1.127-2.393
+						c-0.013-0.936,0.375-1.951,1.147-2.745c2.479-2.587,5.432-4.577,8.517-6.067c1.27-0.61,2.613-0.486,3.604,0.15l-0.353-0.94
+						c-1.051-0.421-2.326-0.41-3.548,0.176C181.638,430.179,178.616,432.213,176.071,434.867z"/>
+					<path fill="#988A38" d="M176.135,434.929c-0.888,0.928-1.349,2.089-1.331,3.166c-0.001,1.077,0.493,1.999,1.258,2.713
+						c0.108,0.1,0.279,0.083,0.372-0.024c0.094-0.108,0.074-0.27-0.031-0.368c-1.38-1.198-1.427-3.498,0.132-5.103
+						c2.482-2.591,5.44-4.583,8.528-6.075c1.27-0.609,2.598-0.5,3.594,0.114l-0.287-0.765c-1.044-0.439-2.319-0.438-3.548,0.151
+						C181.685,430.255,178.671,432.283,176.135,434.929z"/>
+					<path fill="#FEE676" d="M176.2,434.99c-0.851,0.885-1.294,1.978-1.304,3c-0.031,1.031,0.405,1.921,1.09,2.643
+						c0.079,0.081,0.209,0.078,0.284,0.001c0.076-0.079,0.07-0.201-0.006-0.281c-1.289-1.247-1.3-3.478,0.243-5.066
+						c2.486-2.595,5.448-4.59,8.539-6.084c1.27-0.609,2.583-0.515,3.583,0.076l-0.22-0.588c-1.035-0.456-2.312-0.465-3.548,0.128
+						C181.731,430.331,178.727,432.354,176.2,434.99z"/>
+					<path fill="#FEE676" d="M176.265,435.053c-0.813,0.842-1.233,1.862-1.269,2.828c-0.048,0.979,0.313,1.846,0.916,2.578
+						c0.05,0.062,0.135,0.07,0.194,0.023s0.066-0.133,0.021-0.19c-1.167-1.325-1.173-3.457,0.354-5.032
+						c2.49-2.599,5.455-4.596,8.55-6.092c1.269-0.609,2.568-0.529,3.572,0.037l-0.153-0.408c-1.027-0.477-2.304-0.494-3.549,0.103
+						C181.778,430.408,178.783,432.425,176.265,435.053z"/>
+					<path d="M187.75,429.342c0.407,0.128,0.781,0.32,1.107,0.58l0.012-0.005l-0.252-0.673c-0.33,0.088-0.64,0.122-0.866,0.084
+						L187.75,429.342z"/>
+					<path fill="#988A38" d="M182.945,441.476c-0.704,0.729-1.666,1.069-2.833,1.159c-1.183,0.127-2.498-0.114-3.643-0.86
+						c-0.367-0.242-0.865-0.148-1.092,0.221c-0.231,0.366-0.106,0.851,0.273,1.078c1.549,0.931,3.296,1.19,4.842,1.009
+						c1.613-0.275,2.751-0.626,3.604-1.498c1.478-1.547,3.369-2.855,5.524-3.898c0.763-0.362,1.388-0.938,1.907-1.66l-0.681-1.817
+						c-0.44,0.899-1.069,1.631-1.921,2.038C186.63,438.356,184.576,439.77,182.945,441.476z"/>
+					<path fill="#988A38" d="M183.038,441.563c-0.694,0.719-1.641,1.065-2.794,1.179c-1.172,0.149-2.477-0.056-3.625-0.762
+						c-0.334-0.21-0.771-0.125-0.968,0.204c-0.2,0.326-0.087,0.756,0.256,0.953c1.508,0.868,3.194,1.091,4.685,0.893
+						c1.557-0.292,2.644-0.62,3.477-1.476c1.481-1.551,3.376-2.862,5.536-3.906c0.764-0.362,1.389-0.943,1.907-1.672l-0.607-1.622
+						c-0.45,0.887-1.078,1.604-1.922,2.007C186.697,438.467,184.656,439.87,183.038,441.563z"/>
+					<path fill="#988A38" d="M183.13,441.653c-0.683,0.708-1.614,1.062-2.754,1.195c-1.161,0.173-2.455,0.004-3.609-0.664
+						c-0.3-0.175-0.677-0.099-0.844,0.188c-0.169,0.288-0.067,0.666,0.237,0.832c1.468,0.804,3.094,0.989,4.529,0.772
+						c1.5-0.31,2.534-0.613,3.347-1.453c1.485-1.555,3.385-2.869,5.548-3.915c0.764-0.363,1.389-0.949,1.908-1.683l-0.534-1.424
+						c-0.458,0.871-1.086,1.577-1.92,1.974C186.764,438.577,184.736,439.973,183.13,441.653z"/>
+					<path fill="#988A38" d="M183.224,441.743c-0.673,0.698-1.586,1.056-2.712,1.21c-1.149,0.195-2.432,0.061-3.596-0.566
+						c-0.263-0.144-0.58-0.073-0.72,0.175c-0.138,0.249-0.044,0.574,0.22,0.712c1.43,0.737,2.995,0.886,4.376,0.652
+						c0.348-0.057,0.685-0.143,1.003-0.25c0.304-0.069,0.583-0.161,0.844-0.273c0.535-0.214,0.982-0.514,1.368-0.907
+						c1.49-1.56,3.394-2.876,5.561-3.925c0.765-0.362,1.389-0.953,1.908-1.692l-0.459-1.228c-0.468,0.856-1.096,1.55-1.92,1.942
+						C186.832,438.688,184.817,440.075,183.224,441.743z"/>
+					<path fill="#988A38" d="M183.316,441.832c-0.663,0.688-1.557,1.051-2.668,1.225c-1.136,0.216-2.408,0.115-3.584-0.469
+						c-0.224-0.112-0.486-0.048-0.598,0.162c-0.11,0.211-0.022,0.482,0.203,0.592c1.394,0.669,2.898,0.779,4.226,0.531
+						c0.332-0.056,0.661-0.162,0.956-0.246c0.292-0.07,0.555-0.166,0.809-0.273c0.512-0.215,0.943-0.508,1.316-0.888
+						c1.494-1.563,3.402-2.883,5.573-3.933c0.765-0.363,1.39-0.959,1.907-1.705l-0.386-1.03c-0.477,0.842-1.105,1.523-1.919,1.911
+						C186.899,438.798,184.897,440.177,183.316,441.832z"/>
+					<path fill="#988A38" d="M183.408,441.92c-0.653,0.678-1.527,1.045-2.621,1.234c-1.121,0.236-2.383,0.169-3.57-0.363
+						c-0.185-0.083-0.396-0.025-0.476,0.147c-0.083,0.174-0.004,0.391,0.183,0.473c1.361,0.593,2.804,0.665,4.077,0.404
+						c0.654-0.161,1.202-0.302,1.681-0.515c0.488-0.214,0.904-0.499,1.262-0.865c1.498-1.568,3.411-2.891,5.585-3.941
+						c0.766-0.364,1.391-0.964,1.908-1.715l-0.312-0.834c-0.486,0.829-1.114,1.495-1.919,1.879
+						C186.966,438.907,184.978,440.278,183.408,441.92z"/>
+					<path fill="#FEE676" d="M183.501,442.01c-0.643,0.667-1.497,1.032-2.572,1.233c-1.105,0.254-2.359,0.217-3.561-0.25
+						c-0.143-0.057-0.303-0.002-0.356,0.134c-0.053,0.136,0.018,0.298,0.166,0.354c1.332,0.507,2.715,0.541,3.932,0.269
+						c1.224-0.288,2.105-0.616,2.803-1.344c1.502-1.572,3.419-2.897,5.598-3.95c0.766-0.364,1.391-0.97,1.909-1.723l-0.239-0.64
+						c-0.495,0.814-1.122,1.468-1.918,1.847C187.033,439.018,185.058,440.38,183.501,442.01z"/>
+					<path fill="#FEE676" d="M183.593,442.099c-0.631,0.655-1.464,1.013-2.517,1.214c-1.087,0.264-2.333,0.255-3.557-0.12
+						c-0.104-0.031-0.209,0.022-0.234,0.123c-0.025,0.1,0.039,0.205,0.144,0.237c1.312,0.392,2.634,0.389,3.795,0.11
+						c1.146-0.26,1.991-0.594,2.657-1.287c1.506-1.577,3.428-2.904,5.61-3.96c0.767-0.364,1.392-0.973,1.909-1.734l-0.166-0.442
+						c-0.503,0.801-1.131,1.442-1.917,1.816C187.101,439.128,185.138,440.481,183.593,442.099z"/>
+					<path d="M188.231,435.804c-2.44,1.18-4.656,2.696-6.44,4.562c-0.522,0.534-1.263,0.843-2.094,0.924
+						c-0.817,0.079-1.747-0.063-2.502-0.588c0.869,0.604,1.864,0.867,2.731,0.873c0.876-0.002,1.645-0.263,2.225-0.863
+						c1.736-1.815,3.902-3.3,6.297-4.458c1.045-0.503,1.63-1.534,1.755-2.771l-0.127-0.34
+						C189.874,434.312,189.19,435.342,188.231,435.804z"/>
+					<path d="M176.008,439.233c-0.158-0.388-0.241-0.812-0.237-1.247l-0.014-0.004c-0.196,1.009-1.763,3.144-2.879,1.969
+						c0.3,0.747,0.752,1.415,1.322,2.003C174.771,441.025,175.359,440.131,176.008,439.233z"/>
+					<path d="M174.438,433.298c-0.823,0.849-1.4,1.838-1.72,2.835c-0.345,1.034-0.322,1.902-0.163,2.782
+						c0.337,1.74,1.453,3.229,3.071,4.199c-1.5-0.897-2.474-2.382-2.716-4.096c-0.311-1.623,0.378-3.841,1.887-5.375
+						c2.713-2.83,5.916-4.979,9.217-6.575c1.149-0.549,2.387-0.766,3.537-0.663l-0.176-0.471c-1.137-0.104-2.375,0.108-3.578,0.684
+						C180.453,428.235,177.2,430.417,174.438,433.298z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M172.657,454.348l3.86,1.148c-0.573-1.99-2.157-3.94-4.52-4.846c-0.333,0.9-0.643,1.809-0.928,2.725
+						C171.699,453.569,172.229,453.922,172.657,454.348z"/>
+					<path d="M171.07,453.375c0.565,0.175,1.038,0.514,1.419,0.923l0.349,0.104C172.362,453.957,171.767,453.59,171.07,453.375z"/>
+					<path fill="#988A38" d="M171.07,453.375c0.285-0.916,0.595-1.824,0.928-2.725c-2.347-0.899-4.943-0.32-6.648,1.523l3.86,1.148
+						C169.806,453.178,170.44,453.181,171.07,453.375z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M171.981,450.695c-0.413-0.155-0.868,0.048-1.015,0.454c-0.145,0.406,0.086,0.853,0.504,0.992
+								c1.468,0.5,2.543,1.58,3.132,2.785l1.86,0.553C175.884,453.512,174.314,451.586,171.981,450.695z"/>
+							<path fill="#988A38" d="M172.191,450.854c-0.367-0.146-0.774,0.034-0.908,0.391c-0.135,0.358,0.07,0.753,0.438,0.886
+								c1.436,0.531,2.465,1.639,3.031,2.841l1.659,0.494C175.858,453.586,174.388,451.734,172.191,450.854z"/>
+							<path fill="#988A38" d="M172.402,451.013c-0.319-0.134-0.68,0.019-0.803,0.329c-0.122,0.309,0.056,0.655,0.374,0.78
+								c1.401,0.562,2.387,1.694,2.93,2.894l1.458,0.435C175.829,453.66,174.461,451.886,172.402,451.013z"/>
+							<path fill="#988A38" d="M172.614,451.169c-0.274-0.122-0.588,0.005-0.699,0.269c-0.112,0.263,0.04,0.559,0.312,0.676
+								c1.368,0.597,2.307,1.75,2.829,2.948l1.256,0.373C175.802,453.734,174.535,452.037,172.614,451.169z"/>
+							<path fill="#988A38" d="M172.823,451.326c-0.225-0.108-0.493-0.01-0.59,0.207c-0.099,0.218,0.019,0.465,0.244,0.571
+								c1.336,0.638,2.231,1.806,2.729,3.001l1.055,0.314C175.775,453.811,174.609,452.195,172.823,451.326z"/>
+							<path fill="#988A38" d="M173.034,451.481c-0.179-0.095-0.399-0.022-0.486,0.149c-0.085,0.171,0,0.373,0.181,0.467
+								c1.3,0.684,2.154,1.86,2.628,3.053l0.854,0.254C175.747,453.886,174.68,452.357,173.034,451.481z"/>
+							<path fill="#FEE676" d="M173.246,451.637c-0.134-0.08-0.307-0.036-0.381,0.09c-0.073,0.128-0.021,0.285,0.115,0.365
+								c1.256,0.741,2.078,1.913,2.528,3.104l0.654,0.195C175.718,453.961,174.743,452.531,173.246,451.637z"/>
+							<path fill="#FEE676" d="M173.458,451.791c-0.09-0.063-0.214-0.049-0.277,0.033c-0.063,0.082-0.04,0.2,0.049,0.262
+								c1.191,0.827,2.002,1.966,2.428,3.154l0.453,0.135C175.69,454.036,174.789,452.73,173.458,451.791z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M171.026,453.279c0.313,0.096,0.645-0.065,0.74-0.362c0.092-0.296-0.077-0.62-0.388-0.726
+								c-1.424-0.482-2.91-0.251-4.085,0.561l1.686,0.501C169.628,453.067,170.333,453.063,171.026,453.279z"/>
+							<path fill="#988A38" d="M170.892,453.208c0.281,0.081,0.573-0.063,0.657-0.33c0.078-0.265-0.077-0.554-0.354-0.644
+								c-1.323-0.422-2.7-0.191-3.794,0.549l1.505,0.448C169.534,453.04,170.218,453.015,170.892,453.208z"/>
+							<path fill="#988A38" d="M170.759,453.138c0.246,0.065,0.5-0.062,0.569-0.298c0.066-0.234-0.075-0.488-0.32-0.561
+								c-1.221-0.362-2.487-0.135-3.503,0.536l1.325,0.394C169.439,453.015,170.103,452.966,170.759,453.138z"/>
+							<path fill="#988A38" d="M170.624,453.066c0.214,0.051,0.43-0.062,0.485-0.266c0.055-0.202-0.071-0.42-0.284-0.477
+								c-1.119-0.301-2.279-0.079-3.213,0.522l1.146,0.341C169.343,452.987,169.984,452.917,170.624,453.066z"/>
+							<path fill="#988A38" d="M170.493,452.996c0.178,0.035,0.354-0.061,0.397-0.234c0.043-0.169-0.068-0.351-0.246-0.392
+								c-1.02-0.239-2.073-0.024-2.925,0.509l0.962,0.286C169.248,452.961,169.868,452.871,170.493,452.996z"/>
+							<path fill="#988A38" d="M170.358,452.926c0.146,0.022,0.284-0.06,0.312-0.2c0.029-0.14-0.064-0.282-0.209-0.309
+								c-0.923-0.177-1.862,0.029-2.636,0.493l0.783,0.233C169.153,452.936,169.751,452.826,170.358,452.926z"/>
+							<path fill="#FEE676" d="M170.223,452.854c0.112,0.012,0.213-0.057,0.229-0.166c0.016-0.107-0.063-0.21-0.174-0.225
+								c-0.828-0.104-1.649,0.085-2.346,0.479l0.601,0.179C169.057,452.908,169.631,452.788,170.223,452.854z"/>
+							<path fill="#FEE676" d="M170.09,452.782c0.077,0.002,0.138-0.057,0.142-0.131c0-0.077-0.059-0.138-0.137-0.141
+								c-0.731-0.021-1.439,0.137-2.055,0.463l0.418,0.125C168.963,452.883,169.511,452.768,170.09,452.782z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M171.998,450.65c-2.455-0.94-5.134-0.43-6.887,1.452l0.481,0.144C167.252,450.442,169.766,449.794,171.998,450.65z"/>
+					</g>
+					<path d="M170.205,450.252c-0.812-0.082-1.624-0.025-2.389,0.198c1.246,0.102,1.096,1.65,0.682,2.66l0.689,0.205
+						C169.495,452.285,169.829,451.262,170.205,450.252z"/>
+					<path fill="#988A38" d="M179.742,439.759c0.476-0.554,0.435-1.445-0.149-2.039c-0.576-0.587-1.549-0.585-2.111,0.066
+						c-0.501,0.568-0.966,1.169-1.412,1.784c-0.426,0.603-0.835,1.211-1.23,1.831c-0.788,1.241-1.513,2.519-2.177,3.827
+						c-1.33,2.616-2.403,5.354-3.236,8.158l2.875,0.855c0.784-2.64,1.793-5.21,3.037-7.657c0.621-1.224,1.298-2.417,2.033-3.573
+						c0.367-0.578,0.751-1.147,1.144-1.704C178.903,440.775,179.306,440.254,179.742,439.759z"/>
+					<path d="M177.67,437.95c0.462-0.539,1.442-0.723,1.923-0.23c-0.671-0.681-1.636-0.862-2.299-0.099
+						c-0.507,0.575-0.977,1.183-1.428,1.804c-0.429,0.607-0.84,1.218-1.237,1.843c-0.792,1.247-1.521,2.532-2.189,3.848
+						c-1.336,2.631-2.416,5.383-3.253,8.2l0.479,0.143c0.829-2.79,1.897-5.515,3.22-8.116c0.66-1.302,1.381-2.573,2.165-3.807
+						C175.828,440.316,176.669,439.084,177.67,437.95z"/>
+					<g>
+						<path fill="#988A38" d="M179.709,439.497c0.258-0.299,0.225-0.778-0.092-1.084c-0.319-0.309-0.822-0.306-1.105,0.021
+							c-1.92,2.197-3.402,4.692-4.735,7.254c-1.314,2.574-2.378,5.272-3.199,8.041l1.535,0.457c0.795-2.681,1.823-5.288,3.091-7.771
+							c0.633-1.241,1.324-2.452,2.073-3.625C178.038,441.605,178.814,440.494,179.709,439.497z"/>
+						<path fill="#988A38" d="M179.389,439.766c0.236-0.265,0.191-0.697-0.099-0.964c-0.29-0.266-0.732-0.25-0.987,0.037
+							c-3.53,4.394-6.006,9.55-7.617,14.923l1.368,0.406c0.778-2.614,1.772-5.162,2.998-7.591c0.612-1.215,1.28-2.4,2.003-3.55
+							C177.789,441.874,178.542,440.754,179.389,439.766z"/>
+						<path fill="#988A38" d="M179.083,440.048c0.194-0.245,0.149-0.623-0.11-0.85c-0.264-0.23-0.654-0.21-0.863,0.053
+							c-3.361,4.309-5.756,9.32-7.317,14.542l1.205,0.358C173.508,449.076,175.877,444.155,179.083,440.048z"/>
+						<path fill="#988A38" d="M178.776,440.333c0.172-0.211,0.123-0.537-0.107-0.729c-0.231-0.192-0.565-0.166-0.748,0.058
+							c-3.191,4.228-5.506,9.092-7.02,14.163l1.042,0.31C173.416,449.184,175.7,444.406,178.776,440.333z"/>
+						<path fill="#988A38" d="M178.479,440.628c0.135-0.185,0.086-0.461-0.109-0.615c-0.2-0.159-0.48-0.13-0.623,0.064
+							c-3.039,4.14-5.273,8.856-6.739,13.78l0.878,0.261C173.321,449.291,175.523,444.653,178.479,440.628z"/>
+						<path fill="#FEE676" d="M178.182,440.926c0.111-0.149,0.07-0.37-0.095-0.494c-0.165-0.123-0.392-0.096-0.507,0.061
+							c-2.896,4.05-5.044,8.622-6.464,13.396l0.712,0.212C173.226,449.398,175.342,444.898,178.182,440.926z"/>
+						<path fill="#FEE676" d="M177.889,441.238c0.084-0.122,0.048-0.296-0.08-0.387c-0.128-0.091-0.302-0.063-0.388,0.061
+							c-2.764,3.952-4.821,8.385-6.196,13.01l0.546,0.162C173.131,449.513,175.163,445.136,177.889,441.238z"/>
+						<path fill="#FEE676" d="M177.597,441.557c0.059-0.085,0.033-0.205-0.058-0.268s-0.213-0.044-0.273,0.043
+							c-2.629,3.853-4.606,8.144-5.935,12.621l0.383,0.114C173.032,449.625,174.992,445.373,177.597,441.557z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M170.012,457.067c0.812,0.18,1.599-0.285,1.793-1.028c0.153-0.603,0.318-1.201,0.498-1.796
+						l-2.875-0.855c-0.19,0.632-0.366,1.268-0.528,1.906C168.691,456.091,169.208,456.89,170.012,457.067z"/>
+					<g>
+						<path fill="#988A38" d="M170.593,456.782c0.433,0.101,0.853-0.146,0.956-0.545c0.174-0.688,0.357-1.373,0.562-2.052
+							l-1.535-0.456c-0.212,0.701-0.401,1.408-0.581,2.118C169.889,456.262,170.165,456.683,170.593,456.782z"/>
+						<path fill="#988A38" d="M170.69,456.486c0.382,0.091,0.764-0.127,0.857-0.483c0.155-0.615,0.325-1.227,0.507-1.834
+							l-1.368-0.407c-0.188,0.625-0.362,1.255-0.522,1.888C170.068,456.019,170.305,456.394,170.69,456.486z"/>
+						<path fill="#988A38" d="M170.789,456.189c0.336,0.083,0.677-0.104,0.757-0.421c0.142-0.541,0.29-1.081,0.452-1.616
+							l-1.205-0.358c-0.167,0.549-0.317,1.103-0.463,1.657C170.247,455.776,170.45,456.106,170.789,456.189z"/>
+						<path fill="#988A38" d="M170.891,455.896c0.292,0.073,0.585-0.089,0.655-0.362l0.396-1.398l-1.042-0.31l-0.404,1.429
+							C170.425,455.535,170.6,455.823,170.891,455.896z"/>
+						<path fill="#988A38" d="M170.993,455.602c0.247,0.064,0.492-0.072,0.557-0.301l0.336-1.182l-0.878-0.262l-0.343,1.204
+							C170.599,455.295,170.749,455.538,170.993,455.602z"/>
+						<path fill="#FEE676" d="M171.098,455.309c0.199,0.053,0.403-0.055,0.455-0.243l0.276-0.964l-0.713-0.212l-0.28,0.979
+							C170.783,455.06,170.897,455.256,171.098,455.309z"/>
+						<path fill="#FEE676" d="M171.204,455.018c0.156,0.042,0.312-0.042,0.353-0.187l0.216-0.746l-0.547-0.163l-0.218,0.755
+							C170.966,454.823,171.05,454.975,171.204,455.018z"/>
+						<path fill="#FEE676" d="M171.312,454.726c0.108,0.03,0.219-0.028,0.248-0.131l0.155-0.527l-0.383-0.113l-0.157,0.531
+							C171.146,454.589,171.204,454.695,171.312,454.726z"/>
+					</g>
+					<path d="M170.012,457.067c-0.67-0.147-1.046-1.052-0.871-1.711c0.161-0.637,0.336-1.27,0.525-1.898l-0.479-0.143
+						c-0.191,0.636-0.368,1.274-0.53,1.917C168.416,456.166,169.075,456.859,170.012,457.067z"/>
+					<path fill="#988A38" d="M162.271,465.363c-0.114,1.959,0.618,3.755,1.841,4.966c1.222,1.221,2.918,1.857,4.737,1.753
+						c1.82-0.104,3.423-0.884,4.595-1.955c1.17-1.08,1.917-2.453,2.005-3.985c0.163-2.679,0.536-5.339,1.147-7.926
+						c0.212-0.912,0.167-1.829-0.079-2.719l-3.861-1.149c0.855,0.862,1.289,2.036,1.019,3.184c-0.651,2.76-1.048,5.589-1.221,8.434
+						c-0.103,1.754-1.705,3.211-3.689,3.239c-1.984,0.029-3.618-1.616-3.499-3.664c0.197-3.243,0.649-6.477,1.399-9.652
+						c0.313-1.331,1.336-2.282,2.545-2.565l-3.86-1.148c-0.758,0.808-1.33,1.846-1.605,3.028
+						C162.954,458.553,162.478,461.954,162.271,465.363z"/>
+					<path fill="#988A38" d="M163.925,465.461c-0.17,2.843,2.108,5.072,4.785,4.964c0.328-0.009,0.59-0.269,0.584-0.58
+						c-0.008-0.311-0.272-0.564-0.601-0.563c-2.028,0.033-3.703-1.65-3.581-3.75c0.197-3.252,0.651-6.494,1.402-9.679
+						c0.308-1.312,1.289-2.271,2.462-2.599l-1.686-0.502c-0.928,0.633-1.652,1.615-1.936,2.829
+						C164.59,458.835,164.126,462.144,163.925,465.461z"/>
+					<path fill="#988A38" d="M164.014,465.465c-0.163,2.741,1.975,4.912,4.535,4.864c0.292-0.002,0.527-0.232,0.526-0.512
+						c-0.005-0.278-0.238-0.502-0.531-0.507c-1.982-0.015-3.59-1.698-3.469-3.783c0.197-3.253,0.652-6.497,1.403-9.684
+						c0.308-1.312,1.269-2.268,2.427-2.612l-1.505-0.448c-0.938,0.616-1.67,1.596-1.956,2.817
+						C164.678,458.85,164.215,462.153,164.014,465.465z"/>
+					<path fill="#988A38" d="M164.103,465.472c-0.156,2.636,1.84,4.744,4.282,4.761c0.256,0.005,0.466-0.197,0.466-0.442
+						c0.001-0.243-0.2-0.441-0.454-0.451c-1.941-0.063-3.479-1.746-3.359-3.812c0.197-3.255,0.652-6.502,1.404-9.691
+						c0.308-1.31,1.246-2.265,2.388-2.625l-1.325-0.395c-0.947,0.6-1.687,1.573-1.975,2.806
+						C164.765,458.865,164.303,462.164,164.103,465.472z"/>
+					<path fill="#988A38" d="M164.193,465.477c-0.148,2.533,1.703,4.574,4.03,4.659c0.219,0.01,0.403-0.162,0.408-0.372
+						c0.005-0.211-0.166-0.382-0.384-0.396c-1.897-0.114-3.367-1.793-3.247-3.843c0.198-3.258,0.652-6.506,1.405-9.698
+						c0.308-1.31,1.228-2.261,2.352-2.639l-1.145-0.34c-0.957,0.582-1.703,1.553-1.994,2.794
+						C164.854,458.881,164.393,462.175,164.193,465.477z"/>
+					<path fill="#988A38" d="M164.283,465.482c-0.141,2.432,1.566,4.396,3.778,4.556c0.184,0.015,0.339-0.127,0.348-0.303
+						c0.006-0.177-0.13-0.32-0.309-0.34c-1.86-0.175-3.256-1.84-3.138-3.873c0.198-3.26,0.653-6.511,1.406-9.704
+						c0.308-1.31,1.205-2.259,2.312-2.653l-0.963-0.286c-0.965,0.564-1.719,1.532-2.012,2.783
+						C164.943,458.896,164.483,462.185,164.283,465.482z"/>
+					<path fill="#988A38" d="M164.372,465.487c-0.135,2.33,1.436,4.208,3.528,4.452c0.146,0.02,0.275-0.092,0.289-0.232
+						c0.012-0.143-0.096-0.263-0.238-0.282c-1.818-0.245-3.143-1.889-3.026-3.905c0.198-3.262,0.653-6.514,1.407-9.71
+						c0.308-1.309,1.185-2.255,2.276-2.666l-0.783-0.232c-0.977,0.548-1.737,1.512-2.033,2.771
+						C165.031,458.911,164.571,462.195,164.372,465.487z"/>
+					<path fill="#FEE676" d="M164.461,465.492c-0.129,2.229,1.31,3.995,3.274,4.348c0.112,0.022,0.215-0.055,0.231-0.161
+						c0.015-0.107-0.059-0.204-0.166-0.225c-1.771-0.338-3.031-1.938-2.915-3.937c0.198-3.264,0.654-6.519,1.408-9.717
+						c0.308-1.309,1.165-2.251,2.239-2.68l-0.601-0.179c-0.985,0.532-1.754,1.492-2.052,2.761
+						C165.12,458.926,164.66,462.205,164.461,465.492z"/>
+					<path fill="#FEE676" d="M164.551,465.498c-0.123,2.127,1.193,3.757,3.023,4.243c0.076,0.021,0.15-0.02,0.171-0.091
+						c0.02-0.072-0.022-0.146-0.093-0.167c-1.714-0.467-2.918-1.986-2.804-3.968c0.198-3.267,0.654-6.523,1.409-9.724
+						c0.308-1.308,1.144-2.248,2.201-2.693l-0.418-0.124c-0.997,0.515-1.773,1.471-2.073,2.749
+						C165.208,458.941,164.75,462.215,164.551,465.498z"/>
+					<path d="M168.058,453.842c0.34-0.24,0.722-0.414,1.126-0.515l0.003-0.012l-0.689-0.205c-0.129,0.31-0.289,0.569-0.451,0.724
+						L168.058,453.842z"/>
+					<path fill="#988A38" d="M173.802,466.044c-0.145,2.339-2.283,4.342-5.019,4.456c-0.442,0.01-0.787,0.369-0.772,0.8
+						c0.015,0.431,0.397,0.76,0.836,0.734c1.803-0.102,3.395-0.875,4.559-1.939c1.163-1.072,1.905-2.437,1.993-3.956
+						c0.163-2.682,0.537-5.345,1.148-7.935c0.213-0.914,0.166-1.832-0.084-2.724l-1.86-0.554c0.443,0.919,0.62,1.919,0.387,2.913
+						C174.356,460.521,173.97,463.274,173.802,466.044z"/>
+					<path fill="#988A38" d="M173.929,466.051c-0.144,2.315-2.211,4.354-4.908,4.533c-0.394,0.019-0.698,0.336-0.68,0.718
+						c0.017,0.381,0.361,0.673,0.752,0.642c1.733-0.13,3.253-0.902,4.364-1.95c1.108-1.056,1.814-2.388,1.899-3.858
+						c0.164-2.684,0.537-5.349,1.149-7.939c0.213-0.914,0.163-1.835-0.094-2.73l-1.659-0.494c0.427,0.921,0.592,1.913,0.361,2.897
+						C174.482,460.543,174.098,463.288,173.929,466.051z"/>
+					<path fill="#988A38" d="M174.058,466.06c-0.143,2.292-2.14,4.359-4.796,4.608c-0.345,0.026-0.609,0.304-0.588,0.635
+						c0.02,0.333,0.326,0.587,0.668,0.553c1.663-0.162,3.11-0.933,4.166-1.964c1.055-1.039,1.723-2.339,1.805-3.758
+						c0.164-2.687,0.538-5.354,1.15-7.948c0.213-0.914,0.159-1.837-0.102-2.735l-1.458-0.434c0.41,0.92,0.564,1.905,0.336,2.882
+						C174.609,460.564,174.226,463.304,174.058,466.06z"/>
+					<path fill="#988A38" d="M174.187,466.066c-0.142,2.27-2.065,4.359-4.687,4.684c-0.296,0.032-0.517,0.271-0.496,0.555
+						c0.023,0.284,0.292,0.5,0.586,0.465c1.594-0.197,2.969-0.968,3.971-1.981c1-1.021,1.629-2.288,1.709-3.657
+						c0.164-2.688,0.538-5.359,1.152-7.956c0.212-0.913,0.154-1.839-0.111-2.739l-1.256-0.374c0.391,0.92,0.535,1.899,0.31,2.866
+						C174.737,460.587,174.354,463.318,174.187,466.066z"/>
+					<path fill="#988A38" d="M174.315,466.074c-0.141,2.246-1.989,4.352-4.577,4.758c-0.247,0.035-0.429,0.236-0.403,0.475
+						c0.025,0.236,0.258,0.412,0.503,0.375c1.525-0.233,2.832-1.003,3.777-2c0.946-1.003,1.537-2.234,1.614-3.554
+						c0.164-2.69,0.539-5.363,1.153-7.962c0.212-0.914,0.15-1.842-0.121-2.746l-1.055-0.313c0.374,0.92,0.508,1.894,0.284,2.851
+						C174.864,460.608,174.482,463.333,174.315,466.074z"/>
+					<path fill="#988A38" d="M174.442,466.082c-0.139,2.224-1.915,4.328-4.463,4.831c-0.201,0.036-0.342,0.204-0.312,0.394
+						c0.027,0.191,0.219,0.326,0.419,0.288c2.912-0.597,4.941-2.968,5.099-5.469c0.164-2.693,0.539-5.368,1.153-7.97
+						c0.213-0.914,0.147-1.845-0.128-2.751l-0.854-0.254c0.359,0.921,0.479,1.886,0.258,2.835
+						C174.99,460.631,174.609,463.348,174.442,466.082z"/>
+					<path fill="#FEE676" d="M174.571,466.09c-0.137,2.201-1.849,4.279-4.354,4.904c-0.149,0.034-0.251,0.171-0.219,0.314
+						c0.032,0.144,0.184,0.237,0.336,0.199c2.769-0.704,4.655-2.98,4.808-5.385c0.164-2.695,0.54-5.373,1.155-7.978
+						c0.212-0.913,0.143-1.847-0.135-2.755l-0.654-0.194c0.341,0.921,0.451,1.88,0.232,2.819
+						C175.117,460.652,174.737,463.362,174.571,466.09z"/>
+					<path fill="#FEE676" d="M174.699,466.097c-0.133,2.182-1.802,4.177-4.244,4.977c-0.104,0.032-0.161,0.138-0.125,0.236
+						c0.035,0.097,0.147,0.148,0.252,0.115c2.606-0.87,4.373-2.995,4.517-5.304c0.165-2.698,0.54-5.379,1.156-7.985
+						c0.212-0.913,0.14-1.848-0.144-2.76l-0.453-0.135c0.325,0.92,0.423,1.873,0.207,2.804
+						C175.243,460.675,174.865,463.377,174.699,466.097z"/>
+					<path d="M173.431,457.474c-0.655,2.774-1.054,5.617-1.228,8.476c-0.099,1.641-1.593,3.227-3.438,3.255
+						c2.126-0.028,3.831-1.356,3.937-3.225c0.172-2.831,0.567-5.646,1.215-8.392c0.283-1.199-0.179-2.334-1.081-3.187l-0.348-0.104
+						C173.293,455.169,173.687,456.382,173.431,457.474z"/>
+					<path d="M166.943,468.702c-0.36-0.212-0.667-0.487-0.927-0.814l-0.013,0.005c0.46,0.818,0.375,3.491-1.208,3.146
+						c0.666,0.436,1.417,0.728,2.222,0.89C166.964,470.854,166.936,469.778,166.943,468.702z"/>
+					<path d="M162.022,465.349c-0.118,2.039,0.647,3.842,1.917,5.036c1.27,1.205,3.026,1.805,4.911,1.697
+						c-1.747,0.101-3.382-0.574-4.559-1.812c-1.178-1.229-1.879-3.018-1.77-4.892c0.206-3.395,0.68-6.783,1.466-10.118
+						c0.27-1.157,0.843-2.197,1.605-3.014l-0.481-0.143c-0.755,0.801-1.329,1.836-1.61,3.042
+						C162.708,458.51,162.229,461.926,162.022,465.349z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M172.461,483.169l3.883-1.072c-1.537-1.401-3.932-2.215-6.407-1.713
+						c0.193,0.94,0.419,1.873,0.666,2.801C171.239,483.015,171.875,483.033,172.461,483.169z"/>
+					<path d="M170.603,483.185c0.572-0.152,1.151-0.115,1.689,0.031l0.35-0.097C172.005,482.993,171.308,482.997,170.603,483.185z"
+						/>
+					<path fill="#988A38" d="M170.603,483.185c-0.246-0.928-0.472-1.86-0.666-2.801c-2.468,0.5-4.345,2.36-4.825,4.814l3.883-1.072
+						C169.425,483.688,169.966,483.354,170.603,483.185z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M169.947,480.43c-0.431,0.09-0.713,0.506-0.618,0.927c0.097,0.419,0.524,0.676,0.953,0.571
+								c1.513-0.359,2.996-0.014,4.135,0.701l1.871-0.517C174.759,480.732,172.39,479.933,169.947,480.43z"/>
+							<path fill="#988A38" d="M170.209,480.451c-0.387,0.073-0.642,0.444-0.561,0.816c0.081,0.373,0.458,0.6,0.841,0.516
+								c1.5-0.315,2.962,0.078,4.079,0.804l1.669-0.461C174.775,480.811,172.531,480.02,170.209,480.451z"/>
+							<path fill="#988A38" d="M170.472,480.474c-0.34,0.057-0.569,0.38-0.504,0.707c0.066,0.326,0.394,0.525,0.731,0.462
+								c1.485-0.271,2.927,0.166,4.021,0.902l1.466-0.404C174.79,480.891,172.673,480.109,170.472,480.474z"/>
+							<path fill="#988A38" d="M170.733,480.493c-0.296,0.043-0.499,0.318-0.448,0.601c0.051,0.28,0.33,0.452,0.624,0.406
+								c1.475-0.224,2.887,0.257,3.963,1.003l1.264-0.349C174.806,480.969,172.816,480.199,170.733,480.493z"/>
+							<path fill="#988A38" d="M170.993,480.514c-0.247,0.028-0.425,0.257-0.388,0.492c0.036,0.235,0.264,0.384,0.512,0.354
+								c1.47-0.173,2.852,0.345,3.908,1.102l1.061-0.293C174.822,481.049,172.964,480.294,170.993,480.514z"/>
+							<path fill="#988A38" d="M171.254,480.533c-0.201,0.016-0.353,0.195-0.331,0.387c0.023,0.189,0.202,0.315,0.404,0.299
+								c1.463-0.114,2.815,0.434,3.849,1.2l0.858-0.236C174.834,481.13,173.115,480.394,171.254,480.533z"/>
+							<path fill="#FEE676" d="M171.516,480.552c-0.155,0.004-0.281,0.134-0.274,0.28c0.008,0.146,0.139,0.252,0.295,0.247
+								c1.457-0.041,2.778,0.521,3.791,1.298l0.658-0.182C174.846,481.21,173.264,480.508,171.516,480.552z"/>
+							<path fill="#FEE676" d="M171.777,480.569c-0.11-0.006-0.208,0.073-0.215,0.176c-0.008,0.104,0.075,0.19,0.184,0.195
+								c1.447,0.065,2.742,0.606,3.733,1.396l0.455-0.126C174.861,481.29,173.412,480.652,171.777,480.569z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M170.514,483.126c0.317-0.084,0.507-0.396,0.433-0.699c-0.075-0.301-0.396-0.484-0.715-0.408
+								c-1.458,0.349-2.597,1.333-3.166,2.64l1.695-0.468C169.217,483.688,169.812,483.312,170.514,483.126z"/>
+							<path fill="#988A38" d="M170.361,483.138c0.282-0.081,0.448-0.356,0.381-0.628c-0.071-0.268-0.359-0.43-0.642-0.357
+								c-1.342,0.347-2.389,1.271-2.926,2.477l1.514-0.418C169.123,483.715,169.689,483.331,170.361,483.138z"/>
+							<path fill="#988A38" d="M170.212,483.148c0.244-0.075,0.387-0.317,0.323-0.555c-0.065-0.234-0.32-0.374-0.567-0.306
+								c-1.228,0.344-2.179,1.207-2.688,2.312l1.333-0.368C169.029,483.743,169.566,483.351,170.212,483.148z"/>
+							<path fill="#988A38" d="M170.06,483.16c0.208-0.071,0.329-0.279,0.269-0.482c-0.06-0.201-0.281-0.318-0.491-0.254
+								c-1.112,0.342-1.972,1.143-2.45,2.146l1.152-0.317C168.934,483.771,169.439,483.371,170.06,483.16z"/>
+							<path fill="#988A38" d="M169.911,483.17c0.169-0.064,0.265-0.238,0.211-0.409c-0.052-0.167-0.243-0.261-0.415-0.201
+								c-0.997,0.341-1.766,1.078-2.212,1.98l0.968-0.268C168.839,483.8,169.316,483.395,169.911,483.17z"/>
+							<path fill="#988A38" d="M169.758,483.182c0.136-0.058,0.208-0.2,0.158-0.335c-0.048-0.133-0.203-0.204-0.34-0.15
+								c-0.881,0.343-1.557,1.011-1.975,1.814l0.788-0.218C168.745,483.828,169.194,483.419,169.758,483.182z"/>
+							<path fill="#FEE676" d="M169.605,483.192c0.101-0.05,0.149-0.161,0.105-0.262c-0.043-0.1-0.164-0.145-0.266-0.098
+								c-0.762,0.352-1.347,0.944-1.736,1.647l0.604-0.166C168.649,483.856,169.071,483.45,169.605,483.192z"/>
+							<path fill="#FEE676" d="M169.455,483.203c0.066-0.04,0.085-0.122,0.05-0.187c-0.04-0.065-0.122-0.086-0.19-0.046
+								c-0.635,0.37-1.141,0.876-1.497,1.48l0.42-0.116C168.556,483.884,168.959,483.496,169.455,483.203z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M169.937,480.384c-2.582,0.523-4.564,2.369-5.065,4.88l0.484-0.133C165.82,482.733,167.59,480.858,169.937,480.384z"
+							/>
+					</g>
+					<path d="M168.207,481.006c-0.736,0.364-1.386,0.843-1.916,1.437c1.11-0.577,1.802,0.809,1.987,1.882l0.693-0.191
+						C168.684,483.098,168.433,482.055,168.207,481.006z"/>
+					<path fill="#988A38" d="M170.236,466.862c0.032-0.77-0.598-1.44-1.421-1.499c-0.828-0.059-1.542,0.56-1.576,1.38
+						c-0.229,5.829,0.423,11.69,1.975,17.322l2.892-0.798C170.639,477.946,170.018,472.384,170.236,466.862z"/>
+					<path d="M167.488,466.753c0.028-0.681,0.636-1.438,1.327-1.39c-0.964-0.067-1.785,0.41-1.826,1.37
+						c-0.23,5.854,0.424,11.74,1.983,17.399l0.482-0.134C167.909,478.393,167.259,472.557,167.488,466.753z"/>
+					<g>
+						<path fill="#988A38" d="M170.044,466.673c0.021-0.412-0.319-0.769-0.761-0.795c-0.441-0.026-0.816,0.299-0.838,0.726
+							c-0.245,5.768,0.391,11.574,1.925,17.143l1.543-0.426C170.425,477.92,169.806,472.274,170.044,466.673z"/>
+						<path fill="#988A38" d="M169.969,467.099c0.01-0.37-0.292-0.685-0.686-0.704c-0.393-0.021-0.73,0.272-0.74,0.653
+							c-0.199,5.611,0.446,11.254,1.937,16.668l1.376-0.38C170.404,478.067,169.775,472.565,169.969,467.099z"/>
+						<path fill="#988A38" d="M169.896,467.525c0.011-0.325-0.262-0.602-0.609-0.615c-0.347-0.015-0.636,0.246-0.647,0.58
+							c-0.15,5.455,0.5,10.934,1.947,16.196l1.212-0.335C170.386,478.215,169.749,472.856,169.896,467.525z"/>
+						<path fill="#988A38" d="M169.828,467.954c0.003-0.282-0.231-0.519-0.531-0.527c-0.301-0.01-0.552,0.217-0.555,0.506
+							c-0.102,5.298,0.546,10.614,1.953,15.724l1.047-0.289C170.364,478.363,169.729,473.148,169.828,467.954z"/>
+						<path fill="#988A38" d="M169.763,468.382c0.004-0.237-0.199-0.435-0.451-0.44c-0.253-0.005-0.461,0.188-0.465,0.43
+							c-0.059,5.143,0.589,10.297,1.956,15.256l0.883-0.244C170.343,478.511,169.705,473.439,169.763,468.382z"/>
+						<path fill="#FEE676" d="M169.698,468.812c-0.001-0.192-0.162-0.352-0.369-0.354c-0.206-0.003-0.375,0.155-0.374,0.351
+							c-0.019,4.987,0.629,9.98,1.956,14.788l0.716-0.197C170.319,478.66,169.679,473.732,169.698,468.812z"/>
+						<path fill="#FEE676" d="M169.641,469.243c0-0.149-0.128-0.271-0.286-0.271c-0.156,0-0.284,0.123-0.284,0.273
+							c0.023,4.832,0.665,9.665,1.951,14.321l0.55-0.151C170.298,478.809,169.663,474.024,169.641,469.243z"/>
+						<path fill="#FEE676" d="M169.583,469.675c-0.002-0.104-0.091-0.188-0.202-0.188s-0.201,0.087-0.198,0.192
+							c0.059,4.677,0.702,9.35,1.944,13.857l0.386-0.106C170.28,478.957,169.642,474.317,169.583,469.675z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M171.659,486.877c0.778-0.274,1.208-1.087,0.975-1.819c-0.186-0.594-0.363-1.19-0.529-1.79
+						l-2.892,0.799c0.176,0.635,0.363,1.267,0.561,1.896C170.023,486.746,170.876,487.153,171.659,486.877z"/>
+					<g>
+						<path fill="#988A38" d="M171.998,486.329c0.418-0.144,0.653-0.577,0.525-0.969c-0.218-0.675-0.42-1.355-0.61-2.04
+							l-1.543,0.427c0.195,0.705,0.404,1.407,0.628,2.104C171.132,486.258,171.579,486.472,171.998,486.329z"/>
+						<path fill="#988A38" d="M171.926,486.025c0.374-0.125,0.583-0.511,0.47-0.862c-0.19-0.605-0.371-1.214-0.541-1.826
+							l-1.376,0.38c0.174,0.629,0.36,1.255,0.556,1.877C171.152,485.957,171.552,486.15,171.926,486.025z"/>
+						<path fill="#988A38" d="M171.854,485.722c0.33-0.107,0.513-0.445,0.419-0.758c-0.168-0.534-0.324-1.072-0.475-1.611
+							l-1.212,0.334c0.155,0.553,0.314,1.104,0.486,1.651C171.169,485.66,171.523,485.83,171.854,485.722z"/>
+						<path fill="#988A38" d="M171.782,485.42c0.287-0.092,0.451-0.386,0.368-0.656c-0.146-0.462-0.275-0.93-0.408-1.396
+							l-1.047,0.289c0.135,0.476,0.268,0.953,0.417,1.426C171.197,485.359,171.496,485.512,171.782,485.42z"/>
+						<path fill="#988A38" d="M171.714,485.115c0.243-0.076,0.384-0.323,0.313-0.551l-0.342-1.181l-0.883,0.243l0.348,1.203
+							C171.223,485.062,171.473,485.19,171.714,485.115z"/>
+						<path fill="#FEE676" d="M171.65,484.811c0.197-0.061,0.31-0.259,0.254-0.446l-0.276-0.965l-0.717,0.198l0.28,0.979
+							C171.248,484.767,171.452,484.871,171.65,484.811z"/>
+						<path fill="#FEE676" d="M171.583,484.508c0.155-0.047,0.244-0.201,0.202-0.346l-0.213-0.747l-0.55,0.151l0.215,0.756
+							C171.279,484.469,171.429,484.554,171.583,484.508z"/>
+						<path fill="#FEE676" d="M171.522,484.202c0.108-0.031,0.171-0.14,0.142-0.242l-0.15-0.529l-0.386,0.106l0.152,0.533
+							C171.31,484.174,171.414,484.234,171.522,484.202z"/>
+					</g>
+					<path d="M171.659,486.877c-0.653,0.231-1.441-0.341-1.647-0.99c-0.196-0.626-0.383-1.255-0.558-1.887l-0.482,0.133
+						c0.177,0.638,0.365,1.272,0.563,1.904C169.829,486.955,170.748,487.198,171.659,486.877z"/>
+					<path fill="#988A38" d="M169.597,498.1c0.502,0.87,1.156,1.599,1.913,2.141c0.372,0.274,0.775,0.499,1.184,0.68
+						c0.403,0.174,0.879,0.374,1.322,0.466c1.815,0.418,3.568-0.15,4.908-1.375c1.348-1.23,2.119-2.82,2.559-4.226
+						c0.1-0.365,0.211-0.671,0.238-1.056c0.032-0.378,0.025-0.744-0.008-1.109c-0.077-0.724-0.279-1.423-0.651-2.065
+						c-1.286-2.253-2.356-4.67-3.21-7.183c-0.299-0.887-0.822-1.646-1.507-2.274l-3.883,1.071c1.19,0.276,2.169,1.051,2.549,2.166
+						c0.912,2.683,2.058,5.274,3.446,7.707c0.851,1.495,0.325,3.567-1.298,4.712c-1.621,1.144-3.92,0.671-4.956-1.142
+						c-1.634-2.863-2.961-5.875-4.012-8.966c-0.441-1.295-0.064-2.643,0.804-3.521l-3.883,1.072
+						c-0.211,1.08-0.152,2.259,0.237,3.411C166.457,491.871,167.861,495.057,169.597,498.1z"/>
+					<path fill="#988A38" d="M171.035,497.278c1.421,2.504,4.667,3.188,6.773,1.505c0.259-0.2,0.321-0.56,0.144-0.805
+						c-0.177-0.245-0.54-0.306-0.81-0.122c-1.654,1.173-4.011,0.691-5.073-1.169c-1.639-2.872-2.97-5.893-4.024-8.992
+						c-0.434-1.276-0.098-2.608,0.716-3.505l-1.695,0.468c-0.447,1.023-0.549,2.238-0.149,3.42
+						C167.995,491.245,169.356,494.334,171.035,497.278z"/>
+					<path fill="#988A38" d="M171.112,497.233c1.375,2.415,4.451,3.118,6.509,1.567c0.236-0.174,0.291-0.494,0.135-0.72
+						c-0.158-0.222-0.479-0.275-0.723-0.116c-1.642,1.107-3.945,0.588-4.999-1.259c-1.64-2.874-2.972-5.896-4.026-8.998
+						c-0.434-1.276-0.114-2.596,0.679-3.497l-1.514,0.418c-0.464,1.014-0.575,2.23-0.171,3.42
+						C168.077,491.211,169.436,494.295,171.112,497.233z"/>
+					<path fill="#988A38" d="M171.19,497.189c1.327,2.329,4.234,3.038,6.242,1.628c0.212-0.146,0.261-0.429,0.126-0.63
+						c-0.135-0.2-0.415-0.249-0.629-0.116c-1.636,1.043-3.881,0.484-4.925-1.347c-1.641-2.877-2.975-5.901-4.029-9.005
+						c-0.434-1.275-0.132-2.581,0.638-3.488l-1.333,0.368c-0.48,1.006-0.6,2.222-0.194,3.421
+						C168.159,491.178,169.517,494.257,171.19,497.189z"/>
+					<path fill="#988A38" d="M171.268,497.146c1.279,2.242,4.014,2.955,5.975,1.687c0.184-0.12,0.231-0.364,0.119-0.541
+						c-0.115-0.177-0.354-0.222-0.542-0.111c-1.629,0.977-3.814,0.381-4.85-1.436c-1.642-2.879-2.977-5.906-4.032-9.012
+						c-0.434-1.275-0.145-2.567,0.601-3.48l-1.152,0.318c-0.497,0.996-0.625,2.213-0.215,3.422
+						C168.243,491.144,169.598,494.218,171.268,497.146z"/>
+					<path fill="#988A38" d="M171.347,497.101c1.233,2.158,3.79,2.867,5.707,1.744c0.158-0.093,0.202-0.3,0.111-0.451
+						c-0.095-0.15-0.292-0.192-0.449-0.108c-1.632,0.905-3.749,0.279-4.776-1.522c-1.644-2.882-2.979-5.911-4.035-9.019
+						c-0.433-1.274-0.163-2.554,0.56-3.471l-0.968,0.268c-0.514,0.986-0.65,2.204-0.237,3.422
+						C168.327,491.109,169.68,494.178,171.347,497.101z"/>
+					<path fill="#988A38" d="M171.424,497.057c1.184,2.07,3.565,2.773,5.438,1.799c0.131-0.067,0.173-0.234,0.104-0.36
+						c-0.072-0.126-0.231-0.165-0.359-0.102c-1.637,0.822-3.684,0.174-4.702-1.612c-1.645-2.884-2.981-5.915-4.039-9.025
+						c-0.433-1.273-0.178-2.539,0.522-3.462l-0.787,0.217c-0.532,0.979-0.676,2.197-0.26,3.424
+						C168.409,491.076,169.76,494.14,171.424,497.057z"/>
+					<path fill="#FEE676" d="M171.501,497.012c1.132,1.979,3.339,2.663,5.169,1.854c0.104-0.046,0.145-0.17,0.097-0.269
+						c-0.049-0.1-0.167-0.139-0.268-0.096c-1.65,0.719-3.618,0.069-4.628-1.701c-1.646-2.886-2.984-5.919-4.042-9.032
+						c-0.432-1.273-0.193-2.524,0.484-3.454l-0.604,0.167c-0.548,0.97-0.702,2.188-0.283,3.424
+						C168.492,491.042,169.84,494.101,171.501,497.012z"/>
+					<path fill="#FEE676" d="M171.58,496.968c1.08,1.887,3.107,2.527,4.9,1.906c0.075-0.025,0.112-0.103,0.088-0.175
+						c-0.025-0.073-0.105-0.111-0.174-0.089c-1.677,0.578-3.553-0.036-4.555-1.791c-1.647-2.888-2.986-5.924-4.044-9.038
+						c-0.432-1.273-0.209-2.512,0.444-3.446l-0.42,0.116c-0.567,0.961-0.729,2.181-0.307,3.426
+						C168.576,491.008,169.921,494.062,171.58,496.968z"/>
+					<path d="M168.293,485.176c0.167-0.385,0.39-0.732,0.682-1.031l-0.003-0.012l-0.693,0.191c0.054,0.329,0.061,0.633,0.001,0.85
+						L168.293,485.176z"/>
+					<path fill="#988A38" d="M179.627,492.373c0.574,1.009,0.666,2.177,0.37,3.334c-0.317,1.136-1.02,2.24-2.085,3.094
+						c-0.351,0.267-0.418,0.77-0.135,1.109c0.283,0.339,0.789,0.363,1.115,0.065c1.338-1.216,2.105-2.794,2.543-4.189
+						c0.239-0.67,0.302-1.429,0.227-2.155c-0.076-0.719-0.277-1.413-0.646-2.051c-1.288-2.256-2.358-4.676-3.213-7.191
+						c-0.3-0.889-0.824-1.647-1.515-2.276l-1.871,0.517c0.87,0.547,1.544,1.308,1.871,2.273
+						C177.174,487.508,178.286,490.021,179.627,492.373z"/>
+					<path fill="#988A38" d="M179.738,492.309c0.567,0.998,0.675,2.159,0.405,3.313c-0.293,1.132-0.96,2.233-1.989,3.105
+						c-0.308,0.247-0.365,0.688-0.113,0.982c0.252,0.294,0.7,0.312,0.988,0.04c1.268-1.189,1.991-2.714,2.4-4.063
+						c0.228-0.644,0.258-1.404,0.187-2.1c-0.08-0.697-0.28-1.369-0.636-1.986c-1.289-2.259-2.361-4.682-3.217-7.198
+						c-0.299-0.889-0.828-1.648-1.526-2.276l-1.669,0.461c0.856,0.557,1.517,1.316,1.841,2.274
+						C177.292,487.459,178.401,489.965,179.738,492.309z"/>
+					<path fill="#988A38" d="M179.851,492.246c0.561,0.985,0.685,2.139,0.44,3.29c-0.271,1.125-0.903,2.223-1.895,3.115
+						c-0.264,0.227-0.311,0.606-0.091,0.857c0.223,0.249,0.615,0.262,0.863,0.019c1.196-1.166,1.876-2.638,2.253-3.938
+						c0.216-0.618,0.215-1.384,0.147-2.045c-0.084-0.675-0.282-1.325-0.626-1.922c-1.291-2.262-2.364-4.686-3.22-7.206
+						c-0.299-0.888-0.832-1.647-1.536-2.275l-1.466,0.405c0.842,0.564,1.491,1.325,1.811,2.273
+						C177.412,487.411,178.517,489.909,179.851,492.246z"/>
+					<path fill="#988A38" d="M179.962,492.182c0.555,0.975,0.693,2.118,0.474,3.266c-0.25,1.118-0.847,2.212-1.802,3.128
+						c-0.222,0.202-0.255,0.522-0.067,0.733c0.196,0.205,0.532,0.211,0.742-0.002c1.126-1.148,1.76-2.563,2.108-3.818
+						c0.325-1.302,0.133-2.709-0.511-3.845c-1.292-2.264-2.366-4.691-3.224-7.214c-0.3-0.888-0.837-1.647-1.545-2.275l-1.263,0.35
+						c0.825,0.574,1.463,1.335,1.78,2.274C177.532,487.362,178.634,489.853,179.962,492.182z"/>
+					<path fill="#988A38" d="M180.074,492.118c0.548,0.964,0.702,2.096,0.507,3.237c-0.228,1.11-0.793,2.199-1.71,3.141
+						c-0.177,0.175-0.203,0.442-0.042,0.612c0.166,0.165,0.45,0.159,0.621-0.022c1.055-1.131,1.642-2.492,1.962-3.7
+						c0.273-1.28,0.081-2.622-0.543-3.722c-1.293-2.267-2.369-4.696-3.227-7.221c-0.299-0.888-0.842-1.647-1.557-2.274l-1.061,0.293
+						c0.811,0.584,1.437,1.345,1.75,2.275C177.651,487.313,178.75,489.797,180.074,492.118z"/>
+					<path fill="#988A38" d="M180.185,492.055c0.542,0.953,0.708,2.07,0.537,3.205c-0.205,1.102-0.742,2.185-1.611,3.153
+						c-0.139,0.148-0.154,0.364-0.018,0.491c0.138,0.128,0.366,0.115,0.501-0.039c0.976-1.117,1.52-2.426,1.81-3.585
+						c0.222-1.26,0.03-2.531-0.573-3.594c-1.295-2.27-2.372-4.702-3.23-7.229c-0.3-0.888-0.846-1.647-1.566-2.274l-0.859,0.237
+						c0.799,0.593,1.409,1.354,1.72,2.275C177.77,487.266,178.865,489.741,180.185,492.055z"/>
+					<path fill="#FEE676" d="M180.297,491.991c0.536,0.941,0.709,2.041,0.559,3.167c-0.184,1.094-0.696,2.168-1.508,3.173
+						c-0.099,0.117-0.102,0.283,0.008,0.372c0.112,0.088,0.285,0.066,0.383-0.057c0.887-1.109,1.394-2.366,1.65-3.477
+						c0.172-1.238-0.013-2.435-0.595-3.462c-1.296-2.272-2.374-4.707-3.234-7.237c-0.3-0.887-0.851-1.647-1.574-2.274l-0.658,0.182
+						c0.783,0.604,1.381,1.363,1.689,2.276C177.889,487.217,178.981,489.685,180.297,491.991z"/>
+					<path fill="#FEE676" d="M180.408,491.928c0.53,0.93,0.701,2.003,0.564,3.117c-0.166,1.087-0.658,2.146-1.391,3.202
+						c-0.063,0.087-0.05,0.203,0.037,0.254c0.085,0.051,0.204,0.02,0.267-0.07c0.774-1.129,1.254-2.302,1.466-3.398
+						c0.144-1.184-0.035-2.319-0.595-3.304c-1.298-2.274-2.376-4.711-3.237-7.244c-0.299-0.888-0.854-1.647-1.584-2.274
+						l-0.455,0.126c0.769,0.612,1.355,1.374,1.661,2.277C178.008,487.168,179.097,489.629,180.408,491.928z"/>
+					<path d="M174.772,485.416c0.917,2.696,2.068,5.303,3.465,7.751c0.793,1.401,0.428,3.521-1.08,4.587
+						c1.74-1.225,2.421-3.248,1.514-4.835c-1.38-2.418-2.519-4.995-3.426-7.664c-0.397-1.165-1.378-1.893-2.603-2.136l-0.35,0.097
+						C173.443,483.527,174.41,484.354,174.772,485.416z"/>
+					<path d="M175.366,498.355c-0.419,0.023-0.827-0.037-1.229-0.168l-0.008,0.011c0.438,0.224,0.972,0.882,1.288,1.619
+						c0.313,0.729,0.385,1.487-0.375,1.81c0.828-0.012,1.618-0.229,2.353-0.595C176.656,500.201,175.946,499.245,175.366,498.355z"
+						/>
+					<path d="M169.379,498.224c0.523,0.905,1.196,1.65,1.97,2.193c0.38,0.275,0.794,0.496,1.209,0.672
+						c0.421,0.181,0.899,0.364,1.352,0.449c1.847,0.377,3.623-0.259,5.013-1.527c-1.286,1.176-3.018,1.677-4.803,1.217
+						c-0.908-0.259-1.722-0.629-2.454-1.168c-0.738-0.541-1.374-1.252-1.854-2.084c-1.727-3.028-3.124-6.199-4.228-9.446
+						c-0.381-1.127-0.44-2.31-0.229-3.398l-0.484,0.134c-0.212,1.073-0.156,2.251,0.24,3.425
+						C166.225,491.965,167.635,495.166,169.379,498.224z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M189.214,504.887l1.542-3.722c-1.912,0.412-4.013,1.738-5.254,3.938c0.84,0.46,1.704,0.877,2.583,1.258
+						C188.346,505.757,188.753,505.268,189.214,504.887z"/>
+					<path d="M188.084,506.361c0.235-0.543,0.622-0.976,1.062-1.312l0.139-0.336C188.802,505.141,188.374,505.691,188.084,506.361z"
+						/>
+					<path fill="#988A38" d="M188.084,506.361c-0.879-0.381-1.743-0.798-2.583-1.258c-1.238,2.189-0.981,4.882,0.795,6.826
+						l1.542-3.721C187.751,507.598,187.823,506.966,188.084,506.361z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M185.543,505.126c-0.213,0.387-0.075,0.871,0.313,1.07c0.387,0.197,0.86,0.027,1.05-0.372
+								c0.679-1.395,1.87-2.335,3.084-2.812l0.743-1.793C188.84,501.636,186.764,502.952,185.543,505.126z"/>
+							<path fill="#988A38" d="M185.729,504.94c-0.197,0.343-0.076,0.774,0.263,0.953c0.34,0.18,0.756,0.031,0.934-0.32
+								c0.708-1.358,1.917-2.252,3.125-2.706l0.663-1.6C188.904,501.667,186.917,502.899,185.729,504.94z"/>
+							<path fill="#988A38" d="M185.916,504.754c-0.179,0.298-0.078,0.678,0.214,0.839c0.293,0.159,0.655,0.032,0.82-0.27
+								c0.735-1.321,1.96-2.169,3.161-2.603l0.582-1.404C188.966,501.702,187.073,502.844,185.916,504.754z"/>
+							<path fill="#988A38" d="M186.099,504.566c-0.16,0.256-0.079,0.583,0.168,0.727c0.247,0.142,0.556,0.033,0.707-0.223
+								c0.766-1.282,2.002-2.083,3.197-2.495l0.501-1.211C189.029,501.735,187.23,502.788,186.099,504.566z"/>
+							<path fill="#988A38" d="M186.282,504.382c-0.138,0.208-0.079,0.485,0.123,0.609c0.202,0.124,0.46,0.04,0.594-0.169
+								c0.802-1.244,2.045-2.003,3.233-2.393l0.421-1.017C189.094,501.769,187.393,502.733,186.282,504.382z"/>
+							<path fill="#988A38" d="M186.464,504.193c-0.118,0.164-0.079,0.392,0.079,0.499c0.157,0.104,0.368,0.044,0.482-0.122
+								c0.842-1.199,2.086-1.919,3.267-2.286l0.341-0.822C189.158,501.803,187.559,502.682,186.464,504.193z"/>
+							<path fill="#FEE676" d="M186.645,504.003c-0.096,0.122-0.078,0.299,0.035,0.389c0.114,0.089,0.278,0.053,0.373-0.07
+								c0.893-1.146,2.125-1.839,3.3-2.183l0.261-0.631C189.222,501.837,187.734,502.638,186.645,504.003z"/>
+							<path fill="#FEE676" d="M186.826,503.813c-0.075,0.081-0.078,0.205-0.006,0.276c0.071,0.073,0.189,0.064,0.261-0.018
+								c0.967-1.071,2.164-1.758,3.332-2.078l0.181-0.437C189.288,501.869,187.931,502.614,186.826,503.813z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M187.984,506.396c0.13-0.302,0.006-0.646-0.275-0.774c-0.28-0.127-0.622,0.005-0.764,0.3
+								c-0.655,1.353-0.596,2.863,0.127,4.136l0.673-1.624C187.622,507.761,187.694,507.061,187.984,506.396z"/>
+							<path fill="#988A38" d="M187.898,506.521c0.111-0.272,0-0.575-0.253-0.688c-0.251-0.11-0.557,0.012-0.679,0.275
+								c-0.581,1.261-0.508,2.663,0.149,3.846l0.601-1.451C187.585,507.852,187.633,507.17,187.898,506.521z"/>
+							<path fill="#988A38" d="M187.815,506.645c0.092-0.237-0.008-0.502-0.233-0.598c-0.223-0.095-0.492,0.016-0.594,0.251
+								c-0.507,1.168-0.422,2.46,0.169,3.554l0.529-1.276C187.549,507.943,187.572,507.278,187.815,506.645z"/>
+							<path fill="#988A38" d="M187.729,506.771c0.074-0.207-0.015-0.434-0.211-0.511c-0.194-0.079-0.424,0.02-0.506,0.224
+								c-0.433,1.078-0.34,2.26,0.188,3.265l0.457-1.104C187.513,508.035,187.509,507.391,187.729,506.771z"/>
+							<path fill="#988A38" d="M187.645,506.895c0.054-0.174-0.022-0.359-0.19-0.42c-0.163-0.064-0.356,0.024-0.418,0.196
+								c-0.359,0.988-0.256,2.06,0.206,2.976l0.384-0.928C187.477,508.128,187.451,507.501,187.645,506.895z"/>
+							<path fill="#988A38" d="M187.559,507.021c0.039-0.143-0.028-0.289-0.166-0.332c-0.135-0.045-0.288,0.029-0.332,0.171
+								c-0.284,0.899-0.175,1.853,0.224,2.685l0.312-0.755C187.441,508.219,187.394,507.612,187.559,507.021z"/>
+							<path fill="#FEE676" d="M187.472,507.146c0.023-0.109-0.034-0.219-0.142-0.246c-0.106-0.027-0.217,0.038-0.244,0.146
+								c-0.2,0.813-0.093,1.648,0.241,2.395l0.24-0.58C187.404,508.312,187.342,507.728,187.472,507.146z"/>
+							<path fill="#FEE676" d="M187.386,507.271c0.01-0.076-0.042-0.144-0.116-0.155c-0.077-0.009-0.145,0.043-0.156,0.12
+								c-0.102,0.724-0.016,1.446,0.257,2.102l0.167-0.403C187.368,508.403,187.309,507.845,187.386,507.271z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M185.501,505.104c-1.293,2.29-1.117,5.057,0.7,7.057l0.192-0.464C184.66,509.803,184.323,507.186,185.501,505.104z"/>
+					</g>
+					<path d="M184.857,506.825c-0.192,0.798-0.235,1.611-0.093,2.408c0.252-1.224,1.807-0.86,2.789-0.338l0.275-0.664
+						C186.819,507.812,185.826,507.346,184.857,506.825z"/>
+					<path fill="#988A38" d="M177.085,494.999c-0.392-0.655-1.284-0.871-2.007-0.473c-0.727,0.401-0.987,1.312-0.562,2.021
+						c0.771,1.249,1.548,2.484,2.527,3.684c0.962,1.18,2.044,2.249,3.203,3.211c2.319,1.927,4.941,3.419,7.679,4.558l1.148-2.772
+						c-2.5-1.038-4.863-2.391-6.91-4.093c-1.023-0.85-1.967-1.784-2.796-2.802C178.548,497.338,177.803,496.165,177.085,494.999z"/>
+					<path d="M174.729,496.419c-0.354-0.588-0.258-1.559,0.349-1.893c-0.844,0.468-1.274,1.318-0.777,2.151
+						c0.775,1.255,1.555,2.496,2.548,3.712c0.974,1.193,2.066,2.274,3.236,3.246c2.342,1.945,4.985,3.449,7.743,4.596l0.191-0.462
+						c-2.718-1.13-5.318-2.611-7.615-4.519c-1.147-0.953-2.217-2.012-3.168-3.178C176.271,498.892,175.496,497.661,174.729,496.419z
+						"/>
+					<g>
+						<path fill="#988A38" d="M176.822,494.946c-0.208-0.354-0.688-0.463-1.072-0.244c-0.384,0.219-0.52,0.696-0.303,1.064
+							c0.748,1.237,1.523,2.455,2.435,3.596c0.914,1.132,1.943,2.167,3.058,3.097c2.228,1.866,4.772,3.322,7.444,4.433l0.612-1.479
+							c-2.542-1.057-4.948-2.437-7.029-4.181c-1.042-0.869-1.996-1.831-2.838-2.874C178.305,497.33,177.538,496.133,176.822,494.946
+							z"/>
+						<path fill="#988A38" d="M176.991,495.341c-0.195-0.311-0.616-0.409-0.958-0.209c-0.338,0.198-0.463,0.63-0.26,0.953
+							c0.74,1.191,1.502,2.378,2.407,3.462c0.902,1.084,1.915,2.071,3.001,2.967c2.179,1.787,4.649,3.195,7.245,4.272l0.546-1.318
+							c-2.483-1.03-4.833-2.373-6.885-4.057c-1.023-0.843-1.972-1.769-2.811-2.777C178.447,497.643,177.711,496.5,176.991,495.341z"
+							/>
+						<path fill="#988A38" d="M177.165,495.734c-0.169-0.275-0.549-0.354-0.847-0.174c-0.297,0.179-0.395,0.556-0.22,0.84
+							c1.404,2.339,3.179,4.47,5.325,6.165c2.127,1.715,4.526,3.073,7.046,4.118l0.481-1.162
+							C184.087,503.516,179.788,500.299,177.165,495.734z"/>
+						<path fill="#988A38" d="M177.344,496.126c-0.154-0.233-0.477-0.301-0.732-0.143c-0.255,0.158-0.341,0.486-0.183,0.729
+							c2.725,4.579,7.208,7.864,12.083,9.867l0.416-1.004C184.197,503.62,179.958,500.516,177.344,496.126z"/>
+						<path fill="#988A38" d="M177.527,496.517c-0.128-0.198-0.407-0.249-0.619-0.113c-0.213,0.136-0.279,0.411-0.147,0.614
+							c2.71,4.384,7.075,7.508,11.795,9.458l0.351-0.846C184.307,503.723,180.132,500.731,177.527,496.517z"/>
+						<path fill="#FEE676" d="M177.711,496.906c-0.108-0.158-0.33-0.199-0.503-0.086c-0.172,0.112-0.225,0.338-0.114,0.499
+							c2.698,4.18,6.934,7.16,11.504,9.053l0.284-0.687C184.411,503.831,180.315,500.943,177.711,496.906z"/>
+						<path fill="#FEE676" d="M177.904,497.294c-0.083-0.122-0.257-0.151-0.388-0.063c-0.13,0.088-0.167,0.261-0.083,0.386
+							c2.682,3.972,6.788,6.817,11.209,8.65l0.218-0.527C184.515,503.938,180.506,501.15,177.904,497.294z"/>
+						<path fill="#FEE676" d="M178.096,497.677c-0.059-0.083-0.18-0.099-0.271-0.036s-0.116,0.183-0.056,0.267
+							c2.668,3.765,6.641,6.485,10.916,8.257l0.153-0.369C184.615,504.046,180.706,501.36,178.096,497.677z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M191.664,507.743c0.235-0.794-0.163-1.612-0.875-1.862c-0.578-0.202-1.15-0.419-1.715-0.652
+						L187.925,508c0.619,0.256,1.244,0.493,1.875,0.713C190.588,508.989,191.429,508.537,191.664,507.743z"/>
+					<g>
+						<path fill="#988A38" d="M191.422,507.146c0.132-0.423-0.077-0.862-0.461-0.994c-0.663-0.226-1.318-0.472-1.964-0.739
+							l-0.612,1.479c0.678,0.28,1.366,0.538,2.06,0.775C190.851,507.806,191.291,507.566,191.422,507.146z"/>
+						<path fill="#988A38" d="M191.136,507.026c0.121-0.374-0.062-0.772-0.406-0.892c-0.592-0.205-1.178-0.428-1.756-0.667
+							l-0.546,1.318c0.604,0.25,1.215,0.482,1.833,0.697C190.622,507.607,191.015,507.402,191.136,507.026z"/>
+						<path fill="#988A38" d="M190.851,506.903c0.109-0.329-0.049-0.679-0.352-0.787c-0.521-0.186-1.037-0.384-1.548-0.595
+							l-0.481,1.162c0.531,0.219,1.067,0.424,1.607,0.617C190.394,507.414,190.74,507.234,190.851,506.903z"/>
+						<path fill="#988A38" d="M190.568,506.779c0.098-0.284-0.038-0.59-0.3-0.685c-0.45-0.163-0.896-0.338-1.339-0.52l-0.416,1.004
+							c0.458,0.188,0.919,0.368,1.384,0.537C190.169,507.214,190.47,507.063,190.568,506.779z"/>
+						<path fill="#988A38" d="M190.285,506.653c0.085-0.239-0.026-0.498-0.247-0.581c-0.381-0.14-0.757-0.29-1.131-0.442
+							l-0.351,0.846c0.385,0.158,0.772,0.312,1.163,0.456C189.947,507.017,190.2,506.893,190.285,506.653z"/>
+						<path fill="#FEE676" d="M190.004,506.522c0.071-0.193-0.018-0.405-0.199-0.472l-0.922-0.365l-0.285,0.687l0.943,0.374
+							C189.728,506.814,189.934,506.717,190.004,506.522z"/>
+						<path fill="#FEE676" d="M189.725,506.393c0.057-0.15-0.011-0.314-0.15-0.368l-0.714-0.285l-0.218,0.527l0.726,0.29
+							C189.511,506.611,189.668,506.543,189.725,506.393z"/>
+						<path fill="#FEE676" d="M189.447,506.258c0.041-0.105-0.006-0.222-0.104-0.26l-0.504-0.203l-0.153,0.369l0.511,0.206
+							C189.295,506.409,189.407,506.362,189.447,506.258z"/>
+					</g>
+					<path d="M191.664,507.743c-0.195,0.663-1.13,0.964-1.782,0.734c-0.626-0.219-1.247-0.454-1.861-0.708l-0.191,0.462
+						c0.623,0.258,1.253,0.497,1.888,0.719C190.644,509.272,191.389,508.666,191.664,507.743z"/>
+					<path fill="#988A38" d="M199.852,515.811c2.04,0.134,3.907-0.63,5.123-1.885c1.229-1.257,1.806-2.981,1.628-4.793
+						c-0.178-1.814-1.01-3.386-2.069-4.53c-1.071-1.146-2.373-1.87-3.82-1.965c-2.534-0.168-5.046-0.624-7.41-1.37
+						c-0.834-0.263-1.686-0.278-2.548-0.103l-1.542,3.722c0.938-0.771,2.103-1.098,3.187-0.758c2.61,0.823,5.353,1.319,8.117,1.502
+						c1.696,0.113,3.141,1.685,3.214,3.669c0.075,1.981-1.582,3.653-3.684,3.517c-3.316-0.22-6.613-0.812-9.814-1.821
+						c-1.343-0.421-2.23-1.557-2.396-2.786l-1.542,3.722c0.774,0.853,1.817,1.542,3.034,1.926
+						C192.777,514.942,196.306,515.575,199.852,515.811z"/>
+					<path fill="#988A38" d="M199.96,514.157c1.447,0.095,2.781-0.442,3.682-1.34c0.908-0.899,1.384-2.152,1.311-3.492
+						c-0.018-0.327-0.283-0.579-0.592-0.568c-0.309,0.014-0.557,0.283-0.55,0.611c0.081,2.028-1.614,3.741-3.772,3.601
+						c-3.327-0.221-6.637-0.814-9.85-1.828c-1.326-0.415-2.227-1.508-2.442-2.708l-0.673,1.624c0.562,0.996,1.521,1.831,2.757,2.22
+						C193.14,513.32,196.542,513.93,199.96,514.157z"/>
+					<path fill="#988A38" d="M199.965,514.068c1.395,0.091,2.685-0.41,3.567-1.26c0.889-0.85,1.372-2.037,1.33-3.318
+						c-0.01-0.291-0.246-0.519-0.524-0.514c-0.277,0.011-0.496,0.248-0.496,0.541c0.033,1.983-1.665,3.629-3.808,3.49
+						c-3.33-0.222-6.642-0.816-9.858-1.83c-1.325-0.416-2.226-1.488-2.46-2.675l-0.601,1.451c0.543,1.003,1.498,1.847,2.741,2.237
+						C193.16,513.232,196.553,513.842,199.965,514.068z"/>
+					<path fill="#988A38" d="M199.972,513.979c1.343,0.088,2.585-0.379,3.449-1.179c0.869-0.801,1.357-1.921,1.348-3.144
+						c-0.001-0.256-0.209-0.46-0.453-0.456c-0.244,0.003-0.437,0.209-0.442,0.463c-0.018,1.943-1.717,3.519-3.84,3.38
+						c-3.333-0.221-6.648-0.816-9.868-1.831c-1.324-0.416-2.225-1.466-2.479-2.638l-0.529,1.277c0.525,1.01,1.472,1.86,2.726,2.254
+						C193.179,513.146,196.567,513.754,199.972,513.979z"/>
+					<path fill="#988A38" d="M199.978,513.89c2.618,0.174,4.683-1.745,4.698-4.067c0.004-0.22-0.172-0.399-0.382-0.4
+						c-0.211-0.001-0.378,0.174-0.389,0.392c-0.07,1.901-1.768,3.406-3.875,3.269c-3.336-0.221-6.654-0.816-9.876-1.833
+						c-1.325-0.415-2.223-1.446-2.497-2.604l-0.457,1.104c0.505,1.017,1.449,1.874,2.71,2.271
+						C193.199,513.058,196.58,513.664,199.978,513.89z"/>
+					<path fill="#988A38" d="M199.983,513.799c2.511,0.166,4.501-1.604,4.598-3.813c0.011-0.184-0.135-0.336-0.311-0.342
+						c-0.177-0.003-0.319,0.137-0.335,0.315c-0.132,1.866-1.817,3.296-3.908,3.16c-3.339-0.222-6.66-0.817-9.885-1.835
+						c-1.324-0.416-2.223-1.424-2.516-2.565l-0.384,0.928c0.486,1.024,1.424,1.889,2.695,2.288
+						C193.219,512.969,196.592,513.574,199.983,513.799z"/>
+					<path fill="#988A38" d="M199.989,513.711c2.403,0.158,4.31-1.468,4.497-3.562c0.016-0.146-0.098-0.273-0.24-0.284
+						c-0.143-0.009-0.262,0.101-0.279,0.244c-0.204,1.824-1.871,3.184-3.942,3.048c-3.342-0.222-6.666-0.818-9.895-1.836
+						c-1.323-0.416-2.222-1.403-2.533-2.532l-0.313,0.755c0.468,1.032,1.401,1.903,2.679,2.306
+						C193.238,512.882,196.605,513.486,199.989,513.711z"/>
+					<path fill="#FEE676" d="M199.995,513.621c2.296,0.15,4.092-1.334,4.396-3.306c0.019-0.112-0.061-0.214-0.167-0.228
+						c-0.108-0.013-0.204,0.062-0.223,0.17c-0.298,1.779-1.923,3.072-3.977,2.938c-3.345-0.222-6.672-0.819-9.904-1.838
+						c-1.323-0.416-2.22-1.383-2.551-2.496l-0.24,0.579c0.449,1.039,1.376,1.918,2.663,2.323
+						C193.258,512.794,196.618,513.397,199.995,513.621z"/>
+					<path fill="#FEE676" d="M200.001,513.531c2.189,0.142,3.849-1.211,4.293-3.052c0.02-0.077-0.023-0.15-0.096-0.17
+						c-0.073-0.019-0.146,0.025-0.166,0.097c-0.431,1.725-1.976,2.959-4.012,2.828c-3.348-0.223-6.678-0.82-9.913-1.84
+						c-1.323-0.416-2.22-1.362-2.57-2.461l-0.167,0.403c0.43,1.049,1.352,1.935,2.647,2.342
+						C193.277,512.706,196.63,513.309,200.001,513.531z"/>
+					<path d="M188.257,509.412c-0.214-0.365-0.355-0.765-0.417-1.176l-0.011-0.005l-0.275,0.664
+						c0.302,0.161,0.552,0.349,0.693,0.526L188.257,509.412z"/>
+					<path fill="#988A38" d="M200.606,504.284c1.14,0.074,2.159,0.639,2.962,1.52c0.797,0.881,1.379,2.083,1.457,3.447
+						c0.022,0.441,0.393,0.779,0.825,0.749c0.432-0.033,0.748-0.422,0.706-0.862c-0.174-1.798-1-3.359-2.051-4.495
+						c-1.064-1.139-2.357-1.859-3.794-1.954c-2.538-0.167-5.054-0.624-7.422-1.372c-0.835-0.263-1.689-0.278-2.555-0.098
+						l-0.743,1.793c0.932-0.362,1.891-0.462,2.816-0.17C195.307,503.631,197.946,504.108,200.606,504.284z"/>
+					<path fill="#988A38" d="M200.614,504.156c1.126,0.073,2.148,0.621,2.964,1.479c0.809,0.858,1.414,2.031,1.525,3.375
+						c0.029,0.395,0.356,0.689,0.738,0.659c0.381-0.033,0.66-0.382,0.614-0.774c-0.199-1.727-1.017-3.218-2.05-4.302
+						c-1.044-1.085-2.308-1.771-3.698-1.862c-2.541-0.168-5.06-0.626-7.431-1.374c-0.835-0.264-1.693-0.276-2.562-0.089l-0.663,1.6
+						c0.93-0.348,1.879-0.436,2.795-0.147C195.334,503.505,197.963,503.981,200.614,504.156z"/>
+					<path fill="#988A38" d="M200.623,504.028c1.113,0.072,2.136,0.602,2.962,1.437c0.82,0.835,1.447,1.979,1.594,3.304
+						c0.036,0.345,0.321,0.601,0.652,0.568c0.332-0.035,0.575-0.344,0.527-0.688c-0.226-1.656-1.038-3.072-2.053-4.106
+						c-1.025-1.029-2.257-1.682-3.601-1.77c-2.544-0.168-5.067-0.626-7.442-1.376c-0.835-0.263-1.696-0.272-2.57-0.081l-0.582,1.405
+						c0.926-0.331,1.867-0.41,2.773-0.125C195.363,503.379,197.982,503.854,200.623,504.028z"/>
+					<path fill="#988A38" d="M200.631,503.899c1.101,0.071,2.123,0.584,2.959,1.395c0.83,0.812,1.479,1.929,1.664,3.234
+						c0.04,0.295,0.285,0.511,0.567,0.479c0.282-0.038,0.488-0.307,0.441-0.602c-0.257-1.588-1.062-2.932-2.058-3.912
+						c-1.005-0.977-2.207-1.591-3.503-1.677c-2.548-0.168-5.073-0.627-7.452-1.378c-0.836-0.264-1.699-0.269-2.577-0.074
+						l-0.501,1.211c0.922-0.312,1.856-0.384,2.751-0.102C195.391,503.253,198,503.726,200.631,503.899z"/>
+					<path fill="#988A38" d="M200.64,503.771c1.087,0.07,2.108,0.563,2.954,1.352c0.839,0.787,1.509,1.878,1.735,3.165
+						c0.042,0.245,0.249,0.423,0.484,0.388c0.234-0.037,0.401-0.27,0.354-0.515c-0.29-1.519-1.088-2.794-2.065-3.72
+						c-0.985-0.925-2.153-1.5-3.403-1.584c-2.551-0.168-5.08-0.627-7.461-1.38c-0.836-0.263-1.704-0.264-2.584-0.064l-0.421,1.017
+						c0.918-0.296,1.844-0.357,2.73-0.078C195.419,503.127,198.019,503.598,200.64,503.771z"/>
+					<path fill="#988A38" d="M200.648,503.644c2.111,0.149,4.162,1.871,4.753,4.401c0.042,0.198,0.213,0.337,0.4,0.299
+						c0.188-0.037,0.316-0.229,0.271-0.429c-0.331-1.447-1.123-2.654-2.079-3.524c-0.963-0.872-2.094-1.409-3.296-1.489
+						c-2.554-0.169-5.086-0.628-7.472-1.382c-0.837-0.264-1.707-0.262-2.592-0.058l-0.341,0.822
+						c0.917-0.281,1.832-0.331,2.708-0.055C195.447,503.002,198.037,503.471,200.648,503.644z"/>
+					<path fill="#FEE676" d="M200.656,503.516c2.088,0.146,4.108,1.807,4.817,4.289c0.04,0.148,0.178,0.247,0.318,0.208
+						c0.141-0.038,0.229-0.191,0.186-0.343c-0.382-1.372-1.168-2.509-2.101-3.328c-0.938-0.818-2.028-1.321-3.183-1.396
+						c-2.558-0.169-5.093-0.63-7.482-1.384c-0.837-0.264-1.711-0.259-2.598-0.054l-0.261,0.631c0.913-0.264,1.82-0.305,2.687-0.032
+						C195.476,502.876,198.055,503.344,200.656,503.516z"/>
+					<path fill="#FEE676" d="M200.665,503.388c2.068,0.144,4,1.763,4.879,4.177c0.036,0.104,0.142,0.157,0.237,0.117
+						c0.095-0.039,0.142-0.154,0.104-0.258c-0.962-2.568-3.014-4.286-5.195-4.436c-2.561-0.17-5.1-0.63-7.493-1.386
+						c-0.838-0.264-1.713-0.256-2.605-0.045l-0.181,0.437c0.909-0.249,1.808-0.28,2.665-0.01
+						C195.503,502.75,198.073,503.216,200.665,503.388z"/>
+					<path d="M192.325,504.368c2.631,0.83,5.393,1.33,8.176,1.514c1.589,0.11,3.16,1.573,3.23,3.418
+						c-0.076-2.126-1.396-3.801-3.198-3.917c-2.745-0.182-5.469-0.674-8.058-1.491c-1.129-0.355-2.257,0.004-3.19,0.821
+						l-0.139,0.336C190.091,504.331,191.291,504.045,192.325,504.368z"/>
+					<path d="M203.257,511.128c-0.209,0.363-0.485,0.675-0.815,0.938l0.005,0.014c0.828-0.462,3.538-0.444,3.239,1.143
+						c0.427-0.68,0.698-1.445,0.833-2.252C205.433,511.064,204.345,511.119,203.257,511.128z"/>
+					<path d="M199.835,516.06c2.125,0.139,4.002-0.661,5.201-1.962c1.212-1.303,1.75-3.087,1.567-4.965
+						c0.171,1.74-0.447,3.405-1.694,4.617c-1.233,1.209-3.092,1.94-5.041,1.811c-3.526-0.234-7.036-0.863-10.462-1.943
+						c-1.188-0.373-2.232-1.063-3.012-1.921l-0.192,0.464c0.77,0.851,1.812,1.543,3.054,1.935
+						C192.723,515.188,196.271,515.823,199.835,516.06z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M217.193,504.596l-1.62-3.688c-1.043,1.672-1.554,4.125-0.789,6.532
+						c0.912-0.298,1.809-0.637,2.692-1.008C217.221,505.826,217.149,505.195,217.193,504.596z"/>
+					<path d="M217.477,506.433c-0.23-0.545-0.274-1.123-0.213-1.677l-0.146-0.332C217.091,505.071,217.193,505.761,217.477,506.433z
+						"/>
+					<path fill="#988A38" d="M217.477,506.433c-0.884,0.371-1.781,0.71-2.692,1.008c0.76,2.397,2.859,4.063,5.475,4.136l-1.62-3.688
+						C218.139,507.529,217.732,507.04,217.477,506.433z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M214.829,507.426c0.137,0.421,0.583,0.649,0.993,0.506c0.408-0.146,0.61-0.605,0.452-1.018
+								c-0.544-1.454-0.398-2.966,0.103-4.177l-0.781-1.776C214.571,502.623,214.07,505.05,214.829,507.426z"/>
+							<path fill="#988A38" d="M214.821,507.163c0.116,0.379,0.512,0.584,0.875,0.462c0.363-0.124,0.542-0.527,0.409-0.898
+								c-0.499-1.449-0.304-2.941,0.208-4.133l-0.696-1.585C214.639,502.6,214.138,504.898,214.821,507.163z"/>
+							<path fill="#988A38" d="M214.814,506.9c0.094,0.333,0.439,0.519,0.758,0.418c0.316-0.104,0.474-0.453,0.368-0.781
+								c-0.453-1.443-0.213-2.917,0.31-4.087l-0.612-1.393C214.709,502.578,214.208,504.748,214.814,506.9z"/>
+							<path fill="#988A38" d="M214.803,506.638c0.076,0.29,0.371,0.455,0.646,0.376c0.272-0.085,0.408-0.381,0.326-0.667
+								c-0.404-1.439-0.121-2.889,0.413-4.042l-0.527-1.199C214.778,502.555,214.277,504.596,214.803,506.638z"/>
+							<path fill="#988A38" d="M214.793,506.377c0.057,0.243,0.301,0.39,0.53,0.329c0.229-0.063,0.347-0.307,0.286-0.548
+								c-0.351-1.439-0.032-2.865,0.514-3.997l-0.442-1.008C214.848,502.529,214.351,504.44,214.793,506.377z"/>
+							<path fill="#988A38" d="M214.781,506.114c0.039,0.199,0.232,0.326,0.419,0.286c0.184-0.045,0.287-0.237,0.245-0.436
+								c-0.292-1.437,0.058-2.837,0.615-3.948l-0.358-0.815C214.918,502.506,214.43,504.283,214.781,506.114z"/>
+							<path fill="#FEE676" d="M214.768,505.853c0.022,0.154,0.164,0.262,0.308,0.241c0.144-0.023,0.23-0.166,0.207-0.32
+								c-0.218-1.438,0.145-2.811,0.715-3.9l-0.274-0.625C214.989,502.484,214.521,504.123,214.768,505.853z"/>
+							<path fill="#FEE676" d="M214.754,505.591c0.008,0.11,0.096,0.197,0.199,0.195c0.102-0.002,0.177-0.096,0.168-0.204
+								c-0.111-1.439,0.231-2.784,0.813-3.853l-0.19-0.433C215.06,502.46,214.642,503.962,214.754,505.591z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M217.431,506.528c-0.128-0.304-0.461-0.449-0.75-0.336c-0.285,0.115-0.426,0.454-0.311,0.76
+								c0.53,1.406,1.659,2.408,3.073,2.768l-0.707-1.61C218.168,507.736,217.711,507.197,217.431,506.528z"/>
+							<path fill="#988A38" d="M217.463,506.677c-0.119-0.269-0.414-0.396-0.673-0.292c-0.253,0.106-0.376,0.412-0.269,0.683
+								c0.514,1.29,1.574,2.206,2.877,2.55l-0.631-1.438C218.208,507.826,217.748,507.316,217.463,506.677z"/>
+							<path fill="#988A38" d="M217.495,506.823c-0.109-0.231-0.368-0.34-0.593-0.244c-0.223,0.096-0.328,0.367-0.229,0.603
+								c0.497,1.172,1.486,2.003,2.681,2.335l-0.556-1.266C218.25,507.915,217.784,507.436,217.495,506.823z"/>
+							<path fill="#988A38" d="M217.527,506.973c-0.099-0.197-0.322-0.287-0.515-0.2c-0.191,0.085-0.278,0.321-0.188,0.521
+								c0.482,1.058,1.398,1.807,2.485,2.122l-0.48-1.094C218.291,508.005,217.822,507.559,217.527,506.973z"/>
+							<path fill="#988A38" d="M217.557,507.119c-0.088-0.159-0.274-0.23-0.435-0.153c-0.159,0.074-0.228,0.275-0.146,0.438
+								c0.467,0.942,1.31,1.608,2.289,1.909l-0.404-0.92C218.333,508.096,217.861,507.677,217.557,507.119z"/>
+							<path fill="#988A38" d="M217.589,507.269c-0.077-0.128-0.228-0.179-0.355-0.11c-0.126,0.066-0.177,0.229-0.105,0.357
+								c0.455,0.826,1.217,1.409,2.091,1.695l-0.329-0.748C218.374,508.184,217.902,507.795,217.589,507.269z"/>
+							<path fill="#FEE676" d="M217.621,507.418c-0.063-0.093-0.181-0.126-0.275-0.067c-0.093,0.056-0.122,0.183-0.062,0.276
+								c0.45,0.706,1.125,1.209,1.892,1.482l-0.252-0.574C218.416,508.275,217.95,507.912,217.621,507.418z"/>
+							<path fill="#FEE676" d="M217.651,507.565c-0.048-0.06-0.133-0.068-0.192-0.022c-0.06,0.048-0.069,0.133-0.021,0.194
+								c0.452,0.574,1.033,1.014,1.692,1.269l-0.175-0.399C218.456,508.363,218.011,508.017,217.651,507.565z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M214.784,507.44c0.796,2.507,2.893,4.282,5.575,4.363l-0.202-0.46C217.607,511.277,215.506,509.721,214.784,507.44z"
+							/>
+					</g>
+					<path d="M215.598,509.09c0.447,0.686,1.005,1.276,1.675,1.725c-0.713-1.027,0.608-1.897,1.665-2.246l-0.289-0.658
+						C217.649,508.351,216.63,508.741,215.598,509.09z"/>
+					<path fill="#988A38" d="M201.391,507.851c-0.76-0.028-1.429,0.592-1.497,1.417c-0.067,0.824,0.55,1.55,1.38,1.581
+						c5.881,0.223,11.84-0.783,17.273-3.168l-1.206-2.747C212.369,507.119,206.847,508.06,201.391,507.851z"/>
+					<path d="M201.284,510.6c-0.688-0.023-1.447-0.645-1.39-1.332c-0.077,0.96,0.397,1.792,1.371,1.831
+						c5.916,0.225,11.912-0.787,17.384-3.188l-0.201-0.458C213.052,509.82,207.129,510.821,201.284,510.6z"/>
+					<g>
+						<path fill="#988A38" d="M201.203,508.042c-0.408-0.016-0.763,0.317-0.792,0.761c-0.029,0.438,0.294,0.822,0.722,0.839
+							c5.772,0.248,11.625-0.729,16.933-3.06l-0.644-1.466C212.364,507.338,206.746,508.281,201.203,508.042z"/>
+						<path fill="#988A38" d="M201.625,508.118c-0.366-0.011-0.679,0.292-0.7,0.687c-0.021,0.392,0.271,0.729,0.653,0.74
+							c5.612,0.182,11.287-0.806,16.441-3.066l-0.574-1.307C212.508,507.339,207.038,508.294,201.625,508.118z"/>
+						<path fill="#988A38" d="M202.047,508.188c-0.322-0.007-0.595,0.265-0.609,0.612c-0.014,0.347,0.248,0.637,0.582,0.645
+							c5.453,0.115,10.951-0.873,15.955-3.069l-0.506-1.151C212.652,507.341,207.33,508.301,202.047,508.188z"/>
+						<path fill="#988A38" d="M202.473,508.254c-0.28-0.005-0.513,0.235-0.521,0.535c-0.008,0.301,0.22,0.547,0.509,0.552
+							c5.292,0.053,10.616-0.938,15.468-3.067l-0.437-0.995C212.796,507.34,207.622,508.307,202.473,508.254z"/>
+						<path fill="#988A38" d="M202.898,508.314c-0.235-0.001-0.43,0.205-0.433,0.457c-0.003,0.253,0.191,0.458,0.433,0.459
+							c5.133-0.004,10.283-0.995,14.986-3.06l-0.369-0.839C212.941,507.341,207.913,508.312,202.898,508.314z"/>
+						<path fill="#FEE676" d="M203.325,508.372c-0.191,0.002-0.347,0.169-0.347,0.374c0,0.207,0.16,0.371,0.355,0.369
+							c4.973-0.059,9.952-1.047,14.505-3.048l-0.298-0.68C213.088,507.345,208.205,508.315,203.325,508.372z"/>
+						<path fill="#FEE676" d="M203.754,508.423c-0.147,0.005-0.267,0.135-0.264,0.292c0.002,0.157,0.127,0.283,0.277,0.278
+							c4.813-0.107,9.621-1.096,14.026-3.03l-0.229-0.521C213.233,507.343,208.498,508.317,203.754,508.423z"/>
+						<path fill="#FEE676" d="M204.184,508.471c-0.104,0.005-0.186,0.096-0.183,0.206c0.003,0.11,0.091,0.198,0.196,0.194
+							c4.654-0.157,9.292-1.139,13.552-3.01l-0.161-0.366C213.38,507.344,208.791,508.316,204.184,508.471z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M220.95,504.804c-0.414-0.718-1.278-0.995-1.95-0.654c-0.545,0.276-1.098,0.538-1.658,0.784
+						l1.207,2.747c0.612-0.27,1.217-0.556,1.813-0.858C221.109,506.443,221.363,505.521,220.95,504.804z"/>
+					<g>
+						<path fill="#988A38" d="M220.351,504.566c-0.216-0.389-0.676-0.541-1.037-0.355c-0.62,0.322-1.251,0.624-1.892,0.905
+							l0.644,1.466c0.672-0.295,1.334-0.611,1.986-0.95C220.434,505.436,220.564,504.951,220.351,504.566z"/>
+						<path fill="#988A38" d="M220.066,504.689c-0.187-0.346-0.6-0.488-0.923-0.322c-0.558,0.284-1.124,0.552-1.697,0.804
+							l0.574,1.308c0.599-0.264,1.189-0.543,1.772-0.84C220.132,505.464,220.255,505.037,220.066,504.689z"/>
+						<path fill="#988A38" d="M219.781,504.812c-0.162-0.308-0.524-0.435-0.812-0.291c-0.493,0.249-0.994,0.481-1.5,0.704
+							l0.506,1.151c0.525-0.231,1.046-0.473,1.558-0.732C219.834,505.494,219.944,505.119,219.781,504.812z"/>
+						<path fill="#988A38" d="M219.496,504.93c-0.137-0.267-0.451-0.381-0.701-0.258c-0.429,0.212-0.863,0.414-1.302,0.606
+							l0.437,0.994c0.453-0.198,0.902-0.407,1.345-0.626C219.535,505.519,219.635,505.197,219.496,504.93z"/>
+						<path fill="#988A38" d="M219.21,505.047c-0.115-0.228-0.378-0.327-0.59-0.224c-0.365,0.175-0.732,0.347-1.104,0.509
+							l0.369,0.839c0.381-0.167,0.759-0.343,1.134-0.523C219.238,505.541,219.323,505.272,219.21,505.047z"/>
+						<path fill="#FEE676" d="M218.922,505.158c-0.09-0.184-0.304-0.267-0.479-0.185l-0.902,0.413l0.299,0.681l0.923-0.422
+							C218.942,505.561,219.013,505.345,218.922,505.158z"/>
+						<path fill="#FEE676" d="M218.634,505.27c-0.07-0.146-0.235-0.21-0.369-0.146l-0.701,0.316l0.229,0.522l0.713-0.322
+							C218.644,505.575,218.703,505.414,218.634,505.27z"/>
+						<path fill="#FEE676" d="M218.345,505.378c-0.048-0.103-0.164-0.149-0.26-0.105l-0.496,0.223l0.161,0.366l0.502-0.226
+							C218.349,505.592,218.393,505.479,218.345,505.378z"/>
+					</g>
+					<path d="M220.95,504.804c0.346,0.597-0.083,1.485-0.701,1.796c-0.592,0.302-1.193,0.585-1.8,0.853l0.201,0.458
+						c0.616-0.271,1.226-0.56,1.827-0.864C221.351,506.598,221.429,505.639,220.95,504.804z"/>
+					<path fill="#988A38" d="M232.555,504.047c1.493-1.526,2.141-3.527,1.936-5.291c-0.19-1.775-1.201-3.299-2.738-4.269
+						c-1.543-0.975-3.288-1.284-4.821-1.206c-1.539,0.094-2.87,0.568-3.817,1.54c-1.663,1.702-3.589,3.155-5.716,4.325
+						c-0.751,0.411-1.347,1.018-1.823,1.761l1.62,3.688c0.095-1.21,0.666-2.279,1.65-2.82c2.373-1.306,4.541-2.938,6.415-4.857
+						c1.146-1.166,3.245-1.269,4.84-0.081c1.59,1.181,1.827,3.562,0.311,5.113c-2.38,2.438-5.13,4.505-8.093,6.133
+						c-1.244,0.685-2.675,0.538-3.676-0.195l1.62,3.688c1.148,0.037,2.369-0.243,3.503-0.865
+						C226.971,508.948,229.964,506.702,232.555,504.047z"/>
+					<path fill="#988A38" d="M231.37,502.89c1.048-1.073,1.517-2.481,1.399-3.765c-0.111-1.29-0.794-2.446-1.887-3.222
+						c-0.267-0.19-0.631-0.143-0.813,0.105c-0.186,0.246-0.127,0.61,0.132,0.812c1.628,1.202,1.875,3.642,0.317,5.237
+						c-2.391,2.449-5.154,4.524-8.129,6.159c-1.228,0.676-2.638,0.569-3.653-0.107l0.707,1.609c1.105,0.288,2.373,0.171,3.52-0.459
+						C226.036,507.571,228.895,505.425,231.37,502.89z"/>
+					<path fill="#988A38" d="M231.306,502.828c1.01-1.034,1.477-2.383,1.387-3.621c-0.083-1.243-0.71-2.365-1.738-3.131
+						c-0.235-0.174-0.56-0.132-0.728,0.088c-0.168,0.219-0.117,0.541,0.11,0.726c1.564,1.212,1.754,3.612,0.208,5.195
+						c-2.394,2.452-5.159,4.529-8.137,6.166c-1.227,0.675-2.623,0.583-3.641-0.071l0.631,1.438c1.099,0.307,2.368,0.199,3.522-0.436
+						C225.986,507.497,228.837,505.356,231.306,502.828z"/>
+					<path fill="#988A38" d="M231.243,502.765c0.971-0.994,1.435-2.282,1.372-3.474c-0.056-1.194-0.627-2.283-1.587-3.038
+						c-0.202-0.159-0.489-0.123-0.639,0.069c-0.153,0.188-0.11,0.471,0.083,0.635c1.501,1.227,1.635,3.584,0.102,5.153
+						c-2.396,2.454-5.165,4.534-8.146,6.173c-1.227,0.675-2.606,0.598-3.627-0.032l0.556,1.266c1.091,0.324,2.36,0.227,3.523-0.412
+						C225.936,507.424,228.781,505.287,231.243,502.765z"/>
+					<path fill="#988A38" d="M231.178,502.702c0.932-0.956,1.393-2.183,1.356-3.326c-0.03-1.146-0.546-2.201-1.438-2.948
+						c-0.169-0.141-0.417-0.112-0.551,0.05c-0.135,0.161-0.102,0.403,0.061,0.549c1.437,1.241,1.515,3.555-0.006,5.11
+						c-2.399,2.458-5.171,4.539-8.156,6.18c-1.227,0.675-2.591,0.61-3.615,0.004l0.48,1.095c1.083,0.343,2.354,0.253,3.524-0.39
+						C225.885,507.349,228.722,505.218,231.178,502.702z"/>
+					<path fill="#988A38" d="M231.114,502.639c0.894-0.918,1.349-2.083,1.339-3.177c-0.008-1.1-0.464-2.114-1.287-2.857
+						c-0.137-0.124-0.345-0.103-0.462,0.029c-0.115,0.135-0.094,0.335,0.035,0.46c1.37,1.269,1.396,3.524-0.113,5.068
+						c-2.401,2.46-5.176,4.545-8.164,6.187c-1.227,0.675-2.575,0.626-3.601,0.045l0.404,0.919c1.075,0.361,2.347,0.281,3.526-0.366
+						C225.834,507.273,228.664,505.148,231.114,502.639z"/>
+					<path fill="#988A38" d="M231.05,502.577c1.742-1.783,1.704-4.347,0.185-5.795c-0.104-0.104-0.274-0.094-0.373,0.009
+						c-0.099,0.105-0.086,0.269,0.014,0.371c1.293,1.3,1.273,3.497-0.222,5.027c-2.404,2.463-5.182,4.549-8.173,6.193
+						c-1.227,0.674-2.56,0.64-3.589,0.08l0.329,0.748c1.068,0.381,2.341,0.309,3.527-0.343
+						C225.784,507.199,228.607,505.079,231.05,502.577z"/>
+					<path fill="#FEE676" d="M230.986,502.515c1.661-1.703,1.667-4.087,0.318-5.553c-0.075-0.085-0.206-0.087-0.284-0.013
+						c-0.08,0.075-0.08,0.199-0.008,0.282c1.198,1.346,1.152,3.468-0.331,4.985c-2.407,2.466-5.188,4.555-8.182,6.2
+						c-1.227,0.674-2.543,0.652-3.576,0.118l0.252,0.574c1.06,0.397,2.335,0.336,3.529-0.319
+						C225.734,507.125,228.549,505.011,230.986,502.515z"/>
+					<path fill="#FEE676" d="M230.922,502.451c1.581-1.623,1.605-3.813,0.448-5.312c-0.047-0.063-0.132-0.075-0.193-0.031
+						c-0.061,0.045-0.072,0.131-0.03,0.19c1.072,1.417,1.03,3.439-0.439,4.945c-2.41,2.468-5.194,4.56-8.191,6.207
+						c-1.226,0.673-2.528,0.666-3.562,0.156l0.175,0.399c1.054,0.419,2.329,0.365,3.531-0.295
+						C225.683,507.051,228.491,504.941,230.922,502.451z"/>
+					<path d="M219.798,508.421c-0.414-0.104-0.796-0.271-1.137-0.516l-0.012,0.005l0.29,0.658c0.326-0.103,0.632-0.158,0.859-0.134
+						V508.421z"/>
+					<path fill="#988A38" d="M224.294,495.975c0.76-0.779,1.811-1.165,2.987-1.216c1.171-0.044,2.472,0.252,3.588,1.041
+						c0.357,0.256,0.856,0.178,1.099-0.185c0.245-0.36,0.13-0.852-0.243-1.088c-1.527-0.967-3.259-1.276-4.782-1.199
+						c-1.528,0.093-2.854,0.564-3.794,1.529c-1.667,1.706-3.596,3.162-5.729,4.334c-0.752,0.412-1.35,1.021-1.825,1.769l0.781,1.777
+						c0.388-0.922,0.978-1.686,1.815-2.145C220.456,499.349,222.515,497.796,224.294,495.975z"/>
+					<path fill="#988A38" d="M224.202,495.886c0.751-0.77,1.789-1.167,2.955-1.24c1.159-0.067,2.453,0.195,3.573,0.942
+						c0.325,0.222,0.764,0.152,0.974-0.171c0.212-0.319,0.106-0.757-0.231-0.961c-1.484-0.903-3.158-1.181-4.626-1.085
+						c-1.477,0.108-2.754,0.583-3.667,1.516c-1.669,1.709-3.603,3.167-5.738,4.342c-0.752,0.411-1.35,1.024-1.824,1.78l0.696,1.585
+						c0.397-0.909,0.988-1.659,1.817-2.113C220.383,499.242,222.432,497.698,224.202,495.886z"/>
+					<path fill="#988A38" d="M224.11,495.795c0.741-0.759,1.766-1.167,2.919-1.263c1.147-0.089,2.431,0.139,3.558,0.846
+						c0.29,0.185,0.673,0.125,0.849-0.158c0.18-0.281,0.083-0.664-0.216-0.838c-1.443-0.838-3.058-1.081-4.472-0.966
+						c-1.424,0.123-2.654,0.6-3.537,1.501c-1.673,1.712-3.609,3.173-5.749,4.35c-0.753,0.411-1.35,1.029-1.823,1.79l0.612,1.393
+						c0.407-0.896,0.998-1.633,1.818-2.081C220.311,499.135,222.349,497.599,224.11,495.795z"/>
+					<path fill="#988A38" d="M224.018,495.705c0.732-0.749,1.741-1.168,2.882-1.284c1.136-0.109,2.408,0.084,3.546,0.75
+						c0.255,0.151,0.579,0.096,0.725-0.147c0.147-0.244,0.06-0.572-0.202-0.717c-1.407-0.772-2.959-0.979-4.32-0.847
+						c-1.372,0.139-2.554,0.617-3.408,1.486c-1.675,1.716-3.616,3.179-5.758,4.357c-0.754,0.412-1.35,1.034-1.823,1.801l0.527,1.2
+						c0.417-0.88,1.009-1.606,1.82-2.05C220.238,499.027,222.266,497.499,224.018,495.705z"/>
+					<path fill="#988A38" d="M223.926,495.615c0.722-0.739,1.715-1.167,2.842-1.304c1.123-0.13,2.383,0.029,3.535,0.653
+						c0.218,0.119,0.486,0.068,0.603-0.139c0.116-0.205,0.035-0.479-0.188-0.594c-1.375-0.709-2.861-0.875-4.172-0.729
+						c-1.316,0.156-2.453,0.633-3.274,1.472c-1.678,1.719-3.622,3.185-5.768,4.364c-0.754,0.413-1.35,1.04-1.822,1.813l0.442,1.008
+						c0.427-0.867,1.02-1.58,1.821-2.019C220.166,498.921,222.183,497.399,223.926,495.615z"/>
+					<path fill="#988A38" d="M223.834,495.526c0.712-0.73,1.688-1.164,2.799-1.319c1.109-0.149,2.357-0.022,3.523,0.55
+						c0.184,0.092,0.396,0.043,0.48-0.127c0.087-0.17,0.015-0.387-0.171-0.475c-1.344-0.635-2.77-0.766-4.025-0.603
+						c-1.258,0.174-2.348,0.645-3.138,1.454c-1.682,1.721-3.629,3.189-5.778,4.372c-0.755,0.412-1.35,1.044-1.822,1.822l0.358,0.815
+						c0.436-0.855,1.03-1.553,1.823-1.986C220.093,498.814,222.101,497.301,223.834,495.526z"/>
+					<path fill="#FEE676" d="M223.743,495.437c0.703-0.72,1.657-1.155,2.754-1.326c1.095-0.164,2.331-0.068,3.517,0.442
+						c0.142,0.062,0.301,0.013,0.358-0.119c0.056-0.134-0.011-0.296-0.156-0.356c-1.316-0.548-2.684-0.646-3.883-0.468
+						c-1.201,0.188-2.24,0.648-2.999,1.428c-1.685,1.725-3.635,3.195-5.788,4.38c-0.755,0.412-1.351,1.05-1.823,1.83l0.275,0.625
+						c0.446-0.84,1.04-1.525,1.824-1.954C220.021,498.708,222.018,497.202,223.743,495.437z"/>
+					<path fill="#FEE676" d="M223.651,495.347c0.693-0.711,1.624-1.135,2.702-1.314c1.078-0.173,2.303-0.104,3.514,0.317
+						c0.103,0.035,0.209-0.015,0.236-0.113c0.028-0.098-0.034-0.204-0.138-0.239c-1.295-0.434-2.607-0.501-3.747-0.313
+						c-1.146,0.192-2.126,0.637-2.854,1.383c-1.688,1.729-3.642,3.202-5.798,4.388c-0.755,0.413-1.351,1.054-1.821,1.843
+						l0.189,0.432c0.456-0.826,1.051-1.5,1.825-1.923C219.949,498.601,221.935,497.104,223.651,495.347z"/>
+					<path d="M218.965,501.995c2.394-1.316,4.583-2.965,6.474-4.901c1.08-1.091,3.176-1.357,4.66-0.257
+						c-1.707-1.275-3.807-1.332-5.018-0.093c-1.856,1.901-4.004,3.52-6.357,4.813c-1.025,0.562-1.549,1.626-1.606,2.866l0.146,0.332
+						C217.401,503.58,218.026,502.514,218.965,501.995z"/>
+					<path d="M231.215,498.362c0.144,0.393,0.21,0.81,0.197,1.235l0.013,0.005c0.194-0.937,1.915-3.076,2.969-1.856
+						c-0.263-0.768-0.692-1.455-1.247-2.062C232.551,496.61,231.902,497.501,231.215,498.362z"/>
+					<path d="M232.734,504.222c1.557-1.594,2.181-3.629,1.931-5.407c-0.237-1.792-1.32-3.321-2.913-4.327
+						c1.477,0.931,2.415,2.451,2.559,4.212c0.159,1.749-0.509,3.718-1.935,5.173c-2.574,2.637-5.546,4.868-8.734,6.62
+						c-1.107,0.607-2.329,0.888-3.485,0.852l0.202,0.46c1.143,0.038,2.366-0.238,3.524-0.873
+						C227.112,509.156,230.125,506.895,232.734,504.222z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M234.206,483.447l-3.998-0.488c0.959,1.374,2.665,3.214,5.075,3.971
+						c0.283-0.914,0.499-1.848,0.645-2.792C235.277,484.039,234.703,483.771,234.206,483.447z"/>
+					<path d="M235.927,484.138c-0.584-0.089-1.099-0.351-1.547-0.669l-0.36-0.044C234.567,483.756,235.207,484.028,235.927,484.138z
+						"/>
+					<path fill="#988A38" d="M235.927,484.138c-0.146,0.944-0.362,1.878-0.645,2.792c2.379,0.754,5.026,0.018,6.491-2.56
+						l-3.998-0.487C237.211,484.146,236.579,484.237,235.927,484.138z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M235.297,486.884c0.424,0.132,0.866-0.105,0.982-0.53c0.112-0.427-0.159-0.853-0.592-0.947
+								c-1.513-0.349-2.691-1.293-3.495-2.205l-1.927-0.234C231.225,484.329,232.914,486.142,235.297,486.884z"/>
+							<path fill="#988A38" d="M235.078,486.74c0.375,0.123,0.771-0.085,0.877-0.456c0.102-0.373-0.133-0.747-0.517-0.841
+								c-1.484-0.387-2.62-1.356-3.402-2.261l-1.719-0.21C231.236,484.277,232.829,486.002,235.078,486.74z"/>
+							<path fill="#988A38" d="M234.857,486.596c0.327,0.114,0.677-0.062,0.775-0.382c0.09-0.321-0.109-0.646-0.442-0.739
+								c-1.453-0.424-2.55-1.415-3.311-2.312l-1.51-0.184C231.249,484.226,232.746,485.861,234.857,486.596z"/>
+							<path fill="#988A38" d="M234.635,486.455c0.281,0.104,0.585-0.041,0.675-0.312c0.083-0.271-0.088-0.548-0.371-0.638
+								c-1.422-0.464-2.476-1.472-3.217-2.361l-1.301-0.159C231.261,484.174,232.662,485.721,234.635,486.455z"/>
+							<path fill="#988A38" d="M234.415,486.315c0.232,0.092,0.492-0.021,0.572-0.242c0.075-0.223-0.061-0.454-0.296-0.538
+								c-1.39-0.509-2.406-1.527-3.125-2.41l-1.092-0.133C231.271,484.119,232.577,485.575,234.415,486.315z"/>
+							<path fill="#988A38" d="M234.193,486.178c0.186,0.08,0.399-0.003,0.472-0.177c0.065-0.174-0.037-0.363-0.225-0.438
+								c-1.353-0.559-2.334-1.581-3.029-2.456l-0.884-0.108C231.284,484.066,232.49,485.427,234.193,486.178z"/>
+							<path fill="#FEE676" d="M233.969,486.041c0.14,0.068,0.309,0.016,0.373-0.111c0.06-0.128-0.011-0.276-0.151-0.342
+								c-1.312-0.619-2.266-1.633-2.937-2.501l-0.678-0.083C231.3,484.014,232.405,485.27,233.969,486.041z"/>
+							<path fill="#FEE676" d="M233.746,485.906c0.095,0.055,0.217,0.033,0.272-0.049c0.055-0.083,0.019-0.194-0.076-0.246
+								c-1.254-0.705-2.198-1.681-2.844-2.543l-0.469-0.058C231.312,483.96,232.336,485.089,233.746,485.906z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M235.985,484.227c-0.327-0.049-0.621,0.153-0.682,0.451c-0.054,0.297,0.15,0.597,0.47,0.669
+								c1.461,0.331,2.939-0.067,3.988-1.222l-1.746-0.213C237.403,484.231,236.701,484.339,235.985,484.227z"/>
+							<path fill="#988A38" d="M236.128,484.277c-0.293-0.039-0.551,0.143-0.604,0.412c-0.044,0.268,0.143,0.536,0.427,0.595
+								c1.358,0.278,2.72-0.118,3.699-1.173l-1.559-0.19C237.5,484.244,236.821,484.371,236.128,484.277z"/>
+							<path fill="#988A38" d="M236.27,484.327c-0.255-0.028-0.479,0.134-0.522,0.373c-0.039,0.238,0.133,0.474,0.384,0.519
+								c1.251,0.226,2.5-0.165,3.41-1.121l-1.372-0.167C237.598,484.255,236.942,484.404,236.27,484.327z"/>
+							<path fill="#988A38" d="M236.414,484.379c-0.221-0.019-0.411,0.124-0.444,0.332c-0.031,0.207,0.121,0.408,0.338,0.441
+								c1.146,0.17,2.285-0.208,3.124-1.067l-1.186-0.146C237.697,484.266,237.066,484.437,236.414,484.379z"/>
+							<path fill="#988A38" d="M236.554,484.431c-0.183-0.009-0.338,0.113-0.361,0.291c-0.023,0.174,0.109,0.342,0.291,0.362
+								c1.044,0.117,2.071-0.25,2.837-1.013l-0.997-0.122C237.795,484.277,237.187,484.466,236.554,484.431z"/>
+							<path fill="#988A38" d="M236.698,484.482c-0.149-0.002-0.27,0.103-0.282,0.247c-0.012,0.143,0.097,0.274,0.244,0.283
+								c0.94,0.062,1.851-0.286,2.55-0.955l-0.811-0.099C237.893,484.288,237.31,484.493,236.698,484.482z"/>
+							<path fill="#FEE676" d="M236.841,484.535c-0.113,0.006-0.201,0.091-0.204,0.202c-0.003,0.11,0.087,0.204,0.199,0.204
+								c0.837-0.003,1.631-0.324,2.263-0.897l-0.623-0.076C237.992,484.299,237.434,484.515,236.841,484.535z"/>
+							<path fill="#FEE676" d="M236.983,484.589c-0.077,0.01-0.128,0.079-0.123,0.155c0.008,0.078,0.074,0.132,0.152,0.124
+								c0.728-0.08,1.415-0.357,1.975-0.838l-0.433-0.053C238.089,484.31,237.555,484.518,236.983,484.589z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M235.283,486.93c2.488,0.786,5.226,0.121,6.738-2.529l-0.499-0.061C240.101,486.843,237.545,487.648,235.283,486.93z"
+							/>
+					</g>
+					<path d="M237.101,487.207c0.819,0.02,1.629-0.114,2.382-0.453c-1.253,0.03-1.209-1.616-0.97-2.781l-0.714-0.087
+						C237.662,485.026,237.414,486.133,237.101,487.207z"/>
+					<path fill="#988A38" d="M227.5,497.706c-0.503,0.555-0.467,1.46,0.113,2.051c0.579,0.591,1.538,0.593,2.104-0.03
+						c2.008-2.2,3.696-4.686,5.029-7.337c1.305-2.644,2.417-5.445,2.804-8.534l-2.978-0.363c-0.311,2.552-1.302,5.119-2.506,7.549
+						C230.851,493.461,229.318,495.713,227.5,497.706z"/>
+					<path d="M229.533,499.559c-0.466,0.519-1.438,0.692-1.919,0.198c0.675,0.687,1.623,0.868,2.289,0.139
+						c2.023-2.218,3.726-4.723,5.068-7.393c1.313-2.662,2.435-5.483,2.829-8.617l-0.497-0.061c-0.381,3.044-1.483,5.826-2.779,8.452
+						C233.201,494.91,231.525,497.375,229.533,499.559z"/>
+					<g>
+						<path fill="#988A38" d="M227.526,497.972c-0.271,0.298-0.248,0.78,0.07,1.091c0.314,0.308,0.817,0.312,1.106-0.006
+							c1.967-2.127,3.622-4.539,4.929-7.122c1.288-2.582,2.365-5.319,2.728-8.225l-1.589-0.194
+							c-0.321,2.615-1.334,5.229-2.567,7.695C230.959,493.669,229.391,495.955,227.526,497.972z"/>
+						<path fill="#988A38" d="M227.854,497.701c-0.241,0.27-0.21,0.702,0.077,0.975c0.285,0.269,0.731,0.264,0.986-0.021
+							c1.88-2.094,3.459-4.452,4.714-6.968c1.23-2.519,2.27-5.179,2.616-7.99l-1.417-0.173c-0.311,2.561-1.293,5.107-2.477,7.526
+							C231.152,493.457,229.646,495.705,227.854,497.701z"/>
+						<path fill="#988A38" d="M228.177,497.426c-0.211,0.238-0.175,0.623,0.081,0.856c0.256,0.234,0.645,0.22,0.867-0.031
+							c1.791-2.06,3.304-4.359,4.502-6.812c1.181-2.451,2.177-5.037,2.509-7.757l-1.248-0.152c-0.302,2.505-1.248,4.988-2.39,7.355
+							C231.346,493.245,229.894,495.45,228.177,497.426z"/>
+						<path fill="#988A38" d="M228.495,497.143c-0.18,0.211-0.142,0.542,0.082,0.739c0.227,0.199,0.561,0.182,0.748-0.038
+							c1.711-2.02,3.148-4.268,4.297-6.65c1.134-2.386,2.083-4.896,2.403-7.524l-1.079-0.132c-0.295,2.448-1.201,4.869-2.303,7.184
+							C231.531,493.026,230.145,495.195,228.495,497.143z"/>
+						<path fill="#988A38" d="M228.81,496.855c-0.146,0.182-0.112,0.46,0.079,0.622c0.193,0.164,0.477,0.147,0.629-0.04
+							c1.628-1.981,3.001-4.17,4.099-6.489c1.088-2.318,1.988-4.756,2.297-7.293l-0.909-0.11c-0.29,2.391-1.153,4.748-2.216,7.011
+							C231.722,492.811,230.388,494.935,228.81,496.855z"/>
+						<path fill="#FEE676" d="M229.124,496.565c-0.116,0.15-0.09,0.371,0.071,0.503c0.158,0.13,0.39,0.114,0.51-0.04
+							c1.548-1.94,2.853-4.073,3.902-6.325c1.042-2.253,1.898-4.613,2.196-7.062l-0.737-0.09c-0.283,2.334-1.109,4.628-2.132,6.838
+							C231.908,492.591,230.633,494.672,229.124,496.565z"/>
+						<path fill="#FEE676" d="M229.432,496.268c-0.09,0.115-0.064,0.289,0.059,0.387c0.123,0.098,0.299,0.08,0.392-0.037
+							c1.471-1.898,2.711-3.974,3.71-6.159c0.996-2.188,1.808-4.471,2.095-6.83l-0.566-0.069c-0.277,2.275-1.065,4.506-2.047,6.662
+							C232.093,492.369,230.875,494.406,229.432,496.268z"/>
+						<path fill="#FEE676" d="M229.736,495.966c-0.062,0.082-0.042,0.203,0.046,0.27c0.088,0.067,0.21,0.054,0.273-0.029
+							c1.395-1.854,2.572-3.872,3.527-5.99c0.951-2.123,1.718-4.328,1.996-6.602l-0.397-0.048c-0.271,2.216-1.021,4.383-1.963,6.485
+							C232.274,492.145,231.113,494.136,229.736,495.966z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M236.084,480.328c-0.818,0.116-1.414,0.756-1.41,1.445c0.005,0.568-0.031,1.143-0.101,1.718
+						l2.978,0.363c0.084-0.692,0.129-1.396,0.124-2.105C237.667,480.851,236.905,480.21,236.084,480.328z"/>
+					<g>
+						<path fill="#988A38" d="M235.628,480.779c-0.438,0.043-0.769,0.38-0.757,0.757c0.017,0.654-0.019,1.316-0.1,1.979l1.589,0.193
+							c0.089-0.733,0.13-1.476,0.111-2.217C236.458,481.055,236.07,480.735,235.628,480.779z"/>
+						<path fill="#988A38" d="M235.625,481.08c-0.397,0.028-0.69,0.324-0.692,0.664c0.006,0.589-0.031,1.184-0.103,1.778
+							l1.417,0.173c0.08-0.651,0.12-1.309,0.114-1.966C236.362,481.343,236.015,481.052,235.625,481.08z"/>
+						<path fill="#988A38" d="M235.614,481.381c-0.343,0.016-0.622,0.272-0.619,0.574c-0.005,0.522-0.042,1.049-0.106,1.575
+							l1.248,0.152c0.069-0.57,0.11-1.145,0.115-1.719C236.25,481.626,235.965,481.364,235.614,481.381z"/>
+						<path fill="#988A38" d="M235.594,481.681c-0.303,0.007-0.537,0.225-0.547,0.488c-0.008,0.454-0.045,0.911-0.101,1.368
+							l1.079,0.132c0.06-0.49,0.1-0.982,0.108-1.476C236.144,481.904,235.893,481.674,235.594,481.681z"/>
+						<path fill="#988A38" d="M235.572,481.98c-0.255-0.001-0.466,0.18-0.471,0.402c-0.019,0.387-0.047,0.773-0.097,1.161
+							l0.909,0.111c0.053-0.411,0.083-0.823,0.103-1.236C236.022,482.178,235.825,481.98,235.572,481.98z"/>
+						<path fill="#FEE676" d="M235.537,482.278c-0.206-0.005-0.375,0.139-0.388,0.322c-0.016,0.316-0.045,0.634-0.085,0.951
+							l0.738,0.09c0.042-0.332,0.072-0.666,0.089-0.999C235.905,482.446,235.744,482.284,235.537,482.278z"/>
+						<path fill="#FEE676" d="M235.5,482.576c-0.161-0.008-0.296,0.102-0.305,0.244c-0.021,0.246-0.039,0.492-0.073,0.738
+							l0.567,0.069c0.035-0.255,0.053-0.512,0.075-0.768C235.773,482.711,235.66,482.584,235.5,482.576z"/>
+						<path fill="#FEE676" d="M235.456,482.873c-0.111-0.008-0.209,0.067-0.219,0.169l-0.055,0.523l0.397,0.049l0.057-0.538
+							C235.646,482.972,235.569,482.881,235.456,482.873z"/>
+					</g>
+					<path d="M236.084,480.328c0.688-0.102,1.34,0.688,1.341,1.423c0.005,0.697-0.038,1.391-0.122,2.073l0.497,0.061
+						c0.085-0.702,0.131-1.417,0.125-2.138C237.911,480.683,237.036,480.195,236.084,480.328z"/>
+					<path fill="#988A38" d="M236.271,467.917c-1.903-1.653-4.137-2.073-5.884-1.637c-1.772,0.431-3.107,1.679-3.823,3.35
+						c-0.719,1.676-0.738,3.452-0.479,4.934c0.276,1.487,0.791,2.67,1.512,3.305c0.651,0.562,1.078,1.113,1.367,1.645
+						c0.28,0.536,0.452,1.079,0.514,1.79c0.045,0.493,0.312,1.031,0.729,1.656l3.998,0.487c-0.995-0.672-1.657-1.537-1.739-2.41
+						c-0.186-2.138-1.141-3.897-2.898-5.43c-0.538-0.471-0.854-1.188-0.958-2.032c-0.099-0.839,0.025-1.803,0.52-2.663
+						c0.494-0.859,1.293-1.445,2.243-1.61c0.941-0.167,2.038,0.103,2.926,0.878c1.415,1.225,2.707,2.747,3.656,4.517
+						c0.95,1.767,1.521,3.758,1.683,5.7c0.148,1.646-0.719,2.979-1.863,3.486l3.998,0.488c0.636-1.125,1-2.605,0.853-4.241
+						c-0.192-2.305-0.88-4.722-2.027-6.851C239.457,471.146,237.911,469.337,236.271,467.917z"/>
+					<path fill="#988A38" d="M235.182,469.166c-1.29-1.124-2.853-1.474-4.139-1.208c-1.3,0.262-2.343,1.116-2.948,2.312
+						c-0.148,0.294-0.045,0.642,0.222,0.788c0.27,0.144,0.618,0.039,0.782-0.245c0.502-0.881,1.319-1.484,2.292-1.653
+						c0.966-0.173,2.093,0.106,3.008,0.904c1.427,1.234,2.731,2.771,3.69,4.56c0.96,1.785,1.537,3.799,1.701,5.759
+						c0.147,1.633-0.672,2.975-1.775,3.529l1.746,0.213c0.821-0.892,1.357-2.268,1.216-3.848c-0.176-2.104-0.799-4.287-1.837-6.216
+						C238.104,472.129,236.698,470.478,235.182,469.166z"/>
+					<path fill="#988A38" d="M235.125,469.233c-1.24-1.08-2.74-1.438-3.985-1.207c-1.258,0.226-2.278,1.018-2.882,2.146
+						c-0.138,0.26-0.047,0.57,0.191,0.708c0.242,0.13,0.55,0.039,0.702-0.212c0.509-0.849,1.326-1.421,2.292-1.567
+						c0.959-0.15,2.074,0.141,2.984,0.935c1.429,1.237,2.737,2.778,3.699,4.57c0.962,1.789,1.54,3.809,1.705,5.773
+						c0.146,1.634-0.653,2.971-1.738,3.542l1.559,0.19c0.833-0.872,1.38-2.241,1.238-3.827c-0.175-2.094-0.794-4.263-1.826-6.181
+						C238.032,472.182,236.634,470.54,235.125,469.233z"/>
+					<path fill="#988A38" d="M235.065,469.3c-1.19-1.035-2.625-1.402-3.829-1.204c-1.215,0.193-2.21,0.919-2.816,1.979
+						c-0.127,0.224-0.049,0.501,0.161,0.623c0.21,0.12,0.481,0.043,0.617-0.171c0.52-0.819,1.336-1.36,2.295-1.483
+						c0.952-0.128,2.052,0.176,2.956,0.964c1.433,1.239,2.744,2.784,3.708,4.581c0.965,1.794,1.544,3.817,1.709,5.787
+						c0.146,1.636-0.631,2.967-1.697,3.555l1.372,0.168c0.844-0.853,1.401-2.212,1.259-3.806c-0.174-2.083-0.79-4.24-1.816-6.147
+						C237.958,472.234,236.568,470.601,235.065,469.3z"/>
+					<path fill="#988A38" d="M235.006,469.368c-1.141-0.993-2.51-1.363-3.67-1.201c-1.171,0.158-2.143,0.821-2.751,1.812
+						c-0.113,0.187-0.051,0.431,0.128,0.539c0.181,0.107,0.414,0.045,0.536-0.137c0.53-0.788,1.346-1.297,2.297-1.398
+						c0.945-0.104,2.031,0.21,2.929,0.994c1.436,1.242,2.75,2.79,3.716,4.592c0.967,1.799,1.548,3.828,1.713,5.803
+						c0.146,1.637-0.613,2.961-1.659,3.567l1.186,0.145c0.855-0.832,1.421-2.185,1.279-3.784c-0.173-2.071-0.786-4.216-1.807-6.112
+						C237.885,472.288,236.502,470.663,235.006,469.368z"/>
+					<path fill="#988A38" d="M234.947,469.437c-1.092-0.953-2.395-1.322-3.51-1.197c-1.125,0.124-2.073,0.728-2.687,1.646
+						c-0.103,0.153-0.053,0.36,0.096,0.455c0.151,0.093,0.345,0.046,0.452-0.099c0.547-0.757,1.362-1.235,2.304-1.314
+						c0.936-0.081,2.005,0.244,2.897,1.023c1.438,1.244,2.755,2.796,3.724,4.602c0.97,1.804,1.552,3.838,1.718,5.816
+						c0.146,1.639-0.59,2.957-1.617,3.581l0.997,0.121c0.866-0.812,1.442-2.156,1.299-3.763c-0.173-2.061-0.782-4.192-1.796-6.077
+						C237.811,472.342,236.436,470.725,234.947,469.437z"/>
+					<path fill="#988A38" d="M234.888,469.503c-1.043-0.911-2.277-1.281-3.348-1.188c-1.078,0.09-2.001,0.632-2.625,1.478
+						c-0.088,0.118-0.053,0.286,0.064,0.368c0.12,0.081,0.278,0.048,0.367-0.064c0.566-0.722,1.382-1.167,2.314-1.225
+						c0.926-0.059,1.977,0.274,2.863,1.049c1.441,1.247,2.762,2.802,3.733,4.613c0.973,1.808,1.557,3.848,1.723,5.831
+						c0.145,1.639-0.571,2.952-1.58,3.593l0.811,0.099c0.879-0.792,1.465-2.129,1.321-3.741c-0.171-2.05-0.777-4.169-1.786-6.043
+						C237.738,472.395,236.371,470.786,234.888,469.503z"/>
+					<path fill="#FEE676" d="M234.83,469.57c-0.996-0.868-2.155-1.234-3.177-1.171c-1.028,0.061-1.925,0.534-2.569,1.3
+						c-0.074,0.087-0.056,0.216,0.029,0.283c0.088,0.066,0.21,0.052,0.282-0.031c1.156-1.369,3.355-1.612,5.154-0.06
+						c1.444,1.249,2.768,2.809,3.742,4.624c0.975,1.812,1.561,3.858,1.727,5.846c0.145,1.641-0.551,2.947-1.541,3.606l0.623,0.076
+						c0.89-0.773,1.486-2.103,1.343-3.72c-0.171-2.04-0.772-4.146-1.775-6.01C237.666,472.447,236.305,470.848,234.83,469.57z"/>
+					<path fill="#FEE676" d="M234.771,469.638c-1.938-1.664-4.209-1.38-5.52-0.029c-0.057,0.056-0.056,0.142-0.003,0.195
+						c0.055,0.054,0.141,0.055,0.195,0.004c1.246-1.252,3.346-1.481,5.131,0.056c1.446,1.252,2.773,2.814,3.75,4.635
+						c0.978,1.816,1.564,3.868,1.731,5.86c0.145,1.642-0.53,2.942-1.501,3.619l0.433,0.053c0.903-0.754,1.509-2.074,1.365-3.698
+						c-0.17-2.028-0.769-4.122-1.766-5.976C237.592,472.5,236.24,470.909,234.771,469.638z"/>
+					<path d="M238.821,483.086c-0.294,0.34-0.632,0.607-1.02,0.786l-0.001,0.013l0.714,0.087c0.082-0.355,0.186-0.675,0.319-0.878
+						L238.821,483.086z"/>
+					<path fill="#988A38" d="M228.682,476.624c-0.65-0.57-1.062-1.503-1.232-2.644c-0.163-1.132-0.067-2.461,0.546-3.684
+						c0.201-0.391,0.047-0.874-0.351-1.059c-0.396-0.186-0.863,0.004-1.037,0.41c-0.715,1.66-0.736,3.423-0.48,4.895
+						c0.274,1.478,0.784,2.653,1.503,3.287c0.654,0.565,1.086,1.121,1.378,1.658c0.283,0.543,0.457,1.093,0.521,1.81
+						c0.046,0.497,0.313,1.04,0.735,1.668l1.927,0.235c-0.608-0.706-1.008-1.382-1.069-2.045
+						C230.95,479.355,230.231,477.997,228.682,476.624z"/>
+					<path fill="#988A38" d="M228.598,476.721c-0.637-0.559-1.054-1.476-1.243-2.601c-0.183-1.115-0.118-2.432,0.453-3.652
+						c0.17-0.354,0.034-0.778-0.318-0.936c-0.35-0.16-0.765,0.013-0.912,0.377c-0.661,1.61-0.66,3.307-0.397,4.722
+						c0.278,1.422,0.778,2.553,1.478,3.168c0.657,0.568,1.092,1.128,1.387,1.67c0.286,0.548,0.461,1.104,0.525,1.826
+						c0.046,0.5,0.317,1.046,0.746,1.678l1.719,0.21c-0.593-0.703-0.981-1.367-1.041-2.016
+						C230.824,479.399,230.127,478.079,228.598,476.721z"/>
+					<path fill="#988A38" d="M228.513,476.817c-0.624-0.547-1.044-1.447-1.252-2.555c-0.201-1.1-0.167-2.401,0.361-3.622
+						c0.14-0.316,0.021-0.683-0.285-0.814c-0.306-0.134-0.669,0.023-0.792,0.345c-0.604,1.56-0.58,3.189-0.311,4.548
+						c0.284,1.365,0.774,2.45,1.453,3.047c0.661,0.571,1.1,1.135,1.397,1.683c0.289,0.553,0.466,1.115,0.53,1.843
+						c0.046,0.503,0.322,1.052,0.755,1.688l1.51,0.185c-0.577-0.698-0.955-1.352-1.013-1.984
+						C230.697,479.443,230.023,478.16,228.513,476.817z"/>
+					<path fill="#988A38" d="M228.428,476.915c-0.611-0.536-1.036-1.419-1.26-2.51c-0.218-1.082-0.215-2.368,0.271-3.593
+						c0.112-0.277,0.007-0.585-0.255-0.693c-0.263-0.107-0.574,0.033-0.675,0.312c-0.545,1.512-0.498,3.075-0.224,4.377
+						c0.288,1.309,0.772,2.346,1.429,2.925c0.664,0.574,1.106,1.142,1.407,1.694c0.292,0.559,0.471,1.126,0.535,1.859
+						c0.046,0.506,0.327,1.058,0.765,1.698l1.301,0.158c-0.559-0.694-0.928-1.336-0.984-1.953
+						C230.57,479.486,229.918,478.242,228.428,476.915z"/>
+					<path fill="#988A38" d="M228.344,477.012c-0.598-0.524-1.026-1.388-1.266-2.461c-0.234-1.065-0.262-2.334,0.18-3.566
+						c0.085-0.234-0.006-0.488-0.227-0.572c-0.22-0.084-0.479,0.044-0.557,0.279c-0.484,1.466-0.414,2.963-0.135,4.21
+						c0.291,1.251,0.769,2.24,1.403,2.801c0.667,0.577,1.113,1.148,1.417,1.706c0.294,0.563,0.475,1.137,0.54,1.875
+						c0.047,0.509,0.332,1.064,0.776,1.708l1.092,0.134c-0.542-0.69-0.9-1.321-0.956-1.923
+						C230.443,479.53,229.814,478.324,228.344,477.012z"/>
+					<path fill="#988A38" d="M228.26,477.108c-0.585-0.514-1.014-1.356-1.268-2.41c-0.248-1.048-0.305-2.299,0.084-3.538
+						c0.062-0.194-0.019-0.395-0.198-0.453c-0.179-0.062-0.384,0.05-0.44,0.245c-0.416,1.423-0.326,2.854-0.041,4.042
+						c0.293,1.193,0.762,2.133,1.375,2.675c1.335,1.209,1.813,2.156,1.971,3.61c0.047,0.513,0.335,1.071,0.784,1.719l0.884,0.107
+						c-0.529-0.686-0.874-1.306-0.928-1.893C230.317,479.573,229.71,478.405,228.26,477.108z"/>
+					<path fill="#FEE676" d="M228.175,477.205c-0.573-0.503-0.997-1.323-1.261-2.358c-0.259-1.029-0.343-2.265-0.019-3.512
+						c0.04-0.148-0.032-0.299-0.17-0.335c-0.138-0.035-0.29,0.06-0.326,0.212c-0.338,1.384-0.227,2.748,0.061,3.878
+						c0.297,1.133,0.747,2.024,1.339,2.546c1.342,1.215,1.827,2.175,1.986,3.64c0.047,0.515,0.34,1.078,0.791,1.729l0.678,0.083
+						c-0.511-0.682-0.846-1.29-0.899-1.862C230.19,479.617,229.606,478.487,228.175,477.205z"/>
+					<path fill="#FEE676" d="M228.091,477.302c-0.561-0.491-0.97-1.289-1.236-2.304c-0.263-1.01-0.368-2.229-0.137-3.487
+						c0.02-0.106-0.044-0.204-0.145-0.216c-0.099-0.013-0.195,0.066-0.214,0.175c-0.232,1.349-0.099,2.645,0.181,3.718
+						c0.291,1.075,0.716,1.914,1.288,2.416c1.349,1.22,1.841,2.192,2,3.668c0.048,0.519,0.344,1.084,0.802,1.739l0.468,0.057
+						c-0.496-0.677-0.82-1.274-0.871-1.831C230.064,479.66,229.502,478.568,228.091,477.302z"/>
+					<path d="M232.717,481.014c-0.189-2.2-1.187-4.035-2.984-5.597c-0.51-0.449-0.849-1.15-0.99-1.968
+						c-0.138-0.812-0.072-1.737,0.388-2.538c-0.53,0.921-0.711,1.922-0.652,2.788c0.066,0.871,0.361,1.604,0.926,2.095
+						c1.719,1.503,2.63,3.188,2.814,5.264c0.082,0.894,0.759,1.716,1.8,2.366l0.361,0.044
+						C233.436,482.774,232.8,481.863,232.717,481.014z"/>
+					<path d="M230.501,469.606c0.375-0.189,0.79-0.304,1.228-0.329l0.002-0.014c-0.506-0.043-1.339-0.455-1.934-0.967
+						c-0.59-0.514-0.96-1.163-0.417-1.78c-0.728,0.371-1.344,0.903-1.855,1.546C228.539,468.51,229.536,469.019,230.501,469.606z"/>
+					<path d="M236.435,467.729c-1.998-1.729-4.263-2.113-6.019-1.63c-1.781,0.481-3.11,1.799-3.853,3.531
+						c0.687-1.605,2.031-2.781,3.794-3.163c1.74-0.389,3.942,0.066,5.748,1.639c1.622,1.403,3.146,3.188,4.273,5.291
+						c1.13,2.099,1.809,4.48,1.998,6.755c0.144,1.585-0.217,3.062-0.855,4.188l0.499,0.062c0.634-1.125,1.005-2.613,0.854-4.294
+						c-0.194-2.334-0.892-4.788-2.056-6.946C239.661,470.997,238.094,469.164,236.435,467.729z"/>
+				</g>
+				<g>
+					<path fill="#988A38" d="M214.991,468.459l-0.565,3.988c1.945-0.847,3.619-2.735,4.129-5.209
+						c-0.941-0.187-1.888-0.35-2.837-0.491C215.621,467.399,215.352,467.974,214.991,468.459z"/>
+					<path d="M215.718,466.747c-0.088,0.587-0.352,1.101-0.703,1.539l-0.051,0.359C215.337,468.109,215.61,467.47,215.718,466.747z"
+						/>
+					<path fill="#988A38" d="M215.718,466.747c0.949,0.142,1.896,0.305,2.837,0.491c0.51-2.467-0.48-4.92-2.495-6.327l-0.565,3.987
+						C215.724,465.465,215.815,466.097,215.718,466.747z"/>
+					<g>
+						<g>
+							<path fill="#988A38" d="M218.509,467.229c0.087-0.433-0.186-0.852-0.61-0.93c-0.422-0.079-0.827,0.219-0.899,0.652
+								c-0.265,1.529-1.167,2.76-2.293,3.518l-0.272,1.922C216.354,471.541,218.007,469.672,218.509,467.229z"/>
+							<path fill="#988A38" d="M218.386,467.461c0.084-0.385-0.157-0.762-0.531-0.834c-0.374-0.074-0.731,0.187-0.805,0.572
+								c-0.299,1.501-1.237,2.692-2.366,3.425l-0.243,1.714C216.274,471.523,217.871,469.769,218.386,467.461z"/>
+							<path fill="#988A38" d="M218.263,467.694c0.082-0.335-0.127-0.67-0.453-0.739c-0.325-0.069-0.638,0.155-0.712,0.491
+								c-0.335,1.473-1.305,2.625-2.435,3.334l-0.213,1.506C216.194,471.505,217.734,469.864,218.263,467.694z"/>
+							<path fill="#988A38" d="M218.143,467.928c0.077-0.289-0.099-0.581-0.377-0.646c-0.278-0.064-0.545,0.125-0.62,0.414
+								c-0.374,1.445-1.373,2.554-2.506,3.241l-0.184,1.298C216.113,471.487,217.595,469.962,218.143,467.928z"/>
+							<path fill="#988A38" d="M218.021,468.159c0.07-0.238-0.069-0.489-0.301-0.55c-0.231-0.06-0.457,0.091-0.526,0.331
+								c-0.42,1.421-1.441,2.488-2.577,3.151l-0.154,1.09C216.031,471.471,217.45,470.062,218.021,468.159z"/>
+							<path fill="#988A38" d="M217.901,468.393c0.065-0.191-0.041-0.4-0.226-0.456c-0.184-0.054-0.37,0.06-0.434,0.252
+								c-0.47,1.392-1.508,2.421-2.645,3.06l-0.125,0.882C215.949,471.452,217.299,470.159,217.901,468.393z"/>
+							<path fill="#FEE676" d="M217.782,468.626c0.057-0.145-0.014-0.31-0.151-0.362c-0.138-0.051-0.287,0.028-0.344,0.173
+								c-0.535,1.355-1.574,2.353-2.713,2.968l-0.096,0.676C215.868,471.433,217.136,470.25,217.782,468.626z"/>
+							<path fill="#FEE676" d="M217.663,468.859c0.048-0.099,0.014-0.219-0.078-0.267c-0.092-0.049-0.205-0.007-0.251,0.091
+								c-0.63,1.304-1.64,2.285-2.781,2.876l-0.066,0.467C215.785,471.416,216.945,470.327,217.663,468.859z"/>
+						</g>
+						<g>
+							<path fill="#988A38" d="M215.807,466.688c-0.048,0.324,0.162,0.627,0.472,0.675c0.307,0.05,0.602-0.172,0.657-0.494
+								c0.254-1.48-0.203-2.912-1.16-3.95l-0.247,1.74C215.805,465.274,215.915,465.973,215.807,466.688z"/>
+							<path fill="#988A38" d="M215.856,466.545c-0.037,0.29,0.149,0.557,0.427,0.598c0.274,0.04,0.536-0.159,0.581-0.448
+								c0.211-1.371-0.227-2.697-1.104-3.666l-0.22,1.556C215.818,465.178,215.945,465.852,215.856,466.545z"/>
+							<path fill="#988A38" d="M215.906,466.403c-0.027,0.253,0.138,0.484,0.381,0.517c0.242,0.031,0.471-0.146,0.504-0.4
+								c0.168-1.264-0.25-2.478-1.047-3.382l-0.194,1.369C215.83,465.08,215.977,465.73,215.906,466.403z"/>
+							<path fill="#988A38" d="M215.956,466.259c-0.018,0.219,0.126,0.415,0.336,0.438c0.209,0.022,0.403-0.133,0.427-0.352
+								c0.124-1.157-0.271-2.262-0.99-3.098l-0.167,1.183C215.842,464.981,216.007,465.606,215.956,466.259z"/>
+							<path fill="#988A38" d="M216.005,466.118c-0.008,0.181,0.114,0.341,0.292,0.356c0.174,0.017,0.335-0.12,0.349-0.303
+								c0.08-1.05-0.292-2.047-0.933-2.813l-0.141,0.994C215.854,464.883,216.035,465.484,216.005,466.118z"/>
+							<path fill="#988A38" d="M216.055,465.974c-0.001,0.148,0.102,0.271,0.245,0.278c0.142,0.007,0.268-0.107,0.272-0.254
+								c0.034-0.942-0.311-1.83-0.874-2.53l-0.115,0.81C215.865,464.786,216.061,465.361,216.055,465.974z"/>
+							<path fill="#FEE676" d="M216.105,465.83c0.005,0.111,0.088,0.202,0.198,0.2c0.108-0.001,0.197-0.095,0.194-0.206
+								c-0.022-0.836-0.332-1.613-0.816-2.246l-0.088,0.621C215.877,464.687,216.081,465.236,216.105,465.83z"/>
+							<path fill="#FEE676" d="M216.156,465.688c0.01,0.076,0.077,0.128,0.151,0.12c0.075-0.011,0.126-0.079,0.117-0.157
+								c-0.089-0.726-0.351-1.397-0.757-1.96l-0.062,0.432C215.889,464.59,216.083,465.115,216.156,465.688z"/>
+						</g>
+					</g>
+					<g>
+						<path d="M218.555,467.238c0.533-2.581-0.403-5.125-2.46-6.574l-0.07,0.497C217.996,462.532,219.041,464.893,218.555,467.238z"
+							/>
+					</g>
+					<path d="M218.661,465.403c-0.047-0.818-0.231-1.604-0.566-2.325c0.094,1.246-1.448,1.345-2.495,1.085l-0.101,0.712
+						C216.554,465.024,217.609,465.199,218.661,465.403z"/>
+					<path fill="#988A38" d="M230.256,473.366c0.563,0.431,1.443,0.327,2.012-0.271c0.572-0.602,0.504-1.581-0.194-2.115
+						c-2.469-1.864-5.251-3.067-8.055-3.97c-2.813-0.896-5.682-1.477-8.556-1.887l-0.421,2.971c2.748,0.391,5.465,0.945,8.064,1.773
+						C225.697,470.697,228.193,471.803,230.256,473.366z"/>
+					<path d="M231.922,471.178c0.577,0.436,0.826,1.414,0.345,1.917c0.662-0.699,0.779-1.68-0.042-2.314
+						c-2.502-1.89-5.309-3.101-8.13-4.01c-2.831-0.9-5.712-1.484-8.597-1.896l-0.07,0.495c2.864,0.408,5.72,0.987,8.515,1.877
+						C226.729,468.144,229.487,469.339,231.922,471.178z"/>
+					<g>
+						<path fill="#988A38" d="M230.512,473.313c0.305,0.236,0.776,0.17,1.067-0.162c0.292-0.333,0.26-0.835-0.083-1.101
+							c-2.312-1.792-5.007-2.98-7.752-3.869c-2.756-0.885-5.595-1.465-8.449-1.871l-0.225,1.586
+							c2.786,0.396,5.544,0.961,8.183,1.809C225.882,470.555,228.422,471.689,230.512,473.313z"/>
+						<path fill="#988A38" d="M230.226,473.021c0.282,0.206,0.7,0.137,0.951-0.171c0.247-0.303,0.208-0.75-0.103-0.977
+							c-2.279-1.676-4.902-2.801-7.574-3.651c-2.683-0.843-5.443-1.403-8.22-1.798l-0.2,1.413c2.718,0.387,5.408,0.935,7.991,1.746
+							C225.643,470.399,228.134,471.48,230.226,473.021z"/>
+						<path fill="#988A38" d="M229.929,472.735c0.255,0.175,0.622,0.105,0.834-0.173c0.209-0.273,0.164-0.666-0.113-0.856
+							c-2.243-1.567-4.794-2.633-7.395-3.442c-2.609-0.805-5.292-1.346-7.993-1.729l-0.176,1.244
+							c2.649,0.377,5.272,0.907,7.797,1.687C225.401,470.247,227.843,471.275,229.929,472.735z"/>
+						<path fill="#988A38" d="M229.623,472.456c0.226,0.146,0.542,0.078,0.717-0.167c0.174-0.243,0.127-0.58-0.115-0.737
+							c-2.205-1.463-4.683-2.476-7.212-3.245c-2.536-0.767-5.142-1.288-7.766-1.661l-0.152,1.075
+							c2.581,0.367,5.136,0.88,7.603,1.626C225.158,470.094,227.545,471.076,229.623,472.456z"/>
+						<path fill="#988A38" d="M229.309,472.182c0.194,0.12,0.46,0.055,0.602-0.155c0.14-0.209,0.093-0.491-0.111-0.618
+							c-2.164-1.365-4.57-2.323-7.028-3.057c-2.463-0.731-4.993-1.232-7.54-1.596l-0.128,0.907c2.512,0.357,5.001,0.852,7.408,1.566
+							C224.912,469.945,227.247,470.879,229.309,472.182z"/>
+						<path fill="#FEE676" d="M228.988,471.912c0.161,0.094,0.375,0.038,0.485-0.137c0.11-0.174,0.066-0.4-0.102-0.499
+							c-4.274-2.527-9.24-3.688-14.156-4.409l-0.104,0.735C219.971,468.312,224.868,469.472,228.988,471.912z"/>
+						<path fill="#FEE676" d="M228.658,471.649c0.126,0.07,0.292,0.022,0.373-0.113c0.08-0.135,0.042-0.309-0.088-0.381
+							c-4.174-2.36-8.976-3.477-13.742-4.175l-0.08,0.564C219.845,468.235,224.595,469.349,228.658,471.649z"/>
+						<path fill="#FEE676" d="M228.321,471.392c0.09,0.048,0.205,0.012,0.259-0.085c0.054-0.096,0.025-0.217-0.066-0.266
+							c-4.073-2.201-8.711-3.277-13.33-3.951l-0.056,0.396C219.719,468.154,224.321,469.229,228.321,471.392z"/>
+					</g>
+				</g>
+			</g>
+			<g>
+				<g>
+					<path fill="#988A38" d="M211.912,466.216c-0.075,0.825,0.493,1.552,1.265,1.64l1.866,0.238l0.421-2.971l-1.948-0.249
+						C212.706,464.782,211.987,465.392,211.912,466.216z"/>
+					<g>
+						<path fill="#988A38" d="M212.271,466.753c-0.043,0.44,0.258,0.83,0.671,0.876c0.711,0.078,1.42,0.167,2.127,0.268l0.224-1.586
+							c-0.724-0.103-1.45-0.193-2.176-0.273C212.694,465.99,212.314,466.313,212.271,466.753z"/>
+						<path fill="#988A38" d="M212.578,466.812c-0.04,0.392,0.229,0.741,0.599,0.783l1.902,0.242l0.2-1.413l-1.941-0.248
+							C212.959,466.134,212.619,466.418,212.578,466.812z"/>
+						<path fill="#988A38" d="M212.886,466.87c-0.037,0.345,0.198,0.654,0.524,0.692l1.677,0.216l0.176-1.245l-1.708-0.22
+							C213.223,466.274,212.923,466.523,212.886,466.87z"/>
+						<path fill="#988A38" d="M213.192,466.931c-0.033,0.298,0.17,0.567,0.452,0.601l1.451,0.189l0.152-1.075l-1.474-0.192
+							C213.486,466.419,213.225,466.631,213.192,466.931z"/>
+						<path fill="#988A38" d="M213.499,466.991c-0.029,0.253,0.141,0.479,0.378,0.51l1.226,0.162l0.128-0.907l-1.242-0.164
+							C213.748,466.562,213.528,466.74,213.499,466.991z"/>
+						<path fill="#FEE676" d="M213.805,467.055c-0.024,0.204,0.112,0.391,0.307,0.415l1,0.134l0.104-0.736l-1.011-0.136
+							C214.008,466.707,213.83,466.85,213.805,467.055z"/>
+						<path fill="#FEE676" d="M214.11,467.118c-0.02,0.16,0.086,0.303,0.236,0.322l0.773,0.105l0.08-0.565l-0.78-0.106
+							C214.268,466.854,214.13,466.959,214.11,467.118z"/>
+						<path fill="#FEE676" d="M214.415,467.184c-0.015,0.112,0.06,0.213,0.166,0.228l0.547,0.075l0.056-0.396l-0.55-0.075
+							C214.527,467,214.43,467.072,214.415,467.184z"/>
+					</g>
+					<path d="M211.912,466.216c0.062-0.687,0.904-1.171,1.576-1.094l1.94,0.248l0.07-0.495l-1.955-0.249
+						C212.597,464.52,212,465.256,211.912,466.216z"/>
+					<path fill="#988A38" d="M203.098,459.314c-1.884,0.013-3.585,0.823-4.763,2.105c-1.14,1.272-1.958,3.055-1.482,4.885
+						c0.41,1.72,1.006,3.315,2.147,4.425c1.108,1.118,2.574,1.797,4.19,1.785c2.825-0.018,5.669,0.103,8.444,0.377
+						c0.979,0.098,1.92-0.072,2.791-0.444l0.565-3.988c-0.73,0.978-1.854,1.567-3.059,1.448c-2.894-0.287-5.833-0.411-8.762-0.392
+						c-1.814,0.01-3.359-1.523-3.479-3.504c-0.122-1.981,1.432-3.686,3.429-3.697c3.176-0.02,6.345,0.113,9.525,0.427
+						c1.331,0.132,2.396,1.009,2.851,2.157l0.565-3.988c-0.886-0.624-1.954-1.038-3.118-1.154
+						C209.642,459.431,206.377,459.294,203.098,459.314z"/>
+					<path fill="#988A38" d="M203.109,460.972c-2.741,0.012-4.882,2.366-4.641,5.046c0.025,0.327,0.296,0.574,0.606,0.554
+						c0.311-0.02,0.551-0.299,0.535-0.626c-0.127-2.026,1.462-3.771,3.508-3.783c3.181-0.021,6.355,0.112,9.541,0.428
+						c1.311,0.13,2.376,0.962,2.869,2.069l0.247-1.741c-0.748-0.818-1.793-1.394-2.998-1.513
+						C209.544,461.085,206.332,460.951,203.109,460.972z"/>
+					<path fill="#988A38" d="M203.111,461.061c-2.646,0.012-4.728,2.228-4.556,4.79c0.017,0.292,0.258,0.514,0.537,0.501
+						c0.277-0.017,0.49-0.263,0.481-0.555c-0.078-1.982,1.515-3.662,3.545-3.674c3.183-0.02,6.357,0.113,9.544,0.428
+						c1.31,0.13,2.37,0.944,2.876,2.033l0.22-1.556c-0.734-0.831-1.777-1.414-2.991-1.535
+						C209.539,461.174,206.331,461.04,203.111,461.061z"/>
+					<path fill="#988A38" d="M203.111,461.149c-2.548,0.012-4.565,2.087-4.467,4.532c0.008,0.257,0.22,0.454,0.464,0.444
+						c0.244-0.008,0.432-0.222,0.429-0.475c-0.026-1.944,1.567-3.554,3.58-3.564c3.185-0.021,6.36,0.112,9.55,0.428
+						c1.308,0.13,2.363,0.922,2.883,1.992l0.194-1.369c-0.72-0.843-1.76-1.435-2.984-1.556
+						C209.533,461.263,206.327,461.129,203.111,461.149z"/>
+					<path fill="#988A38" d="M203.111,461.239c-2.45,0.013-4.401,1.946-4.378,4.276c0.001,0.219,0.182,0.394,0.392,0.39
+						c0.211-0.004,0.373-0.185,0.377-0.402c0.027-1.903,1.619-3.443,3.615-3.455c3.186-0.02,6.363,0.113,9.553,0.429
+						c1.308,0.13,2.356,0.903,2.891,1.954l0.167-1.183c-0.705-0.855-1.744-1.454-2.978-1.576
+						C209.528,461.354,206.325,461.22,203.111,461.239z"/>
+					<path fill="#988A38" d="M203.112,461.33c-2.355,0.012-4.23,1.804-4.289,4.02c-0.007,0.184,0.143,0.332,0.319,0.334
+						c0.177-0.001,0.314-0.146,0.325-0.325c0.089-1.87,1.669-3.336,3.65-3.347c3.187-0.021,6.365,0.112,9.558,0.428
+						c1.307,0.13,2.35,0.882,2.897,1.913l0.141-0.995c-0.69-0.866-1.727-1.473-2.971-1.596
+						C209.522,461.443,206.322,461.31,203.112,461.33z"/>
+					<path fill="#988A38" d="M203.113,461.419c-2.259,0.012-4.05,1.662-4.199,3.764c-0.012,0.146,0.105,0.271,0.246,0.278
+						c0.143,0.006,0.258-0.108,0.271-0.252c0.161-1.83,1.723-3.225,3.686-3.236c3.188-0.02,6.368,0.113,9.562,0.429
+						c1.306,0.13,2.343,0.862,2.904,1.876l0.115-0.81c-0.676-0.88-1.711-1.493-2.964-1.618
+						C209.517,461.532,206.32,461.398,203.113,461.419z"/>
+					<path fill="#FEE676" d="M203.113,461.508c-2.163,0.013-3.845,1.525-4.108,3.506c-0.017,0.113,0.066,0.213,0.172,0.225
+						c0.108,0.011,0.201-0.068,0.217-0.177c0.256-1.788,1.776-3.116,3.722-3.128c3.189-0.02,6.371,0.113,9.566,0.429
+						c1.305,0.13,2.335,0.843,2.912,1.837l0.088-0.621c-0.662-0.891-1.694-1.514-2.958-1.64
+						C209.512,461.621,206.317,461.488,203.113,461.508z"/>
+					<path fill="#FEE676" d="M203.114,461.598c-2.066,0.014-3.615,1.396-4.018,3.251c-0.018,0.077,0.027,0.149,0.1,0.167
+						c0.073,0.017,0.145-0.029,0.163-0.101c0.387-1.738,1.828-3.006,3.758-3.019c3.19-0.021,6.373,0.112,9.569,0.429
+						c1.305,0.13,2.33,0.822,2.919,1.797l0.062-0.432c-0.647-0.905-1.677-1.535-2.951-1.662
+						C209.507,461.711,206.315,461.578,203.114,461.598z"/>
+					<path d="M214.821,463.836c0.282,0.305,0.508,0.652,0.665,1.037l0.012,0.002l0.101-0.712c-0.32-0.081-0.598-0.198-0.771-0.339
+						L214.821,463.836z"/>
+					<path fill="#988A38" d="M203.179,470.865c-2.457,0.01-4.532-2.055-4.782-4.771c-0.033-0.44-0.409-0.772-0.848-0.727
+						c-0.443,0.042-0.767,0.469-0.664,0.91c0.413,1.741,1.005,3.303,2.145,4.411c1.102,1.11,2.558,1.785,4.161,1.774
+						c2.826-0.019,5.672,0.103,8.449,0.377c0.98,0.098,1.923-0.073,2.794-0.45l0.272-1.922c-0.86,0.575-1.855,0.886-2.908,0.781
+						C208.958,470.969,206.061,470.847,203.179,470.865z"/>
+					<path fill="#988A38" d="M203.181,470.992c-2.436,0.011-4.542-1.982-4.855-4.655c-0.039-0.393-0.372-0.684-0.757-0.638
+						c-0.388,0.042-0.655,0.419-0.564,0.807c0.189,0.847,0.456,1.635,0.781,2.353c0.344,0.714,0.803,1.349,1.346,1.878
+						c1.088,1.058,2.51,1.696,4.06,1.686c2.828-0.018,5.674,0.103,8.453,0.378c0.979,0.098,1.924-0.077,2.798-0.462l0.243-1.714
+						c-0.864,0.559-1.854,0.856-2.899,0.753C208.95,471.097,206.058,470.975,203.181,470.992z"/>
+					<path fill="#988A38" d="M203.181,471.121c-2.415,0.01-4.547-1.909-4.926-4.541c-0.044-0.343-0.335-0.594-0.666-0.55
+						c-0.335,0.043-0.551,0.372-0.472,0.71c0.192,0.812,0.428,1.573,0.776,2.255c0.351,0.682,0.804,1.285,1.341,1.787
+						c1.073,1.004,2.46,1.606,3.955,1.597c2.83-0.019,5.678,0.103,8.458,0.378c0.979,0.097,1.925-0.081,2.8-0.471l0.213-1.506
+						c-0.867,0.541-1.854,0.828-2.89,0.725C208.942,471.226,206.054,471.104,203.181,471.121z"/>
+					<path fill="#988A38" d="M203.182,471.25c-2.395,0.009-4.546-1.835-4.996-4.428c-0.046-0.295-0.297-0.504-0.576-0.463
+						c-0.283,0.045-0.455,0.328-0.385,0.618c0.601,3.121,3.061,5.369,5.964,5.358c2.831-0.019,5.681,0.103,8.462,0.378
+						c0.979,0.098,1.926-0.086,2.804-0.479l0.184-1.298c-0.871,0.522-1.854,0.799-2.88,0.697
+						C208.935,471.354,206.05,471.232,203.182,471.25z"/>
+					<path fill="#988A38" d="M203.183,471.379c-2.374,0.009-4.539-1.759-5.063-4.315c-0.048-0.244-0.259-0.416-0.488-0.374
+						c-0.232,0.044-0.364,0.285-0.303,0.527c0.633,2.989,3.072,5.087,5.861,5.078c2.833-0.019,5.684,0.102,8.467,0.378
+						c0.978,0.097,1.927-0.091,2.807-0.491l0.154-1.09c-0.874,0.506-1.853,0.771-2.871,0.67
+						C208.927,471.482,206.046,471.36,203.183,471.379z"/>
+					<path fill="#988A38" d="M203.184,471.507c-2.353,0.009-4.517-1.686-5.13-4.198c-0.047-0.198-0.22-0.33-0.4-0.287
+						c-0.185,0.042-0.282,0.239-0.228,0.437c0.692,2.853,3.085,4.802,5.763,4.792c2.833-0.018,5.686,0.103,8.472,0.379
+						c0.978,0.098,1.929-0.094,2.81-0.499l0.125-0.882c-0.878,0.491-1.852,0.741-2.861,0.641
+						C208.919,471.61,206.043,471.488,203.184,471.507z"/>
+					<path fill="#FEE676" d="M203.185,471.635c-2.33,0.01-4.472-1.622-5.195-4.084c-0.043-0.147-0.182-0.241-0.314-0.199
+						c-0.134,0.042-0.201,0.198-0.155,0.348c0.784,2.708,3.101,4.519,5.668,4.508c2.835-0.019,5.689,0.103,8.476,0.379
+						c0.977,0.097,1.931-0.099,2.813-0.506l0.096-0.676c-0.881,0.472-1.851,0.712-2.852,0.612
+						C208.912,471.738,206.04,471.617,203.185,471.635z"/>
+					<path fill="#FEE676" d="M203.186,471.764c-2.308,0.011-4.375-1.581-5.257-3.971c-0.037-0.103-0.143-0.153-0.229-0.11
+						c-0.088,0.041-0.122,0.157-0.084,0.259c0.925,2.541,3.118,4.234,5.573,4.223c2.837-0.019,5.692,0.103,8.481,0.379
+						c0.978,0.097,1.931-0.102,2.816-0.517l0.066-0.467c-0.885,0.456-1.851,0.684-2.843,0.585
+						C208.904,471.866,206.036,471.745,203.186,471.764z"/>
+					<path d="M211.956,469.656c-2.904-0.287-5.852-0.411-8.788-0.393c-1.694,0.008-3.365-1.41-3.478-3.252
+						c0.128,2.121,1.547,3.765,3.481,3.752c2.919-0.019,5.851,0.105,8.735,0.391c1.262,0.125,2.343-0.487,3.057-1.509l0.051-0.359
+						C214.268,469.214,213.101,469.77,211.956,469.656z"/>
+					<path d="M203.096,459.064c-1.958,0.014-3.664,0.856-4.826,2.185c-1.128,1.32-1.911,3.162-1.417,5.056
+						c-0.459-1.762,0.396-3.482,1.552-4.711c1.193-1.237,2.891-2.018,4.695-2.029c3.271-0.02,6.527,0.115,9.817,0.44
+						c1.14,0.113,2.212,0.529,3.107,1.156l0.07-0.497c-0.878-0.622-1.944-1.039-3.128-1.156
+						C209.657,459.181,206.385,459.044,203.096,459.064z"/>
+				</g>
+			</g>
+		</g>
+	</g>
+</g>
+<g>
+	<g>
+		<path d="M260.284,489.56l14.617-0.216v-4.176c0-2.593-0.864-4.177-2.952-5.113c-1.512-0.647-3.6-0.863-6.553-0.863
+			c-6.769,0-13.465,1.584-13.465,1.584l-0.648-3.601c0,0,5.904-1.656,14.041-1.656c4.68,0,8.064,0.433,10.369,1.729
+			c2.592,1.296,3.745,3.672,3.745,7.776v19.874c0,0.936,0.144,1.151,1.008,1.296l1.872,0.36v2.88h-7.056l-0.288-1.8
+			c-0.072-0.433-0.216-1.152-1.152-1.152c-0.576,0-1.08,0.36-1.512,0.648l-3.384,2.304c0,0-3.816,0.504-6.84,0.504
+			c-10.153,0-13.105-2.16-13.105-10.297C248.979,492.008,251.427,489.704,260.284,489.56z M274.901,492.729l-11.593,0.216
+			c-6.265,0.072-9.577-0.721-9.577,6.769c0,6.192,2.592,6.553,9.145,6.553c4.968,0,12.025-1.584,12.025-1.584V492.729z"/>
+		<g>
+			<path fill="#D5BF51" d="M259.917,490.14c4.873-0.072,9.745-0.144,14.617-0.216c0.348-0.005,1.335-0.173,1.335-0.691
+				c0-4.283,0.493-8.405-4.315-10.067c-5.792-2.003-13.973-0.236-19.763,1.09c0.369,0.087,0.737,0.175,1.105,0.262
+				c-0.216-1.2-0.432-2.4-0.648-3.601c-0.275,0.26-0.551,0.52-0.826,0.779c6.861-1.855,15.829-2.867,22.591-0.408
+				c5.376,1.954,4.455,8.413,4.455,13c0,4.585,0,9.171,0,13.756c0,0.542-0.174,2.048,0.385,2.285
+				c2.37,1.006,2.495,0.322,2.495,3.217c0.445-0.23,0.891-0.461,1.336-0.691c-1.646,0-3.291,0-4.936,0
+				c-3.229,0-0.413-0.438-2.134-2.346c-1.9-2.107-5.696,1.516-7.111,2.479c-1.113,0.758-6.649,0.387-8.887,0.302
+				c-3.43-0.131-7.076-0.784-8.76-4.095c-1.334-2.622-1.026-6.27-0.687-9.052C250.8,490.973,255.528,490.278,259.917,490.14
+				c0.755-0.023,2.037-1.201,0.734-1.16c-4.086,0.129-8.627,0.374-11.233,3.948c-2.539,3.483-1.934,10.828,0.355,14.24
+				c2.874,4.283,11.07,3.463,15.306,3.21c1.815-0.108,3.607-0.05,5.124-1.082c2.059-1.401,3.562-2.911,4.092,0.396
+				c0.039,0.24,0.409,0.321,0.599,0.321c2.352,0,4.704,0,7.056,0c0.348,0,1.336-0.176,1.336-0.691c0-0.96,0-1.92,0-2.88
+				c0-0.245-0.135-0.392-0.384-0.439c-2.364-0.455-2.496-1.243-2.496-3.283c0-2.624,0-5.247,0-7.87
+				c0-3.846,0.086-7.701-0.058-11.544c-0.459-12.295-21.856-8.639-29.205-6.651c-0.325,0.088-0.904,0.349-0.826,0.779
+				c0.216,1.2,0.432,2.4,0.648,3.601c0.078,0.432,0.836,0.323,1.105,0.262c4.787-1.097,9.723-1.578,14.628-1.497
+				c3.828,0.063,7.233,1.378,7.233,5.712c0,1.314,0,2.629,0,3.944c0.445-0.23,0.89-0.461,1.335-0.691
+				c-4.873,0.072-9.745,0.144-14.617,0.216C259.891,488.99,258.611,490.159,259.917,490.14z"/>
+			<path fill="#D5BF51" d="M275.268,492.148c-3.839,0.071-7.678,0.143-11.517,0.215c-3.099,0.058-8.686-0.765-10.229,2.94
+				c-1.746,4.191-1.274,10.114,3.745,11.24c5.634,1.265,12.248-0.125,17.773-1.342c0.245-0.054,0.829-0.304,0.829-0.632
+				c0-3.984,0-7.969,0-11.953c0-0.855-1.937-0.458-1.937,0.223c0,3.637,0,7.274,0,10.911c0,1.264-0.263,0.65,0.708,0.437
+				c-1.433,0.316-2.879,0.566-4.327,0.807c-4.069,0.675-8.555,1.193-12.625,0.28c-3.66-0.822-3.171-6.075-2.713-8.77
+				c0.654-3.844,5.405-2.933,8.262-2.986c3.766-0.069,7.532-0.14,11.297-0.21C275.293,493.295,276.573,492.124,275.268,492.148z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M313.78,509.434l-0.287-1.8c-0.072-0.433-0.217-1.152-1.225-1.152c-0.504,0-1.008,0.36-1.44,0.648l-3.456,2.304
+			c0,0-2.736,0.504-6.12,0.504c-10.945,0-12.745-7.057-12.745-17.209c0-9.793,1.728-17.21,13.321-17.21
+			c5.472,0,10.585,0.937,11.521,1.152v-10.657c0-0.936-0.144-1.152-1.008-1.296l-1.872-0.36v-2.88h7.416v43.42
+			c0,0.936,0.145,1.151,1.009,1.296l1.872,0.36v2.88H313.78z M313.349,480.055c-0.864-0.144-5.688-0.863-10.009-0.863
+			c-8.353,0-10.081,2.808-10.081,13.537c0,10.801,1.729,13.465,10.081,13.465c5.04,0,10.009-1.44,10.009-1.44V480.055z"/>
+		<g>
+			<path fill="#D5BF51" d="M314.746,509.175c-0.581-3.641-2.268-4.065-5.256-2.073c-5.341,3.561-14.478,3.178-18.064-2.804
+				c-3.302-5.506-2.624-15.643-0.708-21.535c2.885-8.871,15.265-6.816,22.308-5.345c0.464,0.097,1.305-0.238,1.305-0.807
+				c0-3.553,0-7.105,0-10.657c0-1.535-0.792-1.814-2.153-2.076c-1.279-0.246-0.727,0.253-0.727-1.446
+				c0-1.191-1.684-0.197,0.104-0.197c2.037,0,4.074,0,6.111,0c-1.085,0-0.764-0.949-0.764,1.062c0,4.07,0,8.141,0,12.211
+				c0,9.082,0,18.165,0,27.248c0,1.658-0.459,3.774,1.669,4.184c1.409,0.271,1.212-0.318,1.212,0.688c0,0.522,0,1.045,0,1.567
+				c1.067-0.42,1.216-0.592,0.446-0.518c-2.126,0-4.251,0-6.377,0c-1.14,0-1.451,1.514-0.145,1.514c2.328,0,4.656,0,6.985,0
+				c0.407,0,1.055-0.341,1.055-0.816c0-0.96,0-1.92,0-2.88c0-0.404-0.287-0.615-0.66-0.688c-3.322-0.639-2.221-3.876-2.221-6.678
+				c0-5.005,0-10.01,0-15.015c0-7.565,0-15.131,0-22.696c0-0.517-0.46-0.697-0.91-0.697c-2.472,0-4.943,0-7.416,0
+				c-0.407,0-1.055,0.341-1.055,0.816c0,0.96,0,1.92,0,2.88c0,0.404,0.287,0.615,0.66,0.687c2.995,0.577,2.22,2.763,2.22,5.305
+				c0,2.107,0,4.215,0,6.322c0.436-0.269,0.87-0.537,1.305-0.807c-6.704-1.399-15.418-2.647-21.354,1.589
+				c-5.802,4.141-5.07,14.18-4.59,20.271c0.535,6.777,3.63,11.606,10.55,12.743c2.623,0.432,7.088,0.746,9.537-0.474
+				c2.108-1.051,4.402-4.11,5.001-0.36C312.968,510.653,314.903,510.157,314.746,509.175z"/>
+			<path fill="#D5BF51" d="M313.671,479.309c-5.312-0.831-12.02-2.008-17.098,0.389c-4.561,2.153-4.262,8.981-4.296,13.091
+				c-0.034,4.073-0.31,10.93,4.19,13.002c5.105,2.351,11.887,1.113,17.074-0.324c0.328-0.091,0.791-0.376,0.791-0.772
+				c0-8.233,0-16.466,0-24.698c0-1.067-1.965-0.819-1.965,0.119c0,4.441,0,8.883,0,13.323c0,2.851,0,5.701,0,8.553
+				c0,0.795,0,1.59,0,2.385c0,0.305,1.102-0.384-1.047,0.12c-4.116,0.967-11.642,2.439-15.054-1.145
+				c-2.693-2.829-2.041-8.676-2.012-12.161c0.032-3.849,0.046-9.228,4.372-10.738c4.331-1.512,9.976-0.342,14.401,0.35
+				C314.034,480.959,314.967,479.511,313.671,479.309z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M341.14,498.488c-3.456,0-6.049-0.359-8.064-1.008c0,0-1.152,2.448-1.152,4.608c0,2.809,0.216,3.456,5.904,3.456h9.577
+			c8.856,0,11.521,1.44,11.521,9.001c0,7.057-2.592,9.217-17.569,9.217c-7.633,0-13.104-1.008-13.104-1.008l0.359-3.744
+			c0,0,6.265,1.008,12.602,1.008c2.376,0,6.696-0.072,8.784-0.576c3.169-0.648,4.393-1.656,4.393-4.824
+			c0-3.889-1.152-4.681-4.608-5.041c-1.368-0.144-3.096-0.144-4.536-0.144h-9.648c-4.104,0-8.354-0.288-8.137-5.545l0.144-1.943
+			l3.385-3.745c0.216-0.287,0.432-0.575,0.432-0.936c0-0.576-0.36-0.792-0.72-1.008c-2.593-1.585-3.601-3.457-3.601-9.289
+			c0-8.856,2.376-11.593,14.401-11.449l19.01,0.216v2.809l-6.625,0.432c-0.504,0.072-1.152,0-1.368,0.576
+			c-0.216,0.504,0,1.009,0.288,1.44l3.024,4.32c0,0,0.144,1.152,0.144,1.656C355.973,496.328,352.661,498.488,341.14,498.488z
+			 M350.429,478.975c0,0-5.329-0.216-8.929-0.216c-7.489,0-9.577,0.792-9.577,8.209c0,7.849,2.305,8.209,9.577,8.209
+			c7.2,0,9.721-0.505,9.721-8.354C351.221,484.447,351.005,481.279,350.429,478.975z"/>
+		<g>
+			<path fill="#D5BF51" d="M341.772,498.118c-2.703-0.048-5.299-0.257-7.899-0.994c-0.383-0.109-1.565,0.121-1.748,0.536
+				c-0.885,2.007-1.856,5.024-0.729,7.117c0.821,1.522,4.679,1.138,5.92,1.138c5.361,0,12.703-1.086,17.624,1.214
+				c3.064,1.432,3.05,5.674,3.005,8.499c-0.075,4.762-2.631,6.688-7.385,7.365c-5.815,0.827-12,0.495-17.821-0.091
+				c-1.197-0.121-2.392-0.286-3.578-0.484c0.083,0.014,0.381-3.31,0.415-3.666c-0.588,0.205-1.176,0.41-1.764,0.615
+				c6.447,1.002,13.014,1.27,19.519,0.769c4.87-0.375,8.194-1.776,7.968-7.095c-0.449-10.511-26.418,3.036-26.874-9.411
+				c-0.088-2.408,1.775-3.687,3.287-5.359c1.554-1.719-0.233-2.328-1.47-3.349c-2.738-2.258-2.25-7.298-2.119-10.387
+				c0.21-4.962,2.211-7.766,7.514-8.454c4.938-0.642,10.265-0.136,15.227-0.079c2.505,0.028,5.009,0.057,7.513,0.085
+				c2.016,0.023,1.168,0.645,1.168,2.715c0.464-0.206,0.928-0.412,1.393-0.617c-1.272,0.083-11.569-0.465-8.968,3.251
+				c2.132,3.045,3.164,4.263,2.96,8.073C354.522,497.065,348.362,498.062,341.772,498.118c-0.374,0.004-2.188,0.748-1.266,0.74
+				c6.418-0.056,14.427-0.265,16.09-7.813c0.947-4.3,0-6.276-2.354-9.638c-2.364-3.378,3.507-2.354,5.84-2.506
+				c0.277-0.018,1.393-0.207,1.393-0.617c0-0.937,0-1.872,0-2.809c0-0.111-0.288-0.11-0.333-0.111
+				c-8.16-0.093-16.514-0.712-24.653-0.001c-3.279,0.286-6.786,1.249-8.813,4.004c-2.713,3.686-2.301,12.035,0.137,15.545
+				c0.491,0.709,1.273,1.197,1.975,1.664c1.545,1.032-0.824,2.992-1.362,3.587c-1.511,1.672-1.769,1.78-1.932,3.984
+				c-0.416,5.625,5.445,5.656,9.473,5.656c3.97,0,14.502-1.449,16.91,1.864c0.942,1.297,0.566,4.021,0.246,5.357
+				c-0.76,3.164-8.651,2.61-11.277,2.623c-4.151,0.021-8.334-0.357-12.437-0.994c-0.37-0.058-1.715,0.108-1.764,0.615
+				c-0.12,1.248-0.239,2.496-0.359,3.744c0.036,0.062,0.092,0.095,0.168,0.098c7.939,1.405,16.665,1.676,24.608,0.222
+				c5.215-0.954,7.745-3.742,7.829-9.046c0.13-8.188-5.346-9.112-12.318-9.112c-2.997,0-5.993,0-8.99,0
+				c-1.896,0-5.055,0.455-5.625-1.907c-0.471-1.945,0.285-4.189,1.068-5.967c-0.583,0.179-1.166,0.357-1.748,0.536
+				c2.717,0.771,5.418,0.971,8.229,1.021C340.883,498.865,342.701,498.135,341.772,498.118z"/>
+			<path fill="#D5BF51" d="M351.062,478.604c-5.372-0.213-11.929-1.271-17.023,0.952c-2.016,0.88-2.647,2.863-2.943,4.88
+				c-0.376,2.559-0.699,7.359,1.396,9.377c1.707,1.645,4.196,1.648,6.432,1.717c2.957,0.091,6.096,0.168,8.96-0.688
+				c6.244-1.864,4.533-11.434,3.513-16.127c-0.08-0.368-2.021,0.104-1.932,0.518c0.854,3.927,2.257,11.462-1.06,14.635
+				c-1.709,1.635-6.023,0.952-8.216,0.923c-2.195-0.03-4.688-0.104-6.148-1.979c-2.089-2.682-1.611-9.184-0.074-11.938
+				c1.864-3.34,12.446-1.664,15.831-1.529C350.174,479.359,351.994,478.642,351.062,478.604z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M381.387,506.193c6.553,0,11.809-1.152,11.809-1.152l0.648,3.456c0,0-4.681,1.44-12.169,1.44
+			c-11.018,0-16.562-2.016-16.562-17.209c0-4.537,0.647-8.785,1.728-11.306c1.944-4.536,6.265-5.904,13.97-5.904
+			c5.977,0,10.297,0.648,12.529,3.385c1.8,2.304,2.52,5.472,2.52,11.089c0,1.296-0.071,2.52-0.144,3.888
+			c-1.225,0.072-5.761,0.576-11.161,0.576c-5.185,0-9.937-0.288-9.937-0.288l-2.232-1.512c-0.433-0.288-0.72-0.504-1.296-0.504
+			c-1.368,0-1.225,1.296-1.225,2.232c0,3.023,0.072,6.84,2.16,9.145C373.754,505.474,376.922,506.193,381.387,506.193z
+			 M381.242,479.119c-10.585,0-10.873,1.872-11.305,11.161c4.969,0.792,10.801,0.647,10.801,0.647s5.977,0.145,10.297-0.288
+			C391.035,480.199,389.02,479.119,381.242,479.119z"/>
+		<g>
+			<path fill="#D5BF51" d="M381.023,506.789c4.132-0.069,8.265-0.359,12.316-1.21c-0.37-0.093-0.74-0.187-1.11-0.279
+				c0.216,1.152,0.433,2.304,0.648,3.456c0.273-0.266,0.548-0.531,0.821-0.797c-9.129,2.683-24.03,3.769-26.908-7.652
+				c-1.729-6.857-2.097-20.529,6.043-23.354c3.875-1.345,8.527-0.949,12.533-0.624c5.109,0.414,7.958,2.86,9.068,7.745
+				c0.438,1.927,0.425,4.092,0.454,6.025c0.02,1.316,0.577,3.219-0.336,3.295c-3.848,0.322-7.689,0.479-11.552,0.455
+				c-1.704-0.012-6.402,0.65-7.854-0.333c-1.106-0.749-2.291-1.883-3.694-1.959c-4.104-0.221-2.497,6.446-2.12,8.617
+				C370.332,505.934,376.209,506.686,381.023,506.789c0.791,0.017,2.036-1.163,0.727-1.191c-6.383-0.137-10.238-2.022-10.855-8.74
+				c-0.518-5.624,0.363-2.825,3.143-2.126c3.093,0.778,7.001,0.3,10.154,0.32c3.814,0.024,7.607-0.305,11.406-0.584
+				c0.328-0.024,1.065-0.268,1.088-0.694c0.223-4.27,0.733-9.685-1.409-13.582c-2.249-4.091-6.388-4.828-10.719-5.179
+				c-9.217-0.748-17.73,0.566-19.82,10.596c-1.57,7.54-1.768,20.119,6.734,23.647c6.452,2.678,16.02,1.689,22.517-0.221
+				c0.332-0.098,0.904-0.355,0.821-0.797c-0.216-1.152-0.433-2.304-0.648-3.456c-0.083-0.442-0.827-0.339-1.11-0.279
+				c-3.705,0.777-7.525,1.031-11.301,1.095C380.97,505.611,379.719,506.811,381.023,506.789z"/>
+			<path fill="#D5BF51" d="M381.605,478.523c-3.314,0.024-7.312-0.151-10.078,2.003c-2.745,2.139-2.413,6.805-2.56,9.86
+				c-0.018,0.367,0.305,0.446,0.606,0.489c6.934,0.979,14.367,0.95,21.343,0.352c0.308-0.026,1.092-0.27,1.088-0.694
+				c-0.028-3.296,0.172-7.27-2.057-9.962C388.101,478.34,384.153,478.541,381.605,478.523c-0.787-0.005-2.034,1.183-0.727,1.191
+				c2.452,0.016,5.372-0.16,7.283,1.642c2.2,2.074,1.881,6.736,1.903,9.39c0.362-0.231,0.725-0.463,1.088-0.694
+				c-5.939,0.51-12.15,0.48-18.098-0.021c-3.286-0.277-2.014-4.664-1.318-7.066c1.062-3.673,6.179-3.228,9.142-3.249
+				C381.662,479.709,382.912,478.514,381.605,478.523z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M414.793,509.722c-3.672,0-5.544-0.576-6.696-1.729c-1.584-1.512-1.872-4.32-1.872-8.713v-19.801h-6.337v-3.528h6.337
+			v-8.93h4.536v8.93h9.577v3.528h-9.577v21.241c0,2.16,0.072,3.601,1.08,4.536c0.792,0.721,2.16,1.009,4.753,1.009
+			c1.656,0,3.672-0.36,3.672-0.36l0.433,3.168C420.698,509.073,417.098,509.722,414.793,509.722z"/>
+		<g>
+			<path fill="#D5BF51" d="M415.127,509.02c-6.973-0.198-7.862-3.569-7.924-9.812c-0.017-1.711,0-3.423,0-5.134
+				c0-4.89,0-9.778,0-14.667c0-0.507-0.467-0.634-0.893-0.634c-1.369,0-2.737,0-4.106,0c-0.685,0-1.369,0-2.053,0
+				c1.083,0,0.715,0.405,0.715-1.131c0-1.831,0.372-0.983-0.887-0.983c0.685,0,1.368,0,2.053,0c1.369,0,2.737,0,4.106,0
+				c0.379,0,1.064-0.322,1.064-0.78c0-1.806,0-3.61,0-5.415c0-1.007,0-2.014,0-3.021c0-0.134-1.543,0.286,0.721,0.286
+				c0.617,0,1.234,0,1.852,0c0.874,0.133,0.876-0.034,0.007-0.503c0,0.824,0,1.648,0,2.473c0,2.109,0,4.218,0,6.326
+				c0,0.507,0.467,0.634,0.893,0.634c1.756,0,3.512,0,5.267,0c1.105,0,2.21,0,3.314,0c1.267,0,0.104-0.791,0.104,1.131
+				c0,0.501,0,1.002,0,1.503c1.087-0.401,1.287-0.574,0.599-0.52c-0.968,0-1.937,0-2.905,0c-2.068,0-4.138,0-6.206,0
+				c-0.379,0-1.064,0.323-1.064,0.78c0,6.616,0,13.232,0,19.848c0,2.206-0.495,5.881,2.051,6.889
+				c1.902,0.754,3.931,0.717,5.947,0.627c0.395-0.041,0.79-0.082,1.185-0.124c1.033-0.009,1.16-0.108,0.38-0.299
+				c0.129,0.946,0.259,1.893,0.388,2.839c0.263-0.307,0.525-0.613,0.788-0.921c-1.87,0.324-3.745,0.53-5.642,0.604
+				c-1.097,0.043-1.488,1.465-0.172,1.414c2.077-0.081,4.122-0.34,6.169-0.693c0.431-0.075,0.851-0.467,0.788-0.921
+				c-0.145-1.056-0.288-2.112-0.433-3.168c-0.048-0.354-0.594-0.495-0.88-0.448c-2.488,0.411-7.364,1.411-8.463-2.009
+				c-0.344-1.07-0.149-3.231-0.149-4.503c0-3.87,0-7.741,0-11.612c0-1.688,0-3.376,0-5.063c0-0.564,0-1.128,0-1.692
+				c0-1.478,0.195-0.132-0.942-0.132c3.151,0,6.304,0,9.455,0c0.379,0,1.064-0.323,1.064-0.78c0-1.177,0-2.353,0-3.528
+				c0-0.507-0.467-0.634-0.893-0.634c-3.151,0-6.304,0-9.455,0c0.998,0,0.771,0.99,0.771-0.421c0-1.137,0-2.273,0-3.41
+				c0-1.488,0-2.977,0-4.465c0-0.507-0.467-0.634-0.893-0.634c-1.512,0-3.024,0-4.536,0c-0.379,0-1.064,0.322-1.064,0.78
+				c0,2.626,0,5.251,0,7.876c0,1.552-0.223,0.273,0.887,0.273c-0.685,0-1.368,0-2.053,0c-1.369,0-2.738,0-4.106,0
+				c-0.379,0-1.064,0.322-1.064,0.78c0,1.177,0,2.353,0,3.528c0,0.507,0.467,0.634,0.893,0.634c1.656,0,3.312,0,4.968,0
+				c1.607,0,0.477-1.158,0.477,0.833c0,4.4,0,8.801,0,13.201c0,3.202-0.073,6.412,0.188,9.607c0.436,5.339,4.293,6.461,9.025,6.596
+				C415.39,510.45,416.434,509.057,415.127,509.02z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M427.32,475.951h6.625l0.287,1.8c0.072,0.432,0.217,1.152,1.152,1.152c0.576,0,1.08-0.36,1.513-0.648l3.456-2.304
+			c0,0,3.96-0.433,5.904-0.433c1.872,0,3.24,0.072,4.608,0.288l-0.288,4.32c0,0-2.664-0.36-5.616-0.36
+			c-6.049,0-10.225,0.721-10.225,0.721v24.481c0,0.864,0.144,1.08,1.008,1.297l1.872,0.359v2.809H427.32v-2.88l1.872-0.36
+			c0.864-0.145,1.008-0.36,1.008-1.296v-24.482c0-0.864-0.144-1.08-1.008-1.296l-1.872-0.36V475.951z"/>
+		<g>
+			<path fill="#D5BF51" d="M426.688,476.322c1.104,0,2.209,0,3.312,0c1.923,0,2.895-0.645,3.192,1.221
+				c0.318,2,1.763,1.882,3.328,1.332c0.653-0.229,1.224-0.69,1.796-1.071c0.695-0.464,1.391-0.927,2.086-1.391
+				c1.369-0.912-1.497,0,0.624-0.213c0.903-0.091,8.909-0.662,8.858,0.104c-0.091,1.36-0.182,2.721-0.272,4.081
+				c0.588-0.206,1.176-0.411,1.764-0.617c-5.411-0.674-11.068-0.49-16.451,0.393c-0.245,0.04-1.153,0.223-1.153,0.585
+				c0,5.847,0,11.692,0,17.539c0,2.296,0,4.593,0,6.89c0,1.18,0.665,1.351,1.661,1.542c1.735,0.333,1.219,1.445,1.219,2.976
+				c0.532-0.21,1.065-0.42,1.599-0.63c-2.347,0-4.693,0-7.039,0c-3.258,0-2.925,0.422-2.925-2.768
+				c-0.385,0.195-0.77,0.39-1.153,0.585c1.944-0.375,4.033-0.32,4.033-2.698c0-3.763,0-7.525,0-11.288c0-3.98,0-7.961,0-11.94
+				c0-1.112,0.05-1.611-1.062-2.154c-1.997-0.977-1.818-0.598-1.818-3.106c0-0.264-1.932-0.001-1.932,0.518c0,0.936,0,1.872,0,2.808
+				c0.036,0.062,0.092,0.096,0.168,0.1c4.728,0.91,2.712,9.659,2.712,13.302c0,3.267,0,6.532,0,9.799
+				c0,2.003,0.568,3.568-1.727,4.01c-0.245,0.048-1.153,0.219-1.153,0.585c0,0.96,0,1.92,0,2.88c0,0.112,0.288,0.112,0.333,0.112
+				c3.433,0,6.865,0,10.297,0c0.312,0,1.599-0.163,1.599-0.63c0-0.937,0-1.872,0-2.809c-0.036-0.062-0.092-0.096-0.168-0.1
+				c-4.611-0.885-2.712-8.746-2.712-12.312c0-4.576,0-9.151,0-13.727c-0.385,0.195-0.77,0.39-1.153,0.585
+				c4.979-0.817,10.229-0.952,15.23-0.328c0.364,0.045,1.729-0.102,1.764-0.617c0.096-1.44,0.192-2.88,0.288-4.32
+				c-0.036-0.062-0.092-0.096-0.168-0.1c-3.542-0.436-7.883-0.701-11.382,0.236c-2.62,0.702-4.633,4.634-5.371,0.008
+				c-0.019-0.114-0.27-0.112-0.333-0.112c-2.209,0-4.417,0-6.625,0C427.576,475.58,425.762,476.322,426.688,476.322z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M469.01,509.938c-12.89,0-15.193-3.024-15.193-17.209c0-14.186,2.304-17.21,15.193-17.21
+			c12.889,0,15.193,3.024,15.193,17.21C484.203,506.913,481.898,509.938,469.01,509.938z M469.01,479.119
+			c-9.505,0-10.441,1.44-10.441,13.609s0.937,13.608,10.441,13.608s10.44-1.439,10.44-13.608S478.515,479.119,469.01,479.119z"/>
+		<g>
+			<path fill="#D5BF51" d="M469.629,509.516c-3.869-0.033-8.21,0.013-11.41-2.474c-2.979-2.313-3.155-7.126-3.354-10.52
+				c-0.271-4.654-0.304-9.634,0.82-14.18c1.376-5.566,6.627-6.249,11.619-6.392c4.114-0.116,9.086-0.187,12.496,2.463
+				c2.979,2.313,3.155,7.126,3.354,10.521c0.285,4.89,0.393,10.263-1.068,14.98c-1.707,5.512-7.836,5.56-12.667,5.601
+				c-0.557,0.005-2.106,0.854-0.818,0.844c4.206-0.036,8.649-0.177,12.306-2.496c3.656-2.32,3.954-7.48,4.18-11.341
+				c0.319-5.471,0.896-12.976-2.46-17.709c-2.716-3.831-8.896-3.682-12.997-3.717c-3.793-0.032-8.562-0.013-11.916,2.116
+				c-4.041,2.564-4.521,7.305-4.779,11.722c-0.319,5.47-0.896,12.975,2.459,17.708c2.717,3.832,8.897,3.682,12.998,3.717
+				C468.83,510.363,470.605,509.524,469.629,509.516z"/>
+			<path fill="#D5BF51" d="M469.419,478.697c-2.717,0.02-6.237-0.127-8.637,1.464c-2.905,1.926-2.971,5.573-3.121,8.748
+				c-0.215,4.551-1.069,11.401,1.598,15.445c1.779,2.7,6.406,2.385,9.132,2.404c2.756,0.02,6.41,0.151,8.847-1.464
+				c2.905-1.925,2.97-5.572,3.12-8.747c0.215-4.551,1.069-11.401-1.598-15.446c-1.779-2.699-6.405-2.385-9.131-2.404
+				c-0.439-0.003-2.215,0.836-1.238,0.844c2.719,0.02,5.941-0.186,8.115,1.72c1.921,1.683,1.812,5.376,1.92,7.648
+				c0.188,3.988,0.24,8.165-0.359,12.119c-0.617,4.067-3.766,4.782-7.503,4.879c-2.906,0.074-6.713,0.336-9.051-1.711
+				c-1.921-1.683-1.812-5.376-1.92-7.648c-0.188-3.988-0.24-8.165,0.359-12.119c0.695-4.584,4.685-4.859,8.648-4.888
+				C469.157,479.537,470.707,478.688,469.419,478.697z"/>
+		</g>
+	</g>
+	<g>
+		<path d="M491.327,475.951h6.625l0.287,1.8c0.072,0.432,0.217,1.152,1.152,1.152c0.576,0,1.08-0.36,1.513-0.648l3.456-2.304
+			c0,0,3.96-0.433,5.904-0.433c9.937,0,12.529,2.521,12.529,12.241v17.209c0,0.864,0.144,1.152,1.008,1.297l1.872,0.359v2.881
+			h-10.297v-2.881l1.872-0.359c0.864-0.145,1.009-0.433,1.009-1.297v-16.345c0-7.921-1.368-9.433-9.289-9.433
+			c-5.04,0-10.225,1.224-10.225,1.224v24.554c0,0.864,0.144,1.08,1.008,1.297l1.872,0.359v2.809h-10.297v-2.88l1.872-0.36
+			c0.864-0.145,1.008-0.36,1.008-1.296v-24.482c0-0.864-0.144-1.08-1.008-1.296l-1.872-0.36V475.951z"/>
+		<g>
+			<path fill="#D5BF51" d="M490.926,476.401c1.104,0,2.208,0,3.312,0c2.219,0,2.681-0.611,2.961,1.142
+				c0.307,1.922,1.746,2.051,3.355,1.433c2.157-0.829,3.35-2.579,5.627-2.79c4.666-0.433,11.415-0.998,14.255,3.6
+				c2.836,4.591,1.392,13.637,1.392,18.808c0,2.212,0,4.423,0,6.635c0,1.246,0.791,1.374,1.846,1.576
+				c1.606,0.309,1.034,1.528,1.034,2.961c0.456-0.236,0.912-0.473,1.368-0.709c-2.347,0-4.693,0-7.039,0c-1.048,0-2.096,0-3.143,0
+				c0.847,0,0.448-2.036,0.448-2.689c-0.376,0.229-0.751,0.456-1.127,0.685c3.057-0.587,4.008-0.891,4.008-4.041
+				c0-4.665,0-9.33,0-13.995c0-3.872,0.168-8.227-4.051-9.717c-4.911-1.734-11.641-0.357-16.528,0.747
+				c-0.283,0.064-0.866,0.264-0.866,0.629c0,5.863,0,11.727,0,17.591c0,2.303,0,4.606,0,6.91c0,1,0.367,1.345,1.36,1.536
+				c2.023,0.388,1.52,1.063,1.52,2.981c0.456-0.236,0.912-0.473,1.367-0.709c-2.346,0-4.692,0-7.038,0c-1.048,0-2.095,0-3.143,0
+				c0.847,0,0.448-2.035,0.448-2.688c-0.376,0.229-0.751,0.456-1.127,0.685c2.022-0.39,4.007-0.364,4.007-2.798
+				c0-3.763,0-7.525,0-11.288c0-3.98,0-7.961,0-11.94c0-1.238-0.012-1.668-1.189-2.244c-1.947-0.952-1.69-0.509-1.69-3.017
+				c0-0.429-1.932-0.115-1.932,0.518c0,0.936,0,1.872,0,2.808c0,0.141,0.274,0.171,0.353,0.187
+				c4.493,0.864,2.527,9.749,2.527,13.215c0,3.032,0,6.064,0,9.097c0,2.119,0.685,4.143-1.753,4.612
+				c-0.329,0.063-1.127,0.229-1.127,0.685c0,0.96,0,1.92,0,2.88c0,0.204,0.461,0.191,0.564,0.191c3.432,0,6.864,0,10.297,0
+				c0.368,0,1.367-0.205,1.367-0.709c0-0.937,0-1.872,0-2.809c0-0.141-0.274-0.171-0.353-0.187
+				c-4.387-0.842-2.527-8.864-2.527-12.257c0-4.589,0-9.178,0-13.767c-0.289,0.21-0.578,0.419-0.866,0.629
+				c4.669-1.055,12.363-2.802,16.59,0.269c2.398,1.744,1.858,6.377,1.858,8.95c0,3.157,0,6.313,0,9.471
+				c0,2.364,1.021,6.191-1.754,6.725c-0.329,0.063-1.127,0.229-1.127,0.685c0,0.96,0,1.921,0,2.881c0,0.204,0.461,0.191,0.563,0.191
+				c3.433,0,6.865,0,10.297,0c0.369,0,1.368-0.205,1.368-0.709c0-0.96,0-1.921,0-2.881c0-0.141-0.274-0.171-0.353-0.187
+				c-4.463-0.856-2.527-9.524-2.527-13c0-4.711,0.763-10.425-1.803-14.578c-2.77-4.483-10.627-3.669-14.972-3.317
+				c-1.185,0.097-2.543,0.032-3.569,0.716c-2.268,1.512-3.912,3.367-4.498-0.308c-0.033-0.208-0.432-0.191-0.564-0.191
+				c-2.208,0-4.416,0-6.625,0C491.135,475.501,489.631,476.401,490.926,476.401z"/>
+		</g>
+	</g>
+</g>
+</svg>
diff --git a/doc/manual/figs/architecture.png b/doc/manual/figs/architecture.png
new file mode 100644
index 0000000..1a2ec67
Binary files /dev/null and b/doc/manual/figs/architecture.png differ
diff --git a/doc/manual/figs/architecture.svg b/doc/manual/figs/architecture.svg
new file mode 100644
index 0000000..f9923ed
--- /dev/null
+++ b/doc/manual/figs/architecture.svg
@@ -0,0 +1,748 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="810"
+   height="810"
+   id="svg2"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="architecture.svg">
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1.1559256"
+     inkscape:cx="606.42175"
+     inkscape:cy="406.0602"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1920"
+     inkscape:window-height="1176"
+     inkscape:window-x="0"
+     inkscape:window-y="24"
+     inkscape:window-maximized="1"
+     units="in" />
+  <defs
+     id="defs4">
+    <marker
+       style="overflow:visible;"
+       id="Arrow1Mend"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mend">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4320" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow1Mstart"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mstart">
+      <path
+         transform="scale(0.4) translate(10,0)"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4317" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow2Lstart"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Lstart">
+      <path
+         inkscape:connector-curvature="0"
+         transform="matrix(1.1,0,0,1.1,1.1,0)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         id="path3846" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow1Lstart"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow1Lstart">
+      <path
+         inkscape:connector-curvature="0"
+         transform="matrix(0.8,0,0,0.8,10,0)"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+         id="path3828" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow2Lend"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Lend">
+      <path
+         inkscape:connector-curvature="0"
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         id="path3849" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="TriangleOutL"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="TriangleOutL">
+      <path
+         inkscape:connector-curvature="0"
+         transform="scale(0.8,0.8)"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         id="path3971" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow2Lend-0"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Lend">
+      <path
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         id="path3849-8"
+         inkscape:connector-curvature="0" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow2Lend-2"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Lend">
+      <path
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         id="path3849-0"
+         inkscape:connector-curvature="0" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow1Mendy"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mendy">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="marker-start:none;stroke:#000000;stroke-width:1.0pt;fill:#000000;fill-rule:evenodd"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path3104" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow1Mendyf"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mendyf">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="fill-rule:evenodd;marker-start:none;stroke:#0f0fff;stroke-width:1.0pt;fill:#0f0fff"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4334" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow1MendyfL"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1MendyfL">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="marker-start:none;stroke:#dcdcdc;stroke-width:1.0pt;fill:#dcdcdc;fill-rule:evenodd"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4543" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow1MendyfLA"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1MendyfLA">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="fill-rule:evenodd;marker-start:none;stroke:#c0dcdc;stroke-width:1.0pt;fill:#c0dcdc"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4741" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow1MendyfLT"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1MendyfLT">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="fill-rule:evenodd;marker-start:none;stroke:#c0dcdc;stroke-width:1.0pt;fill:#c0dcdc"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4744" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow1MendyfL2"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow1MendyfL2">
+      <path
+         transform="scale(0.4) rotate(180) translate(10,0)"
+         style="fill-rule:evenodd;marker-start:none;stroke:#c0dcdc;stroke-width:1.0pt;fill:#c0dcdc"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         id="path4747" />
+    </marker>
+  </defs>
+  <metadata
+     id="metadata7">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title />
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     transform="translate(0,-242.362)"
+     id="layer1"
+     inkscape:groupmode="layer"
+     inkscape:label="Layer 1">
+    <rect
+       y="525.13312"
+       x="443.63504"
+       height="83.438591"
+       width="84.852814"
+       id="rect2985"
+       style="opacity:1;fill:#c6eaea;fill-opacity:1;fill-rule:nonzero;stroke:#ffffff;stroke-width:0.69999999000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821"
+       y="542.81079"
+       x="457.17929"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="542.81079"
+         x="457.17929"
+         id="tspan4823"
+         sodipodi:role="line">Gadget 1</tspan></text>
+    <rect
+       y="525.13312"
+       x="543.36841"
+       height="83.438591"
+       width="84.852814"
+       id="rect2985-7"
+       style="opacity:1;fill:#c6eaea;fill-opacity:1;fill-rule:nonzero;stroke:#ffffff;stroke-width:0.69999999000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-3"
+       y="542.75134"
+       x="556.96179"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="542.75134"
+         x="556.96179"
+         id="tspan4823-1"
+         sodipodi:role="line">Gadget 2</tspan></text>
+    <rect
+       y="525.13312"
+       x="643.10162"
+       height="83.438591"
+       width="84.852814"
+       id="rect2985-7-9"
+       style="opacity:1;fill:#c6eaea;fill-opacity:1;fill-rule:nonzero;stroke:#ffffff;stroke-width:0.69999999000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-0"
+       y="542.04425"
+       x="656.56897"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="542.04425"
+         x="656.56897"
+         id="tspan4823-8"
+         sodipodi:role="line">Gadget 3</tspan></text>
+    <rect
+       y="407.16122"
+       x="190.73268"
+       height="257.52689"
+       width="239.8537"
+       id="rect2985-0"
+       style="opacity:1;fill:#c6eaea;fill-opacity:1;fill-rule:nonzero;stroke:none" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-1"
+       y="427.9028"
+       x="235.80434"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="427.9028"
+         x="235.80434"
+         id="tspan4823-2"
+         sodipodi:role="line">GadgetStreamController</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       id="rect14081"
+       width="84.852814"
+       height="27.909048"
+       x="325.49469"
+       y="439.86765" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-4"
+       y="459.42972"
+       x="339.05682"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="459.42972"
+         x="339.05682"
+         id="tspan4823-4"
+         sodipodi:role="line">Reader 1</tspan></text>
+    <rect
+       y="476.23312"
+       x="324.98962"
+       height="27.909048"
+       width="84.852814"
+       id="rect2985-8-6"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.40484342000000001;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-4-1"
+       y="494.88498"
+       x="339.10602"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="494.88498"
+         x="339.10602"
+         id="tspan4823-4-3"
+         sodipodi:role="line">Reader 2</tspan></text>
+    <rect
+       y="512.26196"
+       x="324.98962"
+       height="27.909048"
+       width="84.852814"
+       id="rect2985-8-6-0"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.40484342000000001;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-4-1-1"
+       y="530.91382"
+       x="338.97992"
+       style="font-size:12.60000038000000089px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="530.91382"
+         x="338.97992"
+         id="tspan4823-4-3-7"
+         sodipodi:role="line">Reader 3</tspan></text>
+    <rect
+       y="548.93774"
+       x="215.44989"
+       height="27.909048"
+       width="84.852814"
+       id="rect2985-8-8"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.40484342000000001;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-4-0"
+       y="567.5896"
+       x="233.00238"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="567.5896"
+         x="233.00238"
+         id="tspan4823-4-8"
+         sodipodi:role="line">Writer 1</tspan></text>
+    <rect
+       y="584.06647"
+       x="215.44989"
+       height="27.909048"
+       width="84.852814"
+       id="rect2985-8-6-9"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.40484342000000001;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-4-1-7"
+       y="602.71832"
+       x="233.05161"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="602.71832"
+         x="233.05161"
+         id="tspan4823-4-3-0"
+         sodipodi:role="line">Writer 2</tspan></text>
+    <rect
+       y="619.74835"
+       x="215.44989"
+       height="27.909048"
+       width="84.852814"
+       id="rect2985-8-6-0-8"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.40484342000000001;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-4-1-1-8"
+       y="638.40021"
+       x="232.92548"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="638.40021"
+         x="232.92548"
+         id="tspan4823-4-3-7-6"
+         sodipodi:role="line">Writer 3</tspan></text>
+    <rect
+       y="435.50378"
+       x="212.2749"
+       height="83.438591"
+       width="91.202812"
+       id="rect2985-71"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.72599999999999998;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-8"
+       y="452.6991"
+       x="230.46149"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bookshelf Symbol 7;-inkscape-font-specification:'Bookshelf Symbol 7,'"
+       xml:space="preserve"><tspan
+         y="452.6991"
+         x="230.46149"
+         id="tspan4823-41"
+         sodipodi:role="line">Message</tspan><tspan
+         id="tspan5132"
+         y="468.4491"
+         x="230.46149"
+         sodipodi:role="line">Dispatch</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-8-8"
+       y="504.66107"
+       x="217.32932"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bookshelf Symbol 7;-inkscape-font-specification:'Bookshelf Symbol 7,'"
+       xml:space="preserve"><tspan
+         id="tspan5132-9"
+         y="504.66107"
+         x="217.32932"
+         sodipodi:role="line">Message ID?</tspan></text>
+    <rect
+       y="563.71161"
+       x="321.81464"
+       height="83.438591"
+       width="91.202812"
+       id="rect2985-71-6"
+       style="opacity:1;fill:#97d2ea;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.72571987000000004;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-8-5"
+       y="580.90698"
+       x="345.36603"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bookshelf Symbol 7;-inkscape-font-specification:'Bookshelf Symbol 7,'"
+       xml:space="preserve"><tspan
+         id="tspan5132-6"
+         y="580.90698"
+         x="345.36603"
+         sodipodi:role="line">Output</tspan><tspan
+         id="tspan5417"
+         y="596.65698"
+         x="345.36603"
+         sodipodi:role="line">Queue</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-8-8-6"
+       y="632.86896"
+       x="326.86902"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bookshelf Symbol 7;-inkscape-font-specification:'Bookshelf Symbol 7,'"
+       xml:space="preserve"><tspan
+         id="tspan5132-9-3"
+         y="632.86896"
+         x="326.86902"
+         sodipodi:role="line">Message ID?</tspan></text>
+    <rect
+       y="435.50381"
+       x="67.253731"
+       height="83.438591"
+       width="84.852814"
+       id="rect2985-87"
+       style="opacity:1;fill:#c662ea;fill-opacity:1;fill-rule:nonzero;stroke:#ffffff;stroke-width:0.69999999000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-7"
+       y="469.69919"
+       x="88.316109"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="469.69919"
+         x="88.316109"
+         id="tspan4823-3"
+         sodipodi:role="line">Socket</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-7-4"
+       y="496.92044"
+       x="90.438667"
+       style="font-size:12.60000038px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="496.92044"
+         x="90.438667"
+         id="tspan4823-3-3"
+         sodipodi:role="line">TCP/IP</tspan></text>
+    <rect
+       y="533.73926"
+       x="44.306839"
+       height="130.24045"
+       width="130.7466"
+       id="rect2985-87-3-5"
+       style="opacity:1;fill:#c662ea;fill-opacity:1;fill-rule:nonzero;stroke:#ffffff;stroke-width:1.08559929999999993;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4821-7-5-2"
+       y="594.16174"
+       x="110.49312"
+       style="font-size:18px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="594.16174"
+         x="110.49312"
+         id="tspan4823-3-1-3"
+         sodipodi:role="line">CLIENT</tspan><tspan
+         id="tspan7073"
+         y="616.66174"
+         x="110.49312"
+         sodipodi:role="line">APPLICATION</tspan></text>
+    <rect
+       style="opacity:1;fill:#c6eaea;fill-opacity:1;fill-rule:nonzero;stroke:#ffffff;stroke-width:1.11457253000000001;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect10590"
+       width="268.66602"
+       height="66.809937"
+       x="449.53723"
+       y="419.26859" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4108"
+       y="455.50485"
+       x="462.42859"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:28px"
+         y="455.50485"
+         x="462.42859"
+         id="tspan4110"
+         sodipodi:role="line">Shared Toolboxes</tspan></text>
+    <path
+       inkscape:connector-curvature="3"
+       inkscape:connector-type="polyline"
+       id="path4112"
+       d="m 109.68014,533.73926 0,-14.79687"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow1Mstart);marker-mid:none;marker-end:url(#Arrow1Mend)" />
+    <path
+       inkscape:connector-curvature="3"
+       inkscape:connector-type="polyline"
+       id="path4114"
+       d="m 151.0964,475.2028 60.16836,-1e-5"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:none;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path5948"
+       d="m 694.21429,634.21914 -279.73054,0"
+       style="fill:none;stroke:#000000;stroke-width:0.99685216px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path5954"
+       d="m 693.65718,608.32883 0,23.94651"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path8722"
+       d="m 588.6664,365.53288 0,24.74873"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 1;stroke-dashoffset:0;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 1;stroke-dashoffset:0;marker-end:url(#Arrow1Mend)"
+       d="m 483.10546,608.65249 0,23.99112"
+       id="path8968"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="ccc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10592"
+       d="m 421.48615,213.31551 0,124.86871 22.22336,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10790"
+       d="m 409.61686,213.31551 11.86929,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10792"
+       d="m 409.61686,246.71369 11.86929,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10794"
+       d="m 409.8694,284.72068 11.61675,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10800"
+       d="m 303.55084,246.46115 9.97525,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <path
+       sodipodi:nodetypes="cccc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10796"
+       d="m 323.37508,282.82664 -12.43492,-0.21628 0,-72.73098 12.43492,0.21628"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow1Mstart);marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path10798"
+       d="m 313.52609,246.46115 9.84899,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 321.24051,599.22561 -7.9703,0"
+       id="path11782"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 313.25336,599.22561 -11.06721,0"
+       id="path11786"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 204.27544,629.80732 0,-125.06952 -51.77032,0"
+       id="path11788"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 216.14473,629.80732 -11.86929,0"
+       id="path11790"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 216.14473,596.40914 -11.86929,0"
+       id="path11792"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 215.89219,558.40215 -11.61675,0"
+       id="path11794"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path11796"
+       d="m 528.21429,338.18422 13.57143,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path11798"
+       d="m 627.85714,338.18422 13.92858,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path12390"
+       d="m 482.0953,485.43419 0,35.35534"
+       style="stroke-linejoin:miter;marker-end:url(#Arrow1MendyfLA);stroke-opacity:1;stroke:#c0dcdc;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:3.10000000000000009;fill:none" />
+    <path
+       style="stroke-linejoin:miter;marker-end:url(#Arrow1MendyfLT);stroke-opacity:1;stroke:#c0dcdc;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:3.10000000000000009;fill:none"
+       d="m 586.64609,485.4342 0,35.35533"
+       id="path4730"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path4732"
+       d="m 686.14612,485.43419 0,35.35534"
+       style="stroke-linejoin:miter;marker-end:url(#Arrow1MendyfL2);stroke-opacity:1;stroke:#c0dcdc;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:3.10000000000000009;fill:none" />
+    <path
+       sodipodi:nodetypes="ccc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path4753"
+       d="m 315.83503,393.22325 0,-71.72158 -13.64888,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       transform="translate(0,242.362)"
+       inkscape:connector-curvature="0"
+       id="path4957"
+       d="m 315.83503,393.22325 -13.64888,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <rect
+       y="335.89279"
+       x="180.4389"
+       height="348.21826"
+       width="560.12915"
+       id="rect4135"
+       style="fill:none;stroke:#000000;stroke-width:2.16550922;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:2.16550904, 2.16550904;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text4711"
+       y="382.76871"
+       x="356.42432"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="382.76871"
+         x="356.42432"
+         id="tspan4713"
+         sodipodi:role="line">Gadgetron</tspan></text>
+  </g>
+</svg>
diff --git a/doc/manual/figs/arrayfileformat.png b/doc/manual/figs/arrayfileformat.png
new file mode 100644
index 0000000..c211c2f
Binary files /dev/null and b/doc/manual/figs/arrayfileformat.png differ
diff --git a/doc/manual/figs/arrayfileformat.svg b/doc/manual/figs/arrayfileformat.svg
new file mode 100644
index 0000000..6a587e2
--- /dev/null
+++ b/doc/manual/figs/arrayfileformat.svg
@@ -0,0 +1,247 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="744.09448819"
+   height="1052.3622047"
+   id="svg2"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="arrayfileformat.svg"
+   inkscape:export-filename="/home/hansenms/mrprogs/gadgetron/doc/manual/figs/arrayfileformat.png"
+   inkscape:export-xdpi="300"
+   inkscape:export-ydpi="300">
+  <defs
+     id="defs4" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.7"
+     inkscape:cx="-13.284581"
+     inkscape:cy="808.61822"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1746"
+     inkscape:window-height="967"
+     inkscape:window-x="64"
+     inkscape:window-y="24"
+     inkscape:window-maximized="0" />
+  <metadata
+     id="metadata7">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1">
+    <rect
+       style="fill:none;stroke:#000000;stroke-width:3.0999999;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect3775"
+       width="82.85714"
+       height="211.42857"
+       x="111.42857"
+       y="40.933613" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:5, 30;stroke-dashoffset:0"
+       d="m 153.57142,263.79075 0,192.85714"
+       id="path3777"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 112.85714,83.362147 80,0"
+       id="path3779"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="140.27902"
+       y="75.505035"
+       id="text3781"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3783"
+         x="140.27902"
+         y="75.505035">4</tspan></text>
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path3785"
+       d="m 112.85714,125.64792 80,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 112.85714,167.93364 80,0"
+       id="path3787"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path3789"
+       d="m 112.85714,210.21929 80,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <text
+       xml:space="preserve"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="113.8337"
+       y="119.79939"
+       id="text3781-9"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3783-4"
+         x="113.8337"
+         y="119.79939">128</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3816"
+       y="164.08511"
+       x="113.8337"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="164.08511"
+         x="113.8337"
+         id="tspan3818"
+         sodipodi:role="line">128</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3820"
+       y="202.6479"
+       x="139.78098"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="202.6479"
+         x="139.78098"
+         id="tspan3822"
+         sodipodi:role="line">1</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="139.78098"
+       y="245.50505"
+       id="text3824"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3826"
+         x="139.78098"
+         y="245.50505">1</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="-384.66272"
+       y="140.69617"
+       id="text3828"
+       sodipodi:linespacing="125%"
+       transform="matrix(0,-1,1,0,0,0)"><tspan
+         sodipodi:role="line"
+         id="tspan3830"
+         x="-384.66272"
+         y="140.69617">Data</tspan></text>
+    <text
+       transform="matrix(0,-1,1,0,0,0)"
+       sodipodi:linespacing="125%"
+       id="text3832"
+       y="99.267601"
+       x="-224.66272"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="99.267601"
+         x="-224.66272"
+         id="tspan3834"
+         sodipodi:role="line">Header</tspan></text>
+    <text
+       transform="matrix(0,-1,1,0,0,0)"
+       sodipodi:linespacing="125%"
+       id="text3836"
+       y="179.26761"
+       x="-442.37701"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="179.26761"
+         x="-442.37701"
+         id="tspan3838"
+         sodipodi:role="line"
+         style="font-size:20px">(16384 elements)</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="208.87842"
+       y="69.505043"
+       id="text3859"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3861"
+         x="208.87842"
+         y="69.505043"
+         style="font-size:22px">N-dimensions</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3863"
+       y="113.21933"
+       x="200.39746"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:22px"
+         y="113.21933"
+         x="200.39746"
+         id="tspan3865"
+         sodipodi:role="line">length of dim 1</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="200.4834"
+       y="156.07648"
+       id="text3867"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3869"
+         x="200.4834"
+         y="156.07648"
+         style="font-size:22px">length of dim 2</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3871"
+       y="196.6479"
+       x="200.26318"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:22px"
+         y="196.6479"
+         x="200.26318"
+         id="tspan3873"
+         sodipodi:role="line">length of dim 3</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3879"
+       y="237.21933"
+       x="200"
+       style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:22px"
+         y="237.21933"
+         x="200"
+         id="tspan3881"
+         sodipodi:role="line">length of dim 4</tspan></text>
+  </g>
+</svg>
diff --git a/doc/manual/figs/cgsense.png b/doc/manual/figs/cgsense.png
new file mode 100644
index 0000000..dcfe43d
Binary files /dev/null and b/doc/manual/figs/cgsense.png differ
diff --git a/doc/manual/figs/cgsense.svg b/doc/manual/figs/cgsense.svg
new file mode 100644
index 0000000..345d6c9
--- /dev/null
+++ b/doc/manual/figs/cgsense.svg
@@ -0,0 +1,671 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="225"
+   height="540"
+   id="svg7000"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="cgsense.svg">
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1.9404392"
+     inkscape:cx="26.752607"
+     inkscape:cy="293.58724"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     units="in"
+     inkscape:window-width="1440"
+     inkscape:window-height="793"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0" />
+  <defs
+     id="defs7002">
+    <marker
+       style="overflow:visible;"
+       id="Arrow2Mend"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2Mend">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+         id="path5505" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Lend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow1Lend"
+       style="overflow:visible;">
+      <path
+         id="path6118"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
+         transform="scale(0.8) rotate(180) translate(12.5,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow2Lend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow2Lend"
+       style="overflow:visible;">
+      <path
+         id="path6136"
+         style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         transform="scale(1.1) rotate(180) translate(1,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow1Mend"
+       style="overflow:visible;">
+      <path
+         id="path6124"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
+         transform="scale(0.4) rotate(180) translate(10,0)" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2Mendw"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2Mendw">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#000000;stroke-width:0.62500000;fill:#000000"
+         id="path8552" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2MendwQ"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2MendwQ">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#c8c8c8;stroke-width:0.62500000;fill:#c8c8c8"
+         id="path8732" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2MendwQw"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2MendwQw">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#b4b4b4;stroke-width:0.62500000;fill:#b4b4b4"
+         id="path8915" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2MendwQwi"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2MendwQwi">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#828282;stroke-width:0.62500000;fill:#828282"
+         id="path9106" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2MendwQwK"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2MendwQwK">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#000000;stroke-width:0.62500000;fill:#000000"
+         id="path4159" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2MendwQwif"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2MendwQwif">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#000000;stroke-width:0.62500000;fill:#000000"
+         id="path4162" />
+    </marker>
+    <marker
+       style="overflow:visible;"
+       id="Arrow2MendwQwo"
+       refX="0.0"
+       refY="0.0"
+       orient="auto"
+       inkscape:stockid="Arrow2MendwQwo">
+      <path
+         transform="scale(0.6) rotate(180) translate(0,0)"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         style="stroke-linejoin:round;font-size:12.0;fill-rule:evenodd;stroke:#000000;stroke-width:0.62500000;fill:#000000"
+         id="path4165" />
+    </marker>
+  </defs>
+  <metadata
+     id="metadata7005">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title />
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     transform="translate(0,-512.35975)"
+     id="layer1"
+     inkscape:groupmode="layer"
+     inkscape:label="Layer 1">
+    <rect
+       y="523.48932"
+       x="66.106224"
+       height="31.714287"
+       width="104.11561"
+       id="rect5207"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5209"
+       y="533.77509"
+       x="83.459534"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="533.77509"
+         x="83.459534"
+         id="tspan5211"
+         sodipodi:role="line">NoiseAdjustGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5721"
+       y="549.6322"
+       x="91.235802"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:6px"
+         y="549.6322"
+         x="91.235802"
+         id="tspan5723"
+         sodipodi:role="line">Noise Prewhitening</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5725"
+       width="104.11561"
+       height="31.714287"
+       x="66.106224"
+       y="570.19067" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="97.613441"
+       y="580.47638"
+       id="text5727"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5729"
+         x="97.613441"
+         y="580.47638">PCAGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="157.30838"
+       y="858.38953"
+       id="text5731"
+       sodipodi:linespacing="125%"><tspan
+         id="tspan5737"
+         sodipodi:role="line"
+         x="157.30838"
+         y="858.38953"
+         style="font-size:10px;text-align:center;text-anchor:middle">Downstream Gadgets</tspan><tspan
+         sodipodi:role="line"
+         x="157.30838"
+         y="870.88953"
+         style="font-size:10px;text-align:center;text-anchor:middle"
+         id="tspan11864">Image Scaling, etc.</tspan></text>
+    <rect
+       y="616.89203"
+       x="66.106224"
+       height="31.714287"
+       width="104.11561"
+       id="rect5768"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5770"
+       y="625.89203"
+       x="80.130241"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="625.89203"
+         x="80.130241"
+         id="tspan5772"
+         sodipodi:role="line">CoilReductionGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5774"
+       y="637.03485"
+       x="118.01754"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="637.03485"
+         x="118.01754"
+         sodipodi:role="line"
+         id="tspan5778">Reduce Channels</tspan><tspan
+         id="tspan5782"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="644.53485"
+         x="118.01754"
+         sodipodi:role="line">e.g. 32 -> 16</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="116.98685"
+       y="590.65363"
+       id="text11971"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="116.98685"
+         y="590.65363"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan11975">Virtual Channels</tspan><tspan
+         sodipodi:role="line"
+         x="116.98685"
+         y="598.15363"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan11979">Principal Component Analysis</tspan></text>
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path7323"
+       d="m 118.16403,555.55174 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 118.16403,602.12825 0,12.85714"
+       id="path7511"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path7513"
+       d="m 118.16403,648.78769 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.72486401000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect7752"
+       width="210.44579"
+       height="172.81833"
+       x="9.1387424"
+       y="664.21027" />
+    <rect
+       y="686.64551"
+       x="23.571426"
+       height="43.214283"
+       width="183.57141"
+       id="rect7770"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       y="699.21692"
+       x="27.5"
+       height="22.857143"
+       width="7.5"
+       id="rect8298"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="49.807228"
+       y="694.1322"
+       id="text8304"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan8306"
+         x="49.807228"
+         y="694.1322"
+         style="font-size:6px">Circular Data Buffer</tspan></text>
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8315"
+       width="7.5"
+       height="22.857143"
+       x="38.547619"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="49.595238"
+       height="22.857143"
+       width="7.5"
+       id="rect8317"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8319"
+       width="7.5"
+       height="22.857143"
+       x="60.642849"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="71.690475"
+       height="22.857143"
+       width="7.5"
+       id="rect8321"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8323"
+       width="7.5"
+       height="22.857143"
+       x="82.738098"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="93.785713"
+       height="22.857143"
+       width="7.5"
+       id="rect8325"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8327"
+       width="7.5"
+       height="22.857143"
+       x="104.83333"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="115.88095"
+       height="22.857143"
+       width="7.5"
+       id="rect8329"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8331"
+       width="7.5"
+       height="22.857143"
+       x="126.92857"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="137.9762"
+       height="22.857143"
+       width="7.5"
+       id="rect8333"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8335"
+       width="7.5"
+       height="22.857143"
+       x="149.0238"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="160.07143"
+       height="22.857143"
+       width="7.5"
+       id="rect8337"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8339"
+       width="7.5"
+       height="22.857143"
+       x="171.11905"
+       y="699.21692" />
+    <rect
+       y="699.21692"
+       x="182.16666"
+       height="22.857143"
+       width="7.5"
+       id="rect8341"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect8343"
+       width="7.5"
+       height="22.857143"
+       x="193.21428"
+       y="699.21692" />
+    <rect
+       y="690.43115"
+       x="146.42857"
+       height="35"
+       width="57.857143"
+       id="rect8347"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path8349"
+       d="m 41.071429,729.7169 -0.357143,21.60715"
+       style="stroke-linejoin:miter;marker-end:url(#Arrow2MendwQwK);stroke-opacity:1;stroke:#000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:2.50000000000000000;fill:none" />
+    <path
+       style="stroke-linejoin:miter;marker-end:url(#Arrow2MendwQwo);stroke-opacity:1;stroke:#000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:2.50000000000000000;fill:none"
+       d="m 94.071429,729.7169 -0.357143,21.60715"
+       id="path9084"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9086"
+       d="m 177.5,725.43119 -0.35714,25.89286"
+       style="stroke-linejoin:miter;marker-end:url(#Arrow2MendwQwif);stroke-opacity:1;stroke:#000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:2.50000000000000000;fill:none" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect9283"
+       width="48.214287"
+       height="23.571428"
+       x="17.050394"
+       y="755.28833" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text9287"
+       y="765.06079"
+       x="41.078438"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="765.06079"
+         x="41.078438"
+         id="tspan9289"
+         sodipodi:role="line">Coil Sensitivity</tspan><tspan
+         id="tspan9291"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="772.56079"
+         x="41.078438"
+         sodipodi:role="line">Map (B1 map)</tspan></text>
+    <rect
+       y="755.28833"
+       x="69.764679"
+       height="23.571428"
+       width="48.214287"
+       id="rect9301"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="93.792725"
+       y="765.06079"
+       id="text9303"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="93.792725"
+         y="765.06079"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan9307">Regularization</tspan><tspan
+         sodipodi:role="line"
+         x="93.792725"
+         y="772.56079"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan9311">Mask</tspan></text>
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect9313"
+       width="48.214287"
+       height="23.571428"
+       x="154.40753"
+       y="755.28833" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text9315"
+       y="765.56085"
+       x="178.47806"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan9319"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="765.56085"
+         x="178.47806"
+         sodipodi:role="line">Undersampled</tspan><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="773.06085"
+         x="178.47806"
+         sodipodi:role="line"
+         id="tspan10825">Data Frame</tspan></text>
+    <path
+       sodipodi:nodetypes="cccc"
+       inkscape:connector-curvature="0"
+       id="path10625"
+       d="m 118.21429,664.45068 0,18.30193 78.75,0 0,15.17856"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
+    <rect
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none"
+       id="rect10831"
+       width="85.000008"
+       height="23.571428"
+       x="45.836105"
+       y="801.21692" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text10833"
+       y="811.48944"
+       x="87.007004"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan10837"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="811.48944"
+         x="87.007004"
+         sodipodi:role="line">Conjugate Gradient Solver</tspan><tspan
+         id="tspan11996"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="818.98944"
+         x="87.007004"
+         sodipodi:role="line">GPU Based</tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 42.321428,779.0026 0,9.68751 52.053572,0 0,11.38392"
+       id="path10841"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 180.35714,779.0026 0,9.68751 -111.785711,0"
+       id="path11039"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 94.375,779.0026 0,9.82144"
+       id="path11041"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <rect
+       y="801.21692"
+       x="143.82033"
+       height="23.571428"
+       width="27.500006"
+       id="rect11049"
+       style="fill:#000000;fill-opacity:0.19607843;stroke:none" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="157.43556"
+       y="814.56561"
+       id="text11051"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="157.43556"
+         y="814.56561"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan11053">Image</tspan></text>
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path11065"
+       d="m 130.76209,813.00263 11.74695,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-mid:none;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path7521"
+       d="m 157.27953,825.07986 0,23.26505"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <text
+       transform="translate(0,512.35975)"
+       sodipodi:linespacing="125%"
+       id="text11990"
+       y="290.05658"
+       x="126.77542"
+       style="font-size:12px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan11992"
+         sodipodi:role="line" /></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text11998"
+       y="678.21765"
+       x="12.88368"
+       style="font-size:12px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="678.21765"
+         x="12.88368"
+         id="tspan12000"
+         sodipodi:role="line">CGSenseGadget</tspan></text>
+  </g>
+</svg>
diff --git a/doc/manual/figs/examplecgsenseresult.png b/doc/manual/figs/examplecgsenseresult.png
new file mode 100644
index 0000000..6745b60
Binary files /dev/null and b/doc/manual/figs/examplecgsenseresult.png differ
diff --git a/doc/manual/figs/examplegrapparesult.png b/doc/manual/figs/examplegrapparesult.png
new file mode 100644
index 0000000..b2d0ca3
Binary files /dev/null and b/doc/manual/figs/examplegrapparesult.png differ
diff --git a/doc/manual/figs/examplelibresult.png b/doc/manual/figs/examplelibresult.png
new file mode 100644
index 0000000..21cfd1e
Binary files /dev/null and b/doc/manual/figs/examplelibresult.png differ
diff --git a/doc/manual/figs/gadget.png b/doc/manual/figs/gadget.png
new file mode 100644
index 0000000..48548bb
Binary files /dev/null and b/doc/manual/figs/gadget.png differ
diff --git a/doc/manual/figs/gadget.svg b/doc/manual/figs/gadget.svg
new file mode 100644
index 0000000..7aafdca
--- /dev/null
+++ b/doc/manual/figs/gadget.svg
@@ -0,0 +1,573 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="225"
+   height="540"
+   id="svg8632"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="gadget.svg"
+   inkscape:export-filename="/home/hansenms/mrprogs/gadgetron/doc/manual/figs/gadget.png"
+   inkscape:export-xdpi="299.97357"
+   inkscape:export-ydpi="299.97357">
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="1"
+     inkscape:pageshadow="2"
+     inkscape:zoom="2.8"
+     inkscape:cx="76.089576"
+     inkscape:cy="420.2177"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1920"
+     inkscape:window-height="1176"
+     inkscape:window-x="0"
+     inkscape:window-y="24"
+     inkscape:window-maximized="1"
+     units="in" />
+  <defs
+     id="defs8634">
+    <marker
+       inkscape:stockid="Arrow2Lend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow2Lend"
+       style="overflow:visible">
+      <path
+         id="path3849"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+         inkscape:connector-curvature="0" />
+    </marker>
+    <marker
+       inkscape:stockid="TriangleOutL"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="TriangleOutL-9"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path4614-0"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="scale(0.8,0.8)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow1Mend"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path3972"
+         d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow2Lend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow2Lend-6"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path3849-4"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="TriangleOutL-9B"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="TriangleOutL-9B"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path5674"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         style="marker-start:none;stroke:#7dbbc1;stroke-width:1pt;fill:#7dbbc1;fill-rule:evenodd"
+         transform="scale(0.8,0.8)" />
+    </marker>
+    <marker
+       inkscape:stockid="TriangleOutL-9Bp"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="TriangleOutL-9Bp"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path5828"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         style="fill-rule:evenodd;marker-start:none;stroke:#053df0;stroke-width:1pt;fill:#053df0"
+         transform="scale(0.8,0.8)" />
+    </marker>
+  </defs>
+  <metadata
+     id="metadata8637">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title />
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(0,-512.35975)">
+    <path
+       style="opacity:0.29999999999999999;stroke-linejoin:miter;marker-end:url(#TriangleOutL-9Bp);stroke-opacity:1;stroke:#053df0;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;stroke-width:1.16955005999999995;fill:none"
+       d="m 144.9696,545.92479 c 105.22119,20.68554 72.64169,110.52695 -4.16809,114.10406 C 53.424124,659.61535 9.1283266,578.98608 115.88548,545.62781"
+       id="path4372"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccc"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="571.45758" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="580.67151" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1-9"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="591.72815" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1-9-1"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="601.55634" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1-9-6"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="610.7702" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1-9-7"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="620.59839" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1-9-7-4"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="631.65503" />
+    <rect
+       style="opacity:0.93999999000000001;fill:#0085cb;fill-opacity:1;stroke:#000000;stroke-width:0.42998171000000002;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.85996340000000004, 0.42998170000000002;stroke-dashoffset:0"
+       id="rect9203-1-9-7-4-1"
+       width="25.184643"
+       height="6.1425958"
+       x="16.957426"
+       y="643.94025" />
+    <rect
+       style="opacity:0.93999999;fill:none;stroke:#000000;stroke-width:0.51803416;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1.03606836, 0.51803418;stroke-dashoffset:0"
+       id="rect9696"
+       width="39.345463"
+       height="90.463341"
+       x="9.8770142"
+       y="565.91284" />
+    <text
+       xml:space="preserve"
+       style="font-size:4.72979879px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="10.547411"
+       y="561.62946"
+       id="text9698"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan9700"
+         x="10.547411"
+         y="561.62946">Message Queue</tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
+       d="m 42.770819,574.65177 43.740043,0"
+       id="path9715"
+       inkscape:connector-curvature="0"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="56.739758"
+       y="571.4574"
+       id="text9905"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9907"
+         x="56.739758"
+         y="571.4574"
+         style="font-size:5.73308945px">Dequeue</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:3.29999995px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="-0.90061849"
+       y="609.21912"
+       id="text9939"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9941"
+         x="-0.90061849"
+         y="609.21912" /></text>
+    <text
+       xml:space="preserve"
+       style="font-size:13.75941467px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       x="81.228851"
+       y="534.04047"
+       id="text9943"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9945"
+         x="81.228851"
+         y="534.04047">Gadget</tspan></text>
+    <rect
+       style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none"
+       id="rect9947-1"
+       width="27.363867"
+       height="13.464759"
+       x="88.395607"
+       y="585.8028"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <g
+       id="g9982"
+       transform="matrix(0.4299817,0,0,0.4299817,-85.56041,532.0746)"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741">
+      <text
+         sodipodi:linespacing="125%"
+         id="text9949-9"
+         y="137.66905"
+         x="436.13882"
+         style="font-size:11px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           y="137.66905"
+           x="436.13882"
+           id="tspan9951-3"
+           sodipodi:role="line">Message</tspan><tspan
+           id="tspan9953-3"
+           y="151.41905"
+           x="436.13882"
+           sodipodi:role="line">Block</tspan></text>
+    </g>
+    <rect
+       style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none"
+       id="rect9947-1-8"
+       width="27.363867"
+       height="13.464759"
+       x="88.395607"
+       y="603.90387"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <g
+       id="g9982-9"
+       transform="matrix(0.4299817,0,0,0.4299817,-85.56041,550.17533)"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741">
+      <text
+         sodipodi:linespacing="125%"
+         id="text9949-9-5"
+         y="137.66905"
+         x="436.13882"
+         style="font-size:11px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           y="137.66905"
+           x="436.13882"
+           id="tspan9951-3-5"
+           sodipodi:role="line">Message</tspan><tspan
+           id="tspan9953-3-1"
+           y="151.41905"
+           x="436.13882"
+           sodipodi:role="line">Block</tspan></text>
+    </g>
+    <rect
+       style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none"
+       id="rect9947-1-8-4"
+       width="27.363867"
+       height="13.464759"
+       x="88.395607"
+       y="622.00421"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <g
+       id="g9982-9-2"
+       transform="matrix(0.4299817,0,0,0.4299817,-85.56041,568.27603)"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741">
+      <text
+         sodipodi:linespacing="125%"
+         id="text9949-9-5-9"
+         y="137.66905"
+         x="436.13882"
+         style="font-size:11px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           y="137.66905"
+           x="436.13882"
+           id="tspan9951-3-5-4"
+           sodipodi:role="line">Message</tspan><tspan
+           id="tspan9953-3-1-1"
+           y="151.41905"
+           x="436.13882"
+           sodipodi:role="line">Block</tspan></text>
+    </g>
+    <rect
+       style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none"
+       id="rect9947-1-8-41"
+       width="27.363867"
+       height="13.464759"
+       x="88.395607"
+       y="567.70215"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <g
+       id="g9982-9-8"
+       transform="matrix(0.4299817,0,0,0.4299817,-85.56041,513.97391)"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741">
+      <text
+         sodipodi:linespacing="125%"
+         id="text9949-9-5-1"
+         y="137.66905"
+         x="436.13882"
+         style="font-size:11px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           y="137.66905"
+           x="436.13882"
+           id="tspan9951-3-5-7"
+           sodipodi:role="line">Message</tspan><tspan
+           id="tspan9953-3-1-9"
+           y="151.41905"
+           x="436.13882"
+           sodipodi:role="line">Block</tspan></text>
+    </g>
+    <rect
+       style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none"
+       id="rect9947-1-8-41-5"
+       width="53.42469"
+       height="13.464759"
+       x="136.60361"
+       y="593.30469"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <text
+       xml:space="preserve"
+       style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="145.20862"
+       y="601.80029"
+       id="text9905-5"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9907-6"
+         x="145.20862"
+         y="601.80029">Data Validation</tspan></text>
+    <rect
+       style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none"
+       id="rect9947-1-8-41-5-1"
+       width="53.42469"
+       height="13.464759"
+       x="136.60361"
+       y="621.99536"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <text
+       xml:space="preserve"
+       style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="144.2594"
+       y="630.03259"
+       id="text9905-5-8"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9907-6-9"
+         x="144.2594"
+         y="630.03259">Data Processing</tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend-6)"
+       d="m 115.75949,580.15476 31.4533,13.14992"
+       id="path10174"
+       inkscape:connector-type="polyline"
+       inkscape:connector-curvature="0"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend-6)"
+       d="m 115.75949,594.21141 20.84413,2.55339"
+       id="path10176"
+       inkscape:connector-type="polyline"
+       inkscape:connector-curvature="0"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend-6)"
+       d="m 115.75949,608.26805 20.84413,-3.6077"
+       id="path10178"
+       inkscape:connector-type="polyline"
+       inkscape:connector-curvature="0"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend-6)"
+       d="m 115.75949,622.32465 33.1912,-15.55524"
+       id="path10180"
+       inkscape:connector-type="polyline"
+       inkscape:connector-curvature="0"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend-6)"
+       d="m 163.31598,606.76941 0,15.22619"
+       id="path10182"
+       inkscape:connector-type="polyline"
+       inkscape:connector-curvature="0"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <text
+       xml:space="preserve"
+       style="font-size:6.87970734px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       x="119.88577"
+       y="572.26294"
+       id="text10608"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan10610"
+         x="119.88577"
+         y="572.26294">Execution Thread(s)</tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.42998168;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend-6)"
+       d="m 29.753802,671.11779 0,-20.47533"
+       id="path13563"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <text
+       xml:space="preserve"
+       style="font-size:17.19926834px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="44.491711"
+       y="672.14148"
+       id="text13753"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         x="44.491711"
+         y="672.14148"
+         id="tspan13757"
+         style="font-size:5.73308945px;text-align:center;text-anchor:middle">Enqueue</tspan><tspan
+         sodipodi:role="line"
+         x="44.491711"
+         y="679.30786"
+         id="tspan13761"
+         style="font-size:5.73308945px;text-align:center;text-anchor:middle">(From Upstream Gadget)</tspan></text>
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path13776"
+       d="m 163.48324,635.44182 0,26.36199"
+       style="fill:none;stroke:#000000;stroke-width:0.42998168;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend-6)"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text13778"
+       y="671.62988"
+       x="160.68909"
+       style="font-size:17.19926834px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         style="font-size:5.73308945px;text-align:center;text-anchor:middle"
+         id="tspan13782"
+         y="671.62988"
+         x="160.68909"
+         sodipodi:role="line">Pass on data</tspan><tspan
+         style="font-size:5.73308945px;text-align:center;text-anchor:middle"
+         y="678.79626"
+         x="160.68909"
+         sodipodi:role="line"
+         id="tspan13786">to downstream Gadget</tspan></text>
+  </g>
+</svg>
diff --git a/doc/manual/figs/grappa.png b/doc/manual/figs/grappa.png
new file mode 100644
index 0000000..ca9e219
Binary files /dev/null and b/doc/manual/figs/grappa.png differ
diff --git a/doc/manual/figs/grappa.svg b/doc/manual/figs/grappa.svg
new file mode 100644
index 0000000..2835440
--- /dev/null
+++ b/doc/manual/figs/grappa.svg
@@ -0,0 +1,594 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="270"
+   height="540"
+   id="svg5199"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="grappa.svg"
+   inkscape:export-filename="/home/hansenms/mrprogs/gadgetron/doc/manual/figs/grappa.png"
+   inkscape:export-xdpi="299.97357"
+   inkscape:export-ydpi="299.97357">
+  <defs
+     id="defs5201">
+    <marker
+       inkscape:stockid="Arrow2Mend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow2Mend"
+       style="overflow:visible;">
+      <path
+         id="path7268"
+         style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         transform="scale(0.6) rotate(180) translate(0,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow1Mend"
+       style="overflow:visible;">
+      <path
+         id="path6124"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
+         transform="scale(0.4) rotate(180) translate(10,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow2Lend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow2Lend"
+       style="overflow:visible;">
+      <path
+         id="path6136"
+         style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+         transform="scale(1.1) rotate(180) translate(1,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Lend"
+       orient="auto"
+       refY="0.0"
+       refX="0.0"
+       id="Arrow1Lend"
+       style="overflow:visible;">
+      <path
+         id="path6118"
+         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
+         transform="scale(0.8) rotate(180) translate(12.5,0)" />
+    </marker>
+  </defs>
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1.979899"
+     inkscape:cx="189.05381"
+     inkscape:cy="284.36943"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     units="in"
+     inkscape:window-width="1629"
+     inkscape:window-height="1147"
+     inkscape:window-x="474"
+     inkscape:window-y="109"
+     inkscape:window-maximized="0" />
+  <metadata
+     id="metadata5204">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(0,-512.35975)">
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5207"
+       width="104.11561"
+       height="31.714287"
+       x="19.316126"
+       y="532.71069" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="36.669441"
+       y="542.99646"
+       id="text5209"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5211"
+         x="36.669441"
+         y="542.99646">NoiseAdjustGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="44.445709"
+       y="558.85358"
+       id="text5721"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5723"
+         x="44.445709"
+         y="558.85358"
+         style="font-size:6px">Noise Prewhitening</tspan></text>
+    <rect
+       y="627.30786"
+       x="19.316126"
+       height="31.714287"
+       width="104.11561"
+       id="rect5725"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5727"
+       y="637.59357"
+       x="50.823345"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="637.59357"
+         x="50.823345"
+         id="tspan5729"
+         sodipodi:role="line">PCAGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5731"
+       y="647.45068"
+       x="71.22744"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="647.45068"
+         x="71.22744"
+         id="tspan5733"
+         sodipodi:role="line">Virtual Channels</tspan><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="654.95068"
+         x="71.22744"
+         sodipodi:role="line"
+         id="tspan5737">Principal Components</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5768"
+       width="104.11561"
+       height="31.714287"
+       x="19.316126"
+       y="674.99121" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="33.340145"
+       y="683.99121"
+       id="text5770"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5772"
+         x="33.340145"
+         y="683.99121">CoilReductionGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="71.22744"
+       y="695.13403"
+       id="text5774"
+       sodipodi:linespacing="125%"><tspan
+         id="tspan5778"
+         sodipodi:role="line"
+         x="71.22744"
+         y="695.13403"
+         style="font-size:6px;text-align:center;text-anchor:middle">Reduce Channels</tspan><tspan
+         sodipodi:role="line"
+         x="71.22744"
+         y="702.63403"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5782">e.g. 32 -> 16</tspan></text>
+    <rect
+       y="580.81476"
+       x="19.316126"
+       height="31.714287"
+       width="104.11561"
+       id="rect5830"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5832"
+       y="588.52905"
+       x="27.484425"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="588.52905"
+         x="27.484425"
+         id="tspan5834"
+         sodipodi:role="line"
+         style="font-size:5.4000001px">RemoveROOversamplingGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5836"
+       y="600.95764"
+       x="71.337303"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan5840"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="600.95764"
+         x="71.337303"
+         sodipodi:role="line">Reduce Readout Length</tspan><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="608.45764"
+         x="71.337303"
+         sodipodi:role="line"
+         id="tspan5895">e.g. 256 -> 128</tspan></text>
+    <rect
+       y="731.96539"
+       x="19.316126"
+       height="31.714287"
+       width="104.11561"
+       id="rect5869"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5871"
+       y="740.10822"
+       x="44.801079"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="740.10822"
+         x="44.801079"
+         id="tspan5873"
+         sodipodi:role="line">GrappaGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5875"
+       y="752.10822"
+       x="71.353416"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan5879"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="752.10822"
+         x="71.353416"
+         sodipodi:role="line">Calculate Image Space</tspan><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="759.60822"
+         x="71.353416"
+         sodipodi:role="line"
+         id="tspan5899">GRAPPA Unmixing Coefficients</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5881"
+       width="104.11561"
+       height="31.714287"
+       x="19.316126"
+       y="778.66675" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="27.672955"
+       y="787.66675"
+       id="text5883"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5885"
+         x="27.672955"
+         y="787.66675">GrappaUnmixingGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="71.477928"
+       y="798.80963"
+       id="text5887"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="71.477928"
+         y="798.80963"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5891">FFT of Raw Data</tspan><tspan
+         sodipodi:role="line"
+         x="71.477928"
+         y="806.30963"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5903">GRAPPA Unmixing</tspan></text>
+    <rect
+       y="825.3681"
+       x="19.316126"
+       height="31.714287"
+       width="104.11561"
+       id="rect5905"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5907"
+       y="834.3681"
+       x="45.001469"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="834.3681"
+         x="45.001469"
+         id="tspan5909"
+         sodipodi:role="line">ExtractGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5911"
+       y="850.56177"
+       x="71.177635"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan5915"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="850.56177"
+         x="71.177635"
+         sodipodi:role="line">Extract Magnitude from Complex</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5917"
+       width="104.11561"
+       height="31.714287"
+       x="19.316126"
+       y="872.06946" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="40.186821"
+       y="881.06946"
+       id="text5919"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5921"
+         x="40.186821"
+         y="881.06946">AutoScaleGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="71.174706"
+       y="892.21234"
+       id="text5923"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="71.174706"
+         y="892.21234"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5927">Histogram based scaling</tspan><tspan
+         sodipodi:role="line"
+         x="71.174706"
+         y="899.71234"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5945">prior to conversion to short</tspan></text>
+    <rect
+       y="918.77081"
+       x="19.316126"
+       height="31.714287"
+       width="104.11561"
+       id="rect5929"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5931"
+       y="927.77081"
+       x="35.558502"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="927.77081"
+         x="35.558502"
+         id="tspan5933"
+         sodipodi:role="line">FloatToShortGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5935"
+       y="940.42896"
+       x="71.353416"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         id="tspan5939"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="940.42896"
+         x="71.353416"
+         sodipodi:role="line">Generate 16-bit unsigned images</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221000000000;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5975"
+       width="104.11561"
+       height="31.714287"
+       x="19.316126"
+       y="965.47217" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="36.374126"
+       y="974.47217"
+       id="text5977"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5979"
+         x="36.374126"
+         y="974.47217">ImageFinishGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="71.152733"
+       y="987.13031"
+       id="text5981"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="71.152733"
+         y="987.13031"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5985">Return Images to Gadgetron</tspan><tspan
+         sodipodi:role="line"
+         x="71.152733"
+         y="994.63031"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5989">Images are forwarded to client</tspan></text>
+    <rect
+       style="fill:#97d25d;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.29617276999999997;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect6068"
+       width="103.20275"
+       height="42.245083"
+       x="139.32896"
+       y="724.11749" />
+    <text
+       xml:space="preserve"
+       style="font-size:12px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="146.82634"
+       y="735.22919"
+       id="text6070"
+       sodipodi:linespacing="125%"><tspan
+         style="font-size:10px"
+         sodipodi:role="line"
+         id="tspan6072"
+         x="146.82634"
+         y="735.22919">GrappaCalculator</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text6074"
+       y="751.39166"
+       x="190.89909"
+       style="font-size:12px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:8px;text-align:center;text-anchor:middle"
+         y="751.39166"
+         x="190.89909"
+         id="tspan6076"
+         sodipodi:role="line">GPU Based</tspan><tspan
+         id="tspan6104"
+         style="font-size:8px;text-align:center;text-anchor:middle"
+         y="761.39166"
+         x="192.17253"
+         sodipodi:role="line">Active Object </tspan></text>
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path6110"
+       d="m 123.92093,746.95015 14.30769,0"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path6930"
+       d="m 190.93035,766.96331 0,13.8896"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <rect
+       y="782.74701"
+       x="137.76913"
+       height="32.324883"
+       width="106.32242"
+       id="rect7118"
+       style="fill:#97d25d;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.28393653000000002;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text7120"
+       y="798.90942"
+       x="142.00163"
+       style="font-size:12px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:9px"
+         y="798.90942"
+         x="142.00163"
+         id="tspan7122"
+         sodipodi:role="line">Image Space Weights</tspan></text>
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path7515"
+       d="m 71.37393,707.32695 0,22.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 71.37393,763.79886 0,12.85714"
+       id="path7517"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 71.37393,810.43664 0,12.85714"
+       id="path7521"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path7523"
+       d="m 71.37393,857.28363 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 71.37393,903.96474 0,12.85714"
+       id="path7525"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path7527"
+       d="m 71.37393,950.64584 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path4301"
+       d="m 71.37393,565.4261 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 138.97862,798.95015 -14.30769,0"
+       id="path6496"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 71.37393,613.40834 0,12.85714"
+       id="path8376"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path8378"
+       d="m 71.37393,660.38044 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <rect
+       style="fill:none;stroke:#000000;stroke-width:0.92284429;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1.84568849, 0.92284424;stroke-dashoffset:0"
+       id="rect8981"
+       width="119.12972"
+       height="119.48805"
+       x="131.82491"
+       y="702.52356" />
+    <text
+       xml:space="preserve"
+       style="font-size:6px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="135.86551"
+       y="713.95862"
+       id="text8983"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan8985"
+         x="135.86551"
+         y="713.95862">Managed by threads outside Gadget</tspan></text>
+  </g>
+</svg>
diff --git a/doc/manual/figs/hdfview_image_view.png b/doc/manual/figs/hdfview_image_view.png
new file mode 100644
index 0000000..05248e3
Binary files /dev/null and b/doc/manual/figs/hdfview_image_view.png differ
diff --git a/doc/manual/figs/hdfview_image_view_setting.png b/doc/manual/figs/hdfview_image_view_setting.png
new file mode 100644
index 0000000..0595227
Binary files /dev/null and b/doc/manual/figs/hdfview_image_view_setting.png differ
diff --git a/doc/manual/figs/hdfview_mri_testdata.png b/doc/manual/figs/hdfview_mri_testdata.png
new file mode 100644
index 0000000..64faeb3
Binary files /dev/null and b/doc/manual/figs/hdfview_mri_testdata.png differ
diff --git a/doc/manual/figs/math/HOWTO.txt b/doc/manual/figs/math/HOWTO.txt
new file mode 100644
index 0000000..e746453
--- /dev/null
+++ b/doc/manual/figs/math/HOWTO.txt
@@ -0,0 +1,5 @@
+Images converted using this process:
+http://infohost.nmt.edu/tcc/help/pubs/docbook/web/math-procedure.html
+
+Sadly I could not get the PDF/EPS/SVG accepted in fo. 
+Hence reverting to jpeg for ro also.
diff --git a/doc/manual/figs/math/lls.jpg b/doc/manual/figs/math/lls.jpg
new file mode 100644
index 0000000..2d20028
Binary files /dev/null and b/doc/manual/figs/math/lls.jpg differ
diff --git a/doc/manual/figs/math/lls.tex b/doc/manual/figs/math/lls.tex
new file mode 100644
index 0000000..bea6fdc
--- /dev/null
+++ b/doc/manual/figs/math/lls.tex
@@ -0,0 +1,11 @@
+% lamath.tex: Sample of LaTeX math for inclusion in DocBook
+%
+\documentclass[leqno]{article}
+\usepackage{bm} 
+\pagestyle{empty}
+\setlength{\textwidth}{6in}
+\begin{document}
+\[
+\min_{\mathbf{u}} \left( \left\| \mathbf{E}\mathbf{u}-\mathbf{m} \right\|_2^2 + \lambda \left\| \mathbf{R} \mathbf{u} -\mathbf{p} \right\|_2^2 \right)
+\]
+\end{document}
diff --git a/doc/manual/figs/math/lls_form.jpg b/doc/manual/figs/math/lls_form.jpg
new file mode 100644
index 0000000..9e7b76f
Binary files /dev/null and b/doc/manual/figs/math/lls_form.jpg differ
diff --git a/doc/manual/figs/math/lls_form.tex b/doc/manual/figs/math/lls_form.tex
new file mode 100644
index 0000000..9aa4231
--- /dev/null
+++ b/doc/manual/figs/math/lls_form.tex
@@ -0,0 +1,11 @@
+% lamath.tex: Sample of LaTeX math for inclusion in DocBook
+%
+\documentclass[leqno]{article}
+\usepackage{bm} 
+\pagestyle{empty}
+\setlength{\textwidth}{6in}
+\begin{document}
+\[
+\left( \mathbf{E}^H \mathbf{E} + \lambda \mathbf{R}^H \mathbf{R} \right) \mathbf{u} =  \mathbf{E}^H \mathbf{m} + \lambda \mathbf{R}^H \mathbf{p} 
+\]
+\end{document}
diff --git a/doc/manual/figs/math/sb.jpg b/doc/manual/figs/math/sb.jpg
new file mode 100644
index 0000000..18a4fdb
Binary files /dev/null and b/doc/manual/figs/math/sb.jpg differ
diff --git a/doc/manual/figs/math/sb.tex b/doc/manual/figs/math/sb.tex
new file mode 100644
index 0000000..11f679e
--- /dev/null
+++ b/doc/manual/figs/math/sb.tex
@@ -0,0 +1,14 @@
+% lamath.tex: Sample of LaTeX math for inclusion in DocBook
+%
+\documentclass[leqno]{article}
+\usepackage{bm} 
+\pagestyle{empty}
+\setlength{\textwidth}{6in}
+\begin{document}
+\[
+\min_{\mathbf{u}} \left| \mathbf{u} \right|_{TV} + \lambda \left\| \mathbf{E}\mathbf{u}-\mathbf{m} \right\|_2^2
+\]
+\[
+\min_{\mathbf{u}} \left| \mathbf{u} \right|_{TV} \quad \mbox{s.t.} \quad \left\| \mathbf{E}\mathbf{u}-\mathbf{m} \right\|_2^2<\sigma^2
+\]
+\end{document}
diff --git a/doc/manual/figs/python.png b/doc/manual/figs/python.png
new file mode 100644
index 0000000..ddaa8ed
Binary files /dev/null and b/doc/manual/figs/python.png differ
diff --git a/doc/manual/figs/python.svg b/doc/manual/figs/python.svg
new file mode 100644
index 0000000..3cf82a3
--- /dev/null
+++ b/doc/manual/figs/python.svg
@@ -0,0 +1,635 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="225"
+   height="540"
+   id="svg8632"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="python.svg"
+   inkscape:export-filename="/Users/hansenms/Documents/mrprogs/gadgetron/doc/manual/figs/python.png"
+   inkscape:export-xdpi="299.784"
+   inkscape:export-ydpi="299.784">
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="1"
+     inkscape:pageshadow="2"
+     inkscape:zoom="3.959798"
+     inkscape:cx="175.64027"
+     inkscape:cy="411.47029"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="2560"
+     inkscape:window-height="1305"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     units="in" />
+  <defs
+     id="defs8634">
+    <marker
+       inkscape:stockid="Arrow2Lend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow2Lend"
+       style="overflow:visible">
+      <path
+         id="path3849"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+         inkscape:connector-curvature="0" />
+    </marker>
+    <marker
+       inkscape:stockid="TriangleOutL"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="TriangleOutL-9"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path4614-0"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="scale(0.8,0.8)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow1Mend"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path3972"
+         d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow2Lend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow2Lend-6"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path3849-4"
+         style="font-size:12px;fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="TriangleOutL-9B"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="TriangleOutL-9B"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path5674"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         style="marker-start:none;stroke:#7dbbc1;stroke-width:1pt;fill:#7dbbc1;fill-rule:evenodd"
+         transform="scale(0.8,0.8)" />
+    </marker>
+    <marker
+       inkscape:stockid="TriangleOutL-9Bp"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="TriangleOutL-9Bp"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path5828"
+         d="m 5.77,0 -8.65,5 0,-10 8.65,5 z"
+         style="fill-rule:evenodd;marker-start:none;stroke:#053df0;stroke-width:1pt;fill:#053df0"
+         transform="scale(0.8,0.8)" />
+    </marker>
+  </defs>
+  <metadata
+     id="metadata8637">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(0,-512.35975)">
+    <text
+       xml:space="preserve"
+       style="font-size:3.29999995px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="-0.90061849"
+       y="609.21912"
+       id="text9939"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9941"
+         x="-0.90061849"
+         y="609.21912" /></text>
+    <g
+       id="g5829"
+       transform="translate(0,-1.2626953)">
+      <rect
+         inkscape:export-ydpi="599.78741"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         y="526.03345"
+         x="30.816908"
+         height="46.799793"
+         width="42.263618"
+         id="rect9947-1-8-41"
+         style="opacity:0.93999999;fill:#97d25d;fill-opacity:1;stroke:none" />
+      <text
+         sodipodi:linespacing="125%"
+         id="text9949-9-5-1"
+         y="533.77313"
+         x="51.774353"
+         style="font-size:4.72979879px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           id="tspan9953-3-1-9"
+           y="533.77313"
+           x="51.774353"
+           sodipodi:role="line">Python Gadget</tspan></text>
+    </g>
+    <rect
+       style="opacity:0.93999999000000001;fill:#978bea;fill-opacity:1;stroke:none"
+       id="rect9947-1-8-41-5"
+       width="132.46913"
+       height="13.464759"
+       x="47.457645"
+       y="604.66888"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741" />
+    <text
+       xml:space="preserve"
+       style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       x="91.711807"
+       y="612.70612"
+       id="text9905-5"
+       sodipodi:linespacing="125%"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-ydpi="599.78741"><tspan
+         sodipodi:role="line"
+         id="tspan9907-6"
+         x="91.711807"
+         y="612.70612">Python Interpreter</tspan></text>
+    <g
+       id="g5585"
+       transform="translate(-2.5253814,-23.233439)">
+      <rect
+         inkscape:export-ydpi="599.78741"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         y="655.14764"
+         x="31.62396"
+         height="86.195747"
+         width="71.607445"
+         id="rect9947-1-8-41-5-1"
+         style="opacity:0.93999999;fill:#97d25d;fill-opacity:1;stroke:none" />
+      <text
+         inkscape:export-ydpi="599.78741"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         sodipodi:linespacing="125%"
+         id="text9905-5-8"
+         y="662.67981"
+         x="47.997078"
+         style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           y="662.67981"
+           x="47.997078"
+           id="tspan9907-6-9"
+           sodipodi:role="line">Python Module 1</tspan></text>
+      <g
+         transform="translate(-33.587572,39.718279)"
+         id="g5455">
+        <text
+           xml:space="preserve"
+           style="font-size:4px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:American Typewriter;-inkscape-font-specification:American Typewriter"
+           x="72.225906"
+           y="641.48022"
+           id="text4597"
+           sodipodi:linespacing="125%"><tspan
+             sodipodi:role="line"
+             id="tspan4599"
+             x="72.225906"
+             y="641.48022">def set_gadget_ref:</tspan><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="646.48022"
+             id="tspan4615">     #Code for setting reference</tspan><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="651.48022"
+             id="tspan4613"> </tspan><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="656.48022"
+             id="tspan4601" /><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="661.48022"
+             id="tspan5863">def config_function:</tspan><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="666.48022"
+             id="tspan4617">    #Code for processing conf</tspan><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="671.48022"
+             id="tspan4607" /><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="676.48022"
+             id="tspan4609" /><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="681.48022"
+             id="tspan4611">def recon_function:</tspan><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="686.48022"
+             id="tspan6257" /><tspan
+             sodipodi:role="line"
+             x="72.225906"
+             y="691.48022"
+             id="tspan4619"
+             style="font-weight:bold;-inkscape-font-specification:American Typewriter Bold">       #Actual recon code</tspan></text>
+        <rect
+           style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+           id="rect5425"
+           width="65.154839"
+           height="70.409737"
+           x="68.437836"
+           y="627.33807" />
+        <text
+           xml:space="preserve"
+           style="font-size:4px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+           x="70.458138"
+           y="631.88373"
+           id="text5427"
+           sodipodi:linespacing="125%"><tspan
+             sodipodi:role="line"
+             id="tspan5429"
+             x="70.458138"
+             y="631.88373">MyPythonScript1.py</tspan></text>
+      </g>
+    </g>
+    <g
+       id="g5606"
+       transform="translate(-0.5253814,-12.374369)">
+      <rect
+         style="opacity:0.93999999;fill:#97d2ea;fill-opacity:1;stroke:none"
+         id="rect5472"
+         width="71.607445"
+         height="86.195747"
+         x="122.03262"
+         y="644.28857"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-ydpi="599.78741" />
+      <text
+         xml:space="preserve"
+         style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+         x="140.17349"
+         y="651.82074"
+         id="text5474"
+         sodipodi:linespacing="125%"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-ydpi="599.78741"><tspan
+           sodipodi:role="line"
+           id="tspan5476"
+           x="140.17349"
+           y="651.82074">Python Module 2</tspan></text>
+      <g
+         id="g5478"
+         transform="translate(56.821082,28.859139)">
+        <text
+           sodipodi:linespacing="125%"
+           id="text5480"
+           y="641.48022"
+           x="72.225906"
+           style="font-size:4px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:American Typewriter;-inkscape-font-specification:American Typewriter"
+           xml:space="preserve"><tspan
+             y="641.48022"
+             x="72.225906"
+             id="tspan5482"
+             sodipodi:role="line">def set_gadget_ref:</tspan><tspan
+             id="tspan5484"
+             y="646.48022"
+             x="72.225906"
+             sodipodi:role="line">     #Code for setting reference</tspan><tspan
+             id="tspan5486"
+             y="651.48022"
+             x="72.225906"
+             sodipodi:role="line"> </tspan><tspan
+             id="tspan5488"
+             y="656.48022"
+             x="72.225906"
+             sodipodi:role="line" /><tspan
+             id="tspan5492"
+             y="661.48022"
+             x="72.225906"
+             sodipodi:role="line">def config_function:</tspan><tspan
+             id="tspan5494"
+             y="666.48022"
+             x="72.225906"
+             sodipodi:role="line">    #Code for processing conf</tspan><tspan
+             id="tspan5496"
+             y="671.48022"
+             x="72.225906"
+             sodipodi:role="line" /><tspan
+             id="tspan5498"
+             y="676.48022"
+             x="72.225906"
+             sodipodi:role="line" /><tspan
+             id="tspan5500"
+             y="681.48022"
+             x="72.225906"
+             sodipodi:role="line">def recon_function:</tspan><tspan
+             y="686.48022"
+             x="72.225906"
+             sodipodi:role="line"
+             id="tspan6459" /><tspan
+             id="tspan5502"
+             y="691.48022"
+             x="72.225906"
+             sodipodi:role="line"
+             style="font-weight:bold;-inkscape-font-specification:American Typewriter Bold">       #Actual recon code</tspan></text>
+        <rect
+           y="627.33807"
+           x="68.437836"
+           height="70.409737"
+           width="65.154839"
+           id="rect5504"
+           style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+        <text
+           sodipodi:linespacing="125%"
+           id="text5506"
+           y="631.88373"
+           x="70.458138"
+           style="font-size:4px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+           xml:space="preserve"><tspan
+             y="631.88373"
+             x="70.458138"
+             id="tspan5508"
+             sodipodi:role="line">MyPythonScript2.py</tspan></text>
+      </g>
+    </g>
+    <rect
+       inkscape:export-ydpi="599.78741"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       y="582.69806"
+       x="46.700031"
+       height="13.464759"
+       width="132.46913"
+       id="rect5535"
+       style="opacity:0.93999999000000001;fill:#978bea;fill-opacity:1;stroke:none" />
+    <text
+       inkscape:export-ydpi="599.78741"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       sodipodi:linespacing="125%"
+       id="text5537"
+       y="590.73529"
+       x="86.279831"
+       style="font-size:4.72979832px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         y="590.73529"
+         x="86.279831"
+         id="tspan5539"
+         sodipodi:role="line">Python Communicator</tspan></text>
+    <g
+       id="g5839">
+      <rect
+         style="opacity:0.93999999;fill:#97d2ea;fill-opacity:1;stroke:none"
+         id="rect5548"
+         width="42.263618"
+         height="46.799793"
+         x="141.93369"
+         y="524.77075"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-ydpi="599.78741" />
+      <text
+         xml:space="preserve"
+         style="font-size:4.72979879px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         x="162.89113"
+         y="532.51044"
+         id="text5550"
+         sodipodi:linespacing="125%"><tspan
+           sodipodi:role="line"
+           x="162.89113"
+           y="532.51044"
+           id="tspan5552">Python Gadget</tspan></text>
+    </g>
+    <g
+       id="g5579">
+      <rect
+         style="opacity:0.93999999;fill:#97d25d;fill-opacity:1;stroke:none"
+         id="rect5557"
+         width="36.455242"
+         height="19.273136"
+         x="4.805479"
+         y="591.94592"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-ydpi="599.78741" />
+      <text
+         xml:space="preserve"
+         style="font-size:4.72979879px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         x="22.926865"
+         y="600.38965"
+         id="text5561"
+         sodipodi:linespacing="125%"><tspan
+           sodipodi:role="line"
+           x="22.926865"
+           y="600.38965"
+           id="tspan5563">Gadget</tspan><tspan
+           id="tspan5565"
+           sodipodi:role="line"
+           x="22.926865"
+           y="606.30188">Reference</tspan></text>
+    </g>
+    <g
+       id="g5573">
+      <rect
+         inkscape:export-ydpi="599.78741"
+         inkscape:export-xdpi="599.78741"
+         inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+         y="591.44086"
+         x="182.84486"
+         height="19.273136"
+         width="36.455242"
+         id="rect5559"
+         style="opacity:0.93999999000000001;fill:#97d2ea;fill-opacity:1;stroke:none" />
+      <text
+         sodipodi:linespacing="125%"
+         id="text5567"
+         y="599.88458"
+         x="200.96625"
+         style="font-size:4.72979879px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+         xml:space="preserve"><tspan
+           id="tspan5569"
+           y="599.88458"
+           x="200.96625"
+           sodipodi:role="line">Gadget</tspan><tspan
+           y="605.79681"
+           x="200.96625"
+           sodipodi:role="line"
+           id="tspan5571">Reference</tspan></text>
+    </g>
+    <rect
+       inkscape:export-ydpi="599.78741"
+       inkscape:export-xdpi="599.78741"
+       inkscape:export-filename="/Users/hansenms/Documents/Publications/WIP/Gadgetron/manuscript/figures/gadget.pdf.png"
+       y="524.77075"
+       x="86.375298"
+       height="46.799793"
+       width="42.263618"
+       id="rect5627"
+       style="opacity:0.93999999000000001;fill:#97d25d;fill-opacity:0.23555556;stroke:none" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5629"
+       y="532.51044"
+       x="107.43204"
+       style="font-size:4.72979879px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
+       xml:space="preserve"><tspan
+         id="tspan5631"
+         y="532.51044"
+         x="107.43204"
+         sodipodi:role="line">C/C++ Gadget</tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 6.8185297,546.14706 23.2335093,0"
+       id="path5633"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path5827"
+       d="m 73.23606,546.14706 12.626907,0"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 128.54191,546.14706 12.62691,0"
+       id="path5844"
+       inkscape:connector-curvature="0" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path5846"
+       d="m 184.1003,546.14706 23.23351,-0.18941"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 63.639611,571.14834 0,11.11168"
+       id="path5857"
+       inkscape:connector-curvature="0" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path5859"
+       d="m 63.639611,595.89708 0,8.08122"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 63.639611,618.12043 0,85.86298"
+       id="path5861"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 39.39595,194.6541 -23.359779,0 -0.505076,-95.333148"
+       id="path5867"
+       inkscape:connector-curvature="0"
+       transform="translate(0,512.35975)"
+       sodipodi:nodetypes="ccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 15.531095,78.865363 0,-13.51079 29.673231,0 0.252539,-6.187184"
+       id="path6061"
+       inkscape:connector-curvature="0"
+       transform="translate(0,512.35975)"
+       sodipodi:nodetypes="cccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.50000000000000000;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;stroke-miterlimit:4;stroke-dasharray:0.50000000000000000, 0.50000000000000000;stroke-dashoffset:0;marker-end:url(#Arrow1Mend)"
+       d="m 45.204326,57.147084 0,-23.233509 27.021581,0"
+       id="path6259"
+       inkscape:connector-curvature="0"
+       transform="translate(0,512.35975)" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path6453"
+       d="m 168.44294,571.40088 0,11.11168"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       d="m 168.44294,596.14962 0,8.08122"
+       id="path6455"
+       inkscape:connector-curvature="0" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path6457"
+       d="m 168.44294,618.37297 0,85.86298"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="ccc"
+       inkscape:connector-curvature="0"
+       id="path6461"
+       d="m 174.50386,706.88758 26.76903,0 0,-95.20688"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" />
+    <path
+       sodipodi:nodetypes="cccc"
+       inkscape:connector-curvature="0"
+       id="path6463"
+       d="m 201.27289,591.73018 0,-13.51079 -27.27411,0 0,-6.18718"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path6465"
+       d="m 173.99878,570.26445 0,-24.3068 8.58629,0"
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.5, 0.5;stroke-dashoffset:0;marker-end:url(#Arrow1Mend)"
+       sodipodi:nodetypes="ccc" />
+  </g>
+</svg>
diff --git a/doc/manual/figs/sense_cg.png b/doc/manual/figs/sense_cg.png
new file mode 100644
index 0000000..67e60ae
Binary files /dev/null and b/doc/manual/figs/sense_cg.png differ
diff --git a/doc/manual/figs/sense_sbc.png b/doc/manual/figs/sense_sbc.png
new file mode 100644
index 0000000..974d90c
Binary files /dev/null and b/doc/manual/figs/sense_sbc.png differ
diff --git a/doc/manual/figs/shepp.png b/doc/manual/figs/shepp.png
new file mode 100644
index 0000000..b0c5137
Binary files /dev/null and b/doc/manual/figs/shepp.png differ
diff --git a/doc/manual/figs/shepp_blurred.png b/doc/manual/figs/shepp_blurred.png
new file mode 100644
index 0000000..e12159e
Binary files /dev/null and b/doc/manual/figs/shepp_blurred.png differ
diff --git a/doc/manual/figs/shepp_deblurred_cg.png b/doc/manual/figs/shepp_deblurred_cg.png
new file mode 100644
index 0000000..9b03331
Binary files /dev/null and b/doc/manual/figs/shepp_deblurred_cg.png differ
diff --git a/doc/manual/figs/shepp_deblurred_sb.png b/doc/manual/figs/shepp_deblurred_sb.png
new file mode 100644
index 0000000..9dc4360
Binary files /dev/null and b/doc/manual/figs/shepp_deblurred_sb.png differ
diff --git a/doc/manual/figs/shepp_denoised.png b/doc/manual/figs/shepp_denoised.png
new file mode 100644
index 0000000..30b9f0b
Binary files /dev/null and b/doc/manual/figs/shepp_denoised.png differ
diff --git a/doc/manual/figs/shepp_iteration.png b/doc/manual/figs/shepp_iteration.png
new file mode 100644
index 0000000..2700719
Binary files /dev/null and b/doc/manual/figs/shepp_iteration.png differ
diff --git a/doc/manual/figs/shepp_noisy.png b/doc/manual/figs/shepp_noisy.png
new file mode 100644
index 0000000..52ccb42
Binary files /dev/null and b/doc/manual/figs/shepp_noisy.png differ
diff --git a/doc/manual/figs/simple2dft.png b/doc/manual/figs/simple2dft.png
new file mode 100644
index 0000000..b60f27f
Binary files /dev/null and b/doc/manual/figs/simple2dft.png differ
diff --git a/doc/manual/figs/simple2dft.svg b/doc/manual/figs/simple2dft.svg
new file mode 100644
index 0000000..388f772
--- /dev/null
+++ b/doc/manual/figs/simple2dft.svg
@@ -0,0 +1,355 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="744.09448819"
+   height="1052.3622047"
+   id="svg8987"
+   version="1.1"
+   inkscape:version="0.48.2 r9819"
+   sodipodi:docname="simple2dft.svg"
+   inkscape:export-filename="/home/hansenms/mrprogs/gadgetron/doc/manual/figs/simple2dft.png"
+   inkscape:export-xdpi="299.97357"
+   inkscape:export-ydpi="299.97357">
+  <defs
+     id="defs8989">
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="Arrow1Mend"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path6124"
+         d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="marker9053"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path9055"
+         d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+    </marker>
+    <marker
+       inkscape:stockid="Arrow1Mend"
+       orient="auto"
+       refY="0"
+       refX="0"
+       id="marker9057"
+       style="overflow:visible">
+      <path
+         inkscape:connector-curvature="0"
+         id="path9059"
+         d="M 0,0 5,-5 -12.5,0 5,5 0,0 z"
+         style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none"
+         transform="matrix(-0.4,0,0,-0.4,-4,0)" />
+    </marker>
+  </defs>
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="2.8"
+     inkscape:cx="456.5887"
+     inkscape:cy="530.90344"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1587"
+     inkscape:window-height="891"
+     inkscape:window-x="64"
+     inkscape:window-y="24"
+     inkscape:window-maximized="0" />
+  <metadata
+     id="metadata8992">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1">
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5207"
+       width="104.11561"
+       height="31.714287"
+       x="322.9422"
+       y="445.36478" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="340.2955"
+       y="455.65054"
+       id="text5209"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5211"
+         x="340.2955"
+         y="455.65054">AccumulatorGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="348.07178"
+       y="463.72195"
+       id="text5721"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5723"
+         x="348.07178"
+         y="463.72195"
+         style="font-size:6px">Collecting k-space</tspan></text>
+    <rect
+       y="539.96198"
+       x="322.9422"
+       height="31.714287"
+       width="104.11561"
+       id="rect5725"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text5727"
+       y="550.24768"
+       x="339.80655"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="550.24768"
+         x="339.80655"
+         id="tspan5729"
+         sodipodi:role="line">CropCombineGadget</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text5731"
+       y="560.1048"
+       x="374.85352"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="560.1048"
+         x="374.85352"
+         sodipodi:role="line"
+         id="tspan5737">Remove Readout Oversampling</tspan><tspan
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         y="567.6048"
+         x="374.85352"
+         sodipodi:role="line"
+         id="tspan9162">Combine Channels (RMS)</tspan></text>
+    <rect
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect5768"
+       width="104.11561"
+       height="31.714287"
+       x="322.9422"
+       y="587.64532" />
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="340.53766"
+       y="598.43103"
+       id="text5770"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan5772"
+         x="340.53766"
+         y="598.43103">ImageFinishGadget</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="374.85352"
+       y="611.35956"
+       id="text5774"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="374.85352"
+         y="611.35956"
+         style="font-size:6px;text-align:center;text-anchor:middle"
+         id="tspan5782">Return Images to Client</tspan></text>
+    <rect
+       y="493.46884"
+       x="322.9422"
+       height="31.714287"
+       width="104.11561"
+       id="rect5830"
+       style="fill:#97d2ea;fill-opacity:1;stroke:#dcdcdc;stroke-width:0.21841221;stroke-linecap:butt;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path4301"
+       d="m 375,478.08019 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)"
+       d="m 375,526.06243 0,12.85714"
+       id="path8376"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path8378"
+       d="m 375,573.03452 0,12.85714"
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Mend)" />
+    <text
+       sodipodi:linespacing="125%"
+       id="text9156"
+       y="511.72198"
+       x="354.58121"
+       style="font-size:7.19999981px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       xml:space="preserve"><tspan
+         y="511.72198"
+         x="354.58121"
+         id="tspan9158"
+         sodipodi:role="line">FFTGadget</tspan></text>
+    <rect
+       style="fill:none;stroke:#000000;stroke-width:0.75691628;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+       id="rect9185"
+       width="82.409531"
+       height="74.859619"
+       x="457.15237"
+       y="448.80029" />
+    <text
+       xml:space="preserve"
+       style="font-size:6px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="475.35715"
+       y="444.50507"
+       id="text9713"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan9715"
+         x="475.35715"
+         y="444.50507">k-space buffer</tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 460.71429,452.3315 77.85714,0"
+       id="path9717"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9719"
+       d="m 460.71429,457.88466 77.85714,0"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 460.71429,463.43782 77.85714,0"
+       id="path9721"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9723"
+       d="m 460.71429,468.99092 77.85714,0"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 460.71429,474.54408 77.85714,0"
+       id="path9725"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9727"
+       d="m 460.71429,480.09725 77.85714,0"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 460.71429,485.65041 77.85714,0"
+       id="path9729"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9731"
+       d="m 460.71429,491.20357 77.85714,0"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 460.71429,496.75673 77.85714,0"
+       id="path9733"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9735"
+       d="m 460.71429,502.30983 77.85714,0"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 460.71429,507.86299 77.85714,0"
+       id="path9737"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       sodipodi:nodetypes="cc"
+       inkscape:connector-curvature="0"
+       id="path9739"
+       d="m 460.71429,513.41615 77.85714,0"
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 2;stroke-dashoffset:0"
+       d="m 461.07143,518.96932 77.85714,0"
+       id="path9741"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-mid:none;marker-end:url(#marker9057)"
+       d="m 427.14286,461.6479 c 23.07443,-32.01801 38.58246,-18.63764 53.21428,0"
+       id="path9743"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-size:6px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+       x="472.85715"
+       y="563.79077"
+       id="text10121"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan10123"></tspan></text>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:none;marker-end:url(#marker9057)"
+       d="m 457.14286,485.75504 -17.5,0 0,-18.03571 -12.14286,0"
+       id="path10125"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;stroke-miterlimit:4;stroke-dasharray:1,1;stroke-dashoffset:0"
+       d="m 425,467.71933 -50,0 0,9.28571"
+       id="path10687"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccc" />
+  </g>
+</svg>
diff --git a/doc/manual/gadgetron_manual.xml b/doc/manual/gadgetron_manual.xml
new file mode 100644
index 0000000..3ce9d5d
--- /dev/null
+++ b/doc/manual/gadgetron_manual.xml
@@ -0,0 +1,6472 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<book version="5.0" xmlns="http://docbook.org/ns/docbook"
+      xmlns:xlink="http://www.w3.org/1999/xlink"
+      xmlns:xi="http://www.w3.org/2001/XInclude"
+      xmlns:ns5="http://www.w3.org/2000/svg"
+      xmlns:ns4="http://www.w3.org/1998/Math/MathML"
+      xmlns:ns3="http://www.w3.org/1999/xhtml"
+      xmlns:ns="http://docbook.org/ns/docbook">
+  <info>
+    <title>Gadgetron Users Guide</title>
+
+    <subtitle>A Medical Image Reconstruction Framework</subtitle>
+
+    <author>
+      <personname><honorific>Dr</honorific> <firstname>Michael
+      Schacht</firstname> <surname>Hansen</surname></personname>
+
+      <email>michael.hansen at nih.gov</email>
+    </author>
+
+    <address>National Heart, Lung, and Blood Institute
+National Institutes of Health, Bethesda, MD, USA</address>
+
+    <author>
+      <personname><honorific>Dr</honorific> <firstname>Thomas
+      Sangild</firstname> <surname>Sørensen</surname></personname>
+
+      <email>sangild at cs.au.dk</email>
+    </author>
+
+    <address>Department of Computer Science and Department of Clinical Medicine
+Aarhus University, Denmark</address>
+    
+    <author>
+      <personname><honorific>Dr</honorific> <firstname>Hui
+      </firstname> <surname>Xue</surname></personname>
+
+      <email>hui.xue at nih.gov</email>
+    </author>
+
+    <address>National Heart, Lung, and Blood Institute
+National Institutes of Health, Bethesda, MD, USA</address>
+
+    <author>
+      <personname><firstname>Revision</firstname>
+      <surname>1.2</surname></personname>
+
+      <email/>
+    </author>
+
+    <edition>1.2</edition>
+  </info>
+
+  <chapter>
+    <title>Introduction</title>
+
+    <sect1>
+      <title>What is the Gadgetron</title>
+
+      <para>The Gadgetron is a streaming data processing framework for medical
+      image reconstruction. It has been developed to make it easier to
+      prototype, test, and deploy new image reconstruction algorithms.</para>
+
+      <para>The framework features a number of reconstruction applications
+      that can be employed directly. Moreover, it contains a wide range of
+      toolboxes with common data structures and algorithms designed for a much
+      broader use. These toolboxes can be used within the streaming framework
+      to create new dedicated reconstruction components or used as shared
+      libraries in standalone (or third party) applications.</para>
+
+      <para>This document serves as an introduction to the Gadgetron framework
+      and provides some "getting started" examples of using it. A scientific
+      paper is also available <xref linkend="hansen12"/>.</para>
+
+      <para>Although the Gadgetron is a generic, multi-modality image
+      reconstruction framework, it was initially developed to support the work
+      of the authors in the field of advanced MRI reconstruction, specifically
+      to support work on fast image reconstruction, not only on traditional
+      CPU architectures, but also using commodity graphics hardware (GPUs).
+      Some examples that are made publicly available through the Gadgetron
+      framework include fast (re)gridding on the GPU <citation><xref
+      linkend="sorensen08"/></citation>, Cartesian parallel imaging on the GPU
+      <citation><xref linkend="hansen08"/></citation>, and non-Cartesian
+      parallel imaging on the GPU <citation><xref
+      linkend="sorensen09"/></citation>.</para>
+    </sect1>
+
+    <sect1>
+      <title>Revision History</title>
+
+    <sect2>
+        <title>Version 2.5</title>
+
+        <para>Version 2.5 contains a number of extensions and enhancements to the Gadgetron.
+            In particular, a toolbox, named Gadgetron Plus or GtPlus, is added to the package.
+            The GtPlus toolbox implements the complete reconstruction workflow for the ISMRMRD data format
+            and different parallel imaging modes (embedded, interleaved, separate etc.). Multiple
+            linear and non-linear reconstruction algorithms are implemented in this toolbox. The data
+            accumulation and reconstruction triggering schemes are extended to support on-the-fly reconstruction.
+            Another major extension is to extend the Gadgetron to support cloud computing. This feature is named
+            GtPlus Cloud.
+            A non-exhaustive list of changes can be found below:</para>
+
+        <para><itemizedlist>
+            <listitem>
+              <para>The <classname>GadgetMessageAcquisition</classname>,
+              <classname>GadgetMessageImage</classname>, etc. (previously used
+              to describe MRI raw data and images) have been replaced with the
+              corresponding classes from the ISMRMRD library.</para>
+            </listitem>
+
+            <listitem>
+              <para>There is now a Gadgetron configuration file
+              (<filename>gadgetron.xml</filename>) used to control the port
+              number of the Gadgetron when starting. That makes it easier to
+              maintain the same port for a given installation without
+              supplying it on the command line.</para>
+            </listitem>
+
+            <listitem>
+              <para>The dependency on TinyXML has been almost entirely
+              removed. We are now using class representations of headers and
+              configuration generated with CodeSynthesis XSD
+              (<uri>http://www.codesynthesis.com/products/xsd/</uri>).</para>
+            </listitem>
+
+            <listitem>
+              <para>All XML representations now have schema definitions to
+              make it easier to validate configuration files etc.</para>
+            </listitem>
+
+            <listitem>
+              <para>New toolbox functionality.</para>
+            </listitem>
+
+            <listitem>
+              <para>Various bug fixes.</para>
+            </listitem>
+          </itemizedlist></para>
+      </sect2>
+        
+      <sect2>
+        <title>Version 1.1</title>
+
+        <para>Version 1.1 contains multiple bug fixes, optimizations and some
+        structural changes. Most notably, the Gadgetron now uses the proposed
+        ISMRM Raw Data format (<uri>http://ismrmrd.sourceforge.net</uri>)
+        throughout the MRI specific Gadgets. A non-exhaustive list of changes
+        can be found below:</para>
+
+        <para><itemizedlist>
+            <listitem>
+              <para>The <classname>GadgetMessageAcquisition</classname>,
+              <classname>GadgetMessageImage</classname>, etc. (previously used
+              to describe MRI raw data and images) have been replaced with the
+              corresponding classes from the ISMRMRD library.</para>
+            </listitem>
+
+            <listitem>
+              <para>There is now a Gadgetron configuration file
+              (<filename>gadgetron.xml</filename>) used to control the port
+              number of the Gadgetron when starting. That makes it easier to
+              maintain the same port for a given installation without
+              supplying it on the command line.</para>
+            </listitem>
+
+            <listitem>
+              <para>The dependency on TinyXML has been almost entirely
+              removed. We are now using class representations of headers and
+              configuration generated with CodeSynthesis XSD
+              (<uri>http://www.codesynthesis.com/products/xsd/</uri>).</para>
+            </listitem>
+
+            <listitem>
+              <para>All XML representations now have schema definitions to
+              make it easier to validate configuration files etc.</para>
+            </listitem>
+
+            <listitem>
+              <para>New toolbox functionality.</para>
+            </listitem>
+
+            <listitem>
+              <para>Various bug fixes.</para>
+            </listitem>
+          </itemizedlist></para>
+      </sect2>
+
+      <sect2>
+        <title>Version 1.0</title>
+
+        <para>First release of the Gadgetron.</para>
+      </sect2>
+    </sect1>
+
+    <sect1>
+      <title>Obtaining Gadgetron</title>
+
+      <para>The Gadgetron is made available as a cross-platform source code
+      distribution, which compiles and has been tested to run on Linux, Mac OS
+      X, and Windows 7. Compilation instructions for these platforms are
+      provided below.</para>
+
+      <para>Generally speaking, the Gadgetron is easiest to set up on Linux since
+      all dependencies are readily available. If you want to get started
+      quickly with the Gadgetron and happen to not be using Linux, it is easy
+      to install Ubuntu (our preferred Linux distribution) in a virtual
+      machine (e.g. VirtualBox, <uri type="website"
+      xlink:href="https://www.virtualbox.org/">https://www.virtualbox.org/</uri>)
+      and follow the Linux compilation instructions below.</para>
+
+      <para>The Gadgetron is available from the project Sourceforge
+      website:</para>
+
+      <para><uri type="website"
+      xlink:href="http://sourceforge.net/projects/gadgetron">http://sourceforge.net/projects/gadgetron</uri></para>
+
+      <para>This manual is available in HTML form at:</para>
+
+      <para><uri type="website"
+      xlink:href="http://gadgetron.sourceforge.net/latest/manual/gadgetron_manual.html">http://gadgetron.sourceforge.net/latest/manual/gadgetron_manual.html</uri></para>
+
+      <para>Or in PDF form at:</para>
+
+      <para><uri type="website"
+      xlink:href="http://gadgetron.sourceforge.net/latest/manual/gadgetron_manual.pdf">http://gadgetron.sourceforge.net/latest/manual/gadgetron_manual.pdf</uri></para>
+
+      <para>API documentation (generated with Doxygen) is available
+      from:</para>
+
+      <para><uri type="website"
+      xlink:href="http://gadgetron.sourceforge.net/latest/api/">http://gadgetron.sourceforge.net/latest/api/</uri></para>
+
+      <sect2 xml:id="sect.dependencies">
+        <title>Dependencies</title>
+
+        <para>The Gadgetron depends on a number of libraries that can either
+        be downloaded for free or that may already be part of the installation
+        on your workstation. If you are working on a Linux platform you should
+        be able to install all dependencies without compiling anything. The
+        following is a list of the components that you will need. Some are
+        optional.</para>
+
+        <para>To install these components please follow the platform specific
+        installation instructions provided below (<xref
+        linkend="sect.installation"/>).</para>
+
+        <sect3 xml:id="sect.required">
+          <title>Required libraries</title>
+
+          <itemizedlist>
+            <listitem>
+              <para><emphasis>CMake</emphasis>. Available from <uri
+              type="website"
+              xlink:href="http://www.cmake.org/cmake/resources/software.html">http://www.cmake.org/cmake/resources/software.html</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>ADAPTIVE Computing Environment (ACE)</emphasis>.
+              Available from <uri type="website"
+              xlink:href="http://www.cs.wustl.edu/~schmidt/ACE.html">http://www.cs.wustl.edu/~schmidt/ACE.html</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>Boost</emphasis>. Available from <uri
+              type="website"
+              xlink:href="http://www.boost.org">http://www.boost.org</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>FFTW3</emphasis>. Available from <uri
+              type="website"
+              xlink:href="http://www.fftw.org">http://www.fftw.org</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>CodeSynthesis XSD</emphasis>. Available from
+              <uri type="website"
+              xlink:href="http://www.codesynthesis.com/products/xsd">http://www.codesynthesis.com/products/xsd</uri>.</para>
+            </listitem>
+          </itemizedlist>
+        </sect3>
+
+        <sect3 xml:id="sect.optional">
+          <title>Optional libraries</title>
+
+          <itemizedlist>
+            <listitem>
+              <para><emphasis>ISMRM Raw Data format</emphasis> (optional).
+              This is the MRI raw data format used in the streaming framework.
+              Without this library installed you will not be able to
+              reconstruct the provided MRI examples. The toolboxes can however
+              still be used. Available from <uri type="website"
+              xlink:href="http://ismrmrd.sourceforge.net">http://ismrmrd.sourceforge.net</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>HDF5</emphasis> (optional). The ISMRM Raw Data
+              Format (see above) uses the HDF5 file format for storing raw
+              data and images. Available from <uri
+              xlink:href="http://www.hdfgroup.org/HDF5">http://www.hdfgroup.org/HDF5</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>BLAS</emphasis> and <emphasis>LAPACK</emphasis>
+              (optional). Most Linux distributions come with these libraries
+              and they are included on Mac OS X as well, but the vendor
+              depends on your distribution and platform. See specific
+              instructions below for Windows. Without these libraries present
+              some gadgets and toolbox functionality are disabled.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>CUDA</emphasis> (optional). For GPU support you
+              need to install CUDA from Nvidia. You will need a CUDA driver
+              for your graphics card too. Available from <uri type="website"
+              xlink:href="http://developer.nvidia.com/cuda-downloads">http://developer.nvidia.com/cuda-downloads</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>CULA</emphasis> (optional). We use CULA for
+              LAPACK routines on the GPU. This is the only dependency which is
+              not Open Source. You can however download a free (registration
+              required) version of CULA. Available from <uri type="website"
+              xlink:href="http://www.culatools.com/downloads/dense">http://www.culatools.com/downloads/dense</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>QT4</emphasis> (optional). A few standalone
+              applications use QT for creating user interfaces. Available from
+              <uri type="website"
+              xlink:href="http://qt.nokia.com">http://qt.nokia.com</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>Doxygen</emphasis> (optional). Required if you
+              would like to build the API documentation. Available from <uri
+              type="website"
+              xlink:href="http://www.stack.nl/~dimitri/doxygen">http://www.stack.nl/~dimitri/doxygen</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>Docbook</emphasis> (optional). Required if you
+              would like to build the manual (this document). A number of
+              corresponding tools such as <application>xsltproc</application>
+              and <application>fop</application> (for the PDF version of the
+              manual) are also needed. Additionally, you need the Docbook
+              stylesheets. Available from <uri type="website"
+              xlink:href="http://docbook.sourceforge.net">http://docbook.sourceforge.net</uri>.</para>
+            </listitem>
+
+            <listitem>
+              <para><emphasis>Git</emphasis> (optional). We use
+              <application>git</application> to manage our source code
+              archives. You can use any source code management system you
+              prefer (or none at all), but if you would like to stay in line
+              with the Gadgetron team, use <application>git</application>.
+              Available from <uri type="website"
+              xlink:href="http://git-scm.com">http://git-scm.com</uri>.</para>
+            </listitem>
+          </itemizedlist>
+        </sect3>
+      </sect2>
+    </sect1>
+
+    <sect1 xml:id="sect.installation">
+      <title>Compiling and Installing Gadgetron</title>
+
+      <sect2 xml:id="sect.linuxinstall">
+        <title>Linux Installation Instructions</title>
+
+        <para>Linux is the preferred operating system to get started using the
+        Gadgetron. All of the required dependencies are included in most major
+        Linux distributions and can be installed easily and without having to
+        compile anything. In the following sections we walk you through the
+        required steps to set up a full Gadgetron installation. We assume that
+        you are starting with a freshly installed Ubuntu 12.04 available from
+        the Ubuntu website (<uri
+        xlink:href="http://www.ubuntu.com">http://www.ubuntu.com</uri>). If
+        you don't have a machine available for installing Ubuntu, you can
+        always try it out in a virtual machine using virtualization software
+        such as VirtualBox (<uri type="website"
+        xlink:href="https://www.virtualbox.org">https://www.virtualbox.org</uri>).</para>
+
+        <para>If you would like to use the GPU components included in the
+        Gadgetron and you have an Nvidia GPU available on your system, please
+        complete the CUDA/CULA installations as described in <xref
+        linkend="section.linuxgpuinstall"/>.</para>
+
+        <para>First install all dependencies for Gadgetron. The following will
+        install everything you need:</para>
+
+        <screen><prompt>user@mycomputer:~$</prompt> <userinput>sudo apt-get install doxygen cmake \
+ libqt4-dev libglew1.6-dev \
+ docbook5-xml docbook-xsl-doc-pdf \
+ docbook-xsl-doc-html docbook-xsl-ns xsltproc \
+ fop git-core libboost-dev libboost-python-dev \
+ libfftw3-dev libace-dev python-dev python-numpy \
+ freeglut3-dev libxi-dev liblapack-dev build-essential \
+ libhdf5-serial-dev h5utils hdf5-tools hdfview \
+ libboost-system-dev libboost-thread-dev xsdcxx \
+ libxerces-c-dev </userinput></screen>
+
+        <para>Next (optional but recommended) download, compile, and install
+        ISMRMRD (there are more detailed instructions on the
+        <uri>http://ismrmrd.sourceforge.net</uri> website):</para>
+
+        <screen>  git clone git://git.code.sf.net/p/ismrmrd/code ismrmrd-code
+  cd ismrmrd-code/
+  mkdir build
+  cd build
+  cmake ../
+  make
+  sudo make install</screen>
+
+        <para>The last command will install the library in
+        <filename>/usr/local/ismrmrd</filename>.</para>
+
+        <para>Now download the Gadgetron archive and compile it. If you have
+        access to a git repository, you can get the code with:</para>
+
+        <screen><userinput>  git clone git://git.code.sf.net/p/gadgetron/gadgetron gadgetron</userinput></screen>
+
+        <para>Configure and build the Gadgetron:</para>
+
+        <screen>  cd gadgetron/
+  mkdir build
+  cd build
+  cmake ../
+  make  </screen>
+
+        <para>Install (default location is
+        <filename>/usr/local/gadgetron</filename>):</para>
+
+        <screen><prompt>user@mycomputer:~/gadgetron/build$</prompt> <userinput>sudo make install</userinput> </screen>
+
+        <para>The final step is to add/modify a few environment variables in
+        your <filename>~/.bashrc</filename> file.</para>
+
+        <programlisting>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/gadgetron/lib:/usr/local/ismrmrd/lib
+export PATH=$PATH:/usr/local/gadgetron/bin:/usr/local/ismrmrd/bin
+export GADGETRON_HOME=/usr/local/gadgetron     </programlisting>
+
+        <para>Rename the example configuration file
+        <filename>GADGETRON_HOME/config/gadgetron.xml.example</filename> to
+        <filename>GADGETRON_HOME/config/gadgetron.xml</filename></para>
+
+        <para>You are now set up to run a simple example reconstruction as
+        outlined in <xref linkend="sect.simpleexample"/>.</para>
+
+        <sect3 xml:id="section.linuxgpuinstall">
+          <title>Installing GPU components (CUDA and CULA) on Linux</title>
+
+          <para>First install the Nvidia driver. The Ubuntu distribution comes
+          with a driver that will work with CUDA in some instances, but we
+          recommend that you install the latest developer driver from the
+          Nvidia website, e.g.:</para>
+
+          <para>Download (to the current directory)
+          <filename>devdriver_4.2_linux_64_295.41.run</filename> from <uri
+          xlink:href="http://developer.nvidia.com/cuda/cuda-downloads">http://developer.nvidia.com/cuda/cuda-downloads</uri>
+          and install the driver:</para>
+
+          <para><screen><userinput>sudo sh ./devdriver_4.2_linux_64_295.41.run</userinput></screen></para>
+
+          <para>The process of getting this driver installed may vary from
+          installation to installation. Specifically, you may need to remove
+          any existing Nvidia driver before installing and you will have to
+          shut down the display manager before installing.</para>
+
+          <para>The display manager can be shut down with:</para>
+
+          <para><screen><userinput>sudo service lightdm stop
+	  </userinput></screen></para>
+
+          <para><remark>Important notice</remark>: Unfortunately we have
+          experienced on several Ubuntu installations that the machine hangs
+          at the splash screen after installation of the Nvidia graphics
+          driver. If you experience this problem, or if you just want to be on
+          the safe side before rebooting, open a terminal (boot in recovery
+          mode) and edit <filename>/etc/default/grub</filename>. Locate the
+          line defining <varname>GRUB_CMDLINE_LINUX_DEFAULT</varname> and
+          change <envar>splash</envar> to <envar>nosplash</envar> (or add
+          <envar>nosplash</envar> if <envar>splash</envar> is not present).
+          Furthermore add <envar>nomodeset</envar>. E.g.
+          <code>GRUB_CMDLINE_LINUX_DEFAULT="quiet nosplash nomodeset"</code>.
+          Finally update the boot manager with the new settings:
+          <userinput>sudo update-grub</userinput></para>
+
+          <para>Next we need to install gcc 4.4 since Ubuntu comes
+          preconfigured with gcc 4.6, which is not compatible with the current
+          versions of the CUDA nvcc compiler.</para>
+
+          <screen><userinput>sudo apt-get install gcc-4.4 g++-4.4 build-essential</userinput></screen>
+
+          <para>Set up alternative systems to allow easy switching between the
+          two versions of gcc/g++</para>
+
+          <screen><userinput>sudo update-alternatives --install /usr/bin/gcc gcc \
+ /usr/bin/gcc-4.6 40 --slave /usr/bin/g++ g++ /usr/bin/g++-4.6
+
+sudo update-alternatives --install /usr/bin/gcc \
+ gcc /usr/bin/gcc-4.4 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.4</userinput></screen>
+
+          <para>Check your gcc compiler (should now be version 4.4.7):</para>
+
+          <screen><userinput>gcc -v</userinput></screen>
+
+          <para>When you want to switch between the two compiler
+          versions:</para>
+
+          <screen><userinput>sudo update-alternatives --config gcc</userinput></screen>
+
+          <para>The final step is to actually install CUDA and CULA. Download
+          the following files (for CUDA release 4.2):</para>
+
+          <itemizedlist>
+            <listitem>
+              <para><filename>cudatoolkit_4.2.9_linux_64_ubuntu11.04.run</filename>
+              from <uri type="website"
+              xlink:href="http://developer.nvidia.com/cuda/cuda-downloads">http://developer.nvidia.com/cuda/cuda-downloads</uri></para>
+            </listitem>
+
+            <listitem>
+              <para>cula_dense_free_R15-linux64.run from <uri type="website"
+              xlink:href="http://www.culatools.com/downloads/dense">http://www.culatools.com/downloads/dense</uri>
+              (free registration required)</para>
+            </listitem>
+          </itemizedlist>
+
+          <para>Go to the folder where the files were downloaded and
+          type:</para>
+
+          <screen><prompt>user@mycomputer:</prompt><userinput>sudo sh ./cudatoolkit_4.2.9_linux_64_ubuntu11.04.run
+sudo sh ./cula_dense_free_R15-linux64.run
+	  </userinput></screen>
+
+          <para>Follow the instructions. When you are done with the
+          installation you may want to add the following to your
+          <filename>~/.bashrc</filename> file.</para>
+
+          <programlisting>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib
+export CULA_ROOT="/usr/local/cula"
+export CULA_INC_PATH="$CULA_ROOT/include"
+export CULA_BIN_PATH_32="$CULA_ROOT/bin"
+export CULA_BIN_PATH_64="$CULA_ROOT/bin64"
+export CULA_LIB_PATH_32="$CULA_ROOT/lib"
+export CULA_LIB_PATH_64="$CULA_ROOT/lib64"
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CULA_LIB_PATH_64    </programlisting>
+
+          <para>You are now ready to compile and run CUDA (and CULA)
+          applications. You may want to download the CUDA SDK from Nvidia to
+          validate your installation but this is not required.</para>
+        </sect3>
+      </sect2>
+
+      <sect2>
+        <title>Mac OS X Installation Instructions</title>
+
+        <para>The following instructions assume that you are starting on a Mac
+        with OS X 10.6.8 (Snow Leopard) installed. Additionally it assumes
+        that you have Xcode (3.2.6) installed. If you have upgraded to Lion or
+        are on an older release, you should still be able to make it all
+        compile, but you may have to make some adjustments.</para>
+
+        <para>We use MacPorts (<uri
+        xlink:href="http://www.macports.org/">http://www.macports.org/</uri>)
+        to install the required dependencies. You may use a different package
+        management system or prefer to install packages manually. In that
+        case, please look at the list of dependencies (<xref
+        linkend="sect.dependencies"/>) and install the required dependencies
+        for the components you would like to use.</para>
+
+        <para>MacPorts is not the fastest way to install packages as they are
+        compiled locally. We use this method here nonetheless to make it
+        easier to follow the instructions. Please be patient when running the
+        <command>port</command> commands.</para>
+
+        <itemizedlist>
+          <listitem>
+            <para>Install MacPorts.</para>
+
+            <para>Download <filename>MacPorts-2.1.2.pkg</filename> from <uri
+            xlink:href="http://www.macports.org/">http://www.macports.org/</uri>.</para>
+
+            <para>Run <command><userinput>sudo port -v
+            selfupdate</userinput></command> to make sure you are up to
+            date.</para>
+          </listitem>
+
+          <listitem>
+            <para>Get your Python installation up to date. Mac OS X ships with
+            Python installed, but it is not a complete distribution. You need
+            to update it if you would like to do Python development with the
+            Gadgetron. If you already have <package>numpy</package> and
+            <package>SciPy</package> installed, you may be able to skip this
+            step. If you do not wish to use Python, you can also skip this
+            step.</para>
+
+            <screen><userinput>sudo port install python27 py27-numpy py27-scipy py27-libxml2</userinput></screen>
+
+            <para>This should install Python 2.7. Now select Python 2.7 as
+            the active Python installation:</para>
+
+            <screen><userinput>sudo port select python python27</userinput></screen>
+
+            <para>To make sure the build system finds the right version of
+            Python we need to edit a couple of symbolic links manually:</para>
+
+            <screen><userinput>cd /System/Library/Frameworks/Python.framework/Versions
+sudo ln -s /opt/local/Library/Frameworks/Python.framework/Versions/2.7
+sudo rm Current
+sudo ln -s 2.7 Current</userinput></screen>
+          </listitem>
+
+          <listitem>
+            <para>Install Boost. Boost gets special treatment here. Depending
+            on whether you would like to do Python development, you need to
+            install Boost with or without boost_python. If you would like
+            Python:</para>
+
+            <screen><userinput>sudo port install boost +python27</userinput></screen>
+
+            <para>If you don't need Python support:</para>
+
+            <screen><userinput>sudo port install boost</userinput></screen>
+          </listitem>
+
+          <listitem>
+            <para>Now we can install the rest of the packages:</para>
+
+            <screen><userinput>sudo port install git-core cmake libACE \
+fftw-3-single fftw-3 qt4-mac-devel hdf5-18 \
+libxml2 xercesc3
+	    </userinput></screen>
+
+            <para>This may take quite a long time (hours).</para>
+          </listitem>
+
+          <listitem>
+            <para>Install CodeSynthesis XSD</para>
+
+            <para><screen><userinput>wget http://www.codesynthesis.com/download/xsd/3.3/macosx/i686/xsd-3.3.0-i686-macosx.tar.bz2
+tar -xjf xsd-3.3.0-i686-macosx.tar.bz2
+cd xsd-3.3.0-i686-macosx
+sudo cp bin/xsd /usr/local/bin/
+sudo cp -r libxsd/xsd /usr/local/include/</userinput></screen></para>
+          </listitem>
+
+          <listitem>
+            <para>Download, compile, and install ISMRMRD. Detailed
+            instructions can be found at <uri
+            xlink:href="http://ismrmrd.sourceforge.net">http://ismrmrd.sourceforge.net</uri>.</para>
+
+            <para><screen><userinput>git clone git://git.code.sf.net/p/ismrmrd/code ismrmrd-code
+cd ismrmrd-code/
+mkdir build
+cd build/
+cmake ../
+make
+sudo make install</userinput></screen>Last command will install the library in
+            <filename>/usr/local/ismrmrd</filename>.</para>
+
+            <para>Make sure that <filename>/usr/local/ismrmrd/lib</filename>
+            is in your <varname>DYLD_LIBRARY_PATH</varname> environment
+            variable (see below).</para>
+          </listitem>
+
+          <listitem>
+            <para>To visualize HDF5 files you may also want to install HDFView
+            from <uri
+            xlink:href="http://www.hdfgroup.org/ftp/HDF5/hdf-java/hdfview/hdfview_install_macosx_intel64.zip">http://www.hdfgroup.org/ftp/HDF5/hdf-java/hdfview/hdfview_install_macosx_intel64.zip</uri></para>
+          </listitem>
+
+          <listitem>
+            <para>Install CUDA and CULA. If you would like to use the GPU
+            components, you need to install the following:</para>
+
+            <itemizedlist>
+              <listitem>
+                <para>The Nvidia development driver
+                (<filename>devdriver_4.2.10_macos.dmg</filename>) from <uri
+                xlink:href="http://developer.nvidia.com/cuda/cuda-downloads">http://developer.nvidia.com/cuda/cuda-downloads</uri>.</para>
+              </listitem>
+
+              <listitem>
+                <para>The CUDA Toolkit
+                (<filename>cudatoolkit_4.2.9_macos.pkg</filename>) from <uri
+                xlink:href="http://developer.nvidia.com/cuda/cuda-downloads">http://developer.nvidia.com/cuda/cuda-downloads</uri>.</para>
+              </listitem>
+
+              <listitem>
+                <para>The CULA Dense Libraries
+                (<filename>cula_dense_free_R15-osx.dmg</filename>) from <uri
+                xlink:href="http://www.culatools.com/downloads/dense">http://www.culatools.com/downloads/dense</uri>.</para>
+              </listitem>
+            </itemizedlist>
+          </listitem>
+
+          <listitem>
+            <para>Compiling the Gadgetron:</para>
+
+            <screen><prompt>$</prompt> <userinput>cd gadgetron</userinput>
+$ <userinput>mkdir build</userinput>
+$ <userinput>cd build</userinput>
+$ <userinput>make</userinput>
+$ <userinput>sudo make install</userinput></screen>
+
+            <para>The long path for the <package>numpy</package> header files
+            is only needed if you want Python support. You can avoid this by
+            creating a symbolic link:</para>
+
+            <screen>$ <userinput>cd /opt/local/Library/Frameworks/Python.framework</userinput>
+$ <userinput>cd Versions/2.7/include/python2.7</userinput>
+$ <userinput>sudo ln -s ../../lib/python2.7/site-packages/numpy/core/include/numpy</userinput></screen>
+
+            <para>After creating this link you should be able to compile with
+            the following:</para>
+
+            <screen><prompt>$</prompt> <userinput>cd gadgetron</userinput>
+$ <userinput>mkdir build</userinput>
+$ <userinput>cd build</userinput>
+$ <userinput>cmake ../</userinput>
+$ <userinput>make</userinput>
+$ <userinput>sudo make install</userinput></screen>
+          </listitem>
+
+          <listitem>
+            <para>Set environment variables:</para>
+
+            <screen>$ <userinput>export GADGETRON_HOME=/usr/local/gadgetron</userinput>
+$ <userinput>export PATH=$PATH:/usr/local/gadgetron/bin:/usr/local/ismrmrd/bin</userinput>
+$ <userinput>export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:\
+/usr/local/gadgetron/lib:/usr/local/ismrmrd/lib</userinput></screen>
+
+            <para>You may wish to add these lines to
+            <filename>~/.bash_profile</filename>. You may also want to add
+            paths to CUDA and CULA libraries if you are using those:</para>
+
+            <screen>$ <userinput>export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:/usr/local/cula/lib64</userinput>
+$ <userinput>export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:/usr/local/cuda/lib</userinput></screen>
+          </listitem>
+
+          <listitem>
+            <para>After compiling and installing, please rename the file
+            <filename>GADGETRON_HOME/config/gadgetron.xml.example</filename>
+            to <filename>GADGETRON_HOME/config/gadgetron.xml</filename></para>
+          </listitem>
+
+          <listitem>
+            <para>Test your Gadgetron by following the instructions in <xref
+            linkend="sect.simpleexample"/>.</para>
+          </listitem>
+        </itemizedlist>
+      </sect2>
+
+      <sect2>
+        <title>Windows Installation Instructions</title>
+
+        <para>It is probably appropriate to start this section with a warning:
+        Windows is not the easiest environment in which to work with the
+        Gadgetron. As indicated in <xref linkend="sect.dependencies"/>, the
+        Gadgetron relies on multiple external libraries. Many of those
+        libraries are not available as easy install packages and must be
+        compiled separately. If you are uncomfortable setting up development
+        tools on Windows, or if you are just looking for a fast and easy way
+        to get started with the Gadgetron, we recommend installing on Ubuntu
+        Linux - possibly using a virtual machine inside Windows (see <xref
+        linkend="sect.linuxinstall"/>).</para>
+
+        <para>The following is a list of steps we have used to install the
+        Gadgetron on a clean Windows 7 (64-bit) machine. It has also been
+        tested successfully on a 32 bit machine (but in this case you choose
+        32 bit packages/configuration where appropriate).</para>
+
+        <para>The Gadgetron distribution also includes a Windows Powershell
+        Script in
+        <uri>doc/windows_installation/GadgetronWindowsInstallation.ps1</uri>,
+        which describes the command line steps for installing the
+        dependencies. You cannot complete the installation by simply running
+        the script. Most likely your Windows machine will not allow you to run
+        the script directly without changing security settings and the
+        download of some of the dependencies cannot be automated since you
+        have to log in (or provide an email address) when downloading. The
+        script can serve as a guide and we recommend (if you would like to use
+        the script) to open it in the Powershell ISE and execute it
+        line-by-line (or section by section). The general installation steps
+        are written out here too; you may need to make some adjustments for
+        your particular setup.</para>
+
+        <itemizedlist>
+          <listitem>
+            <para>Install Visual Studio 2010 (with Service Pack 1)</para>
+          </listitem>
+
+          <listitem>
+            <para>Install CUDA/CULA (optional, but required for GPU
+            support).</para>
+
+            <para>Download the CUDA drivers/toolkit from</para>
+
+            <para><uri
+            xlink:href="http://developer.nvidia.com/cuda/cuda-downloads">http://developer.nvidia.com/cuda/cuda-downloads</uri>.</para>
+
+            <para>Install Nvidia Developer Driver (Version 301.32)</para>
+
+            <para>Install Nvidia Toolkit (4.2)</para>
+
+            <para>Install gpucomputingsdk</para>
+
+            <para>Install <filename>cula_dense_free_R15-win64.exe</filename>
+            from</para>
+
+            <para><uri
+            xlink:href="http://www.culatools.com/downloads/dense">http://www.culatools.com/downloads/dense</uri>.</para>
+
+            <para>Assuming CULA was installed in <filename>C:\Program
+            Files\CULA\R15</filename>, add</para>
+
+            <para><filename>C:\Program Files\CULA\R15\bin64</filename> to your
+            <varname>PATH</varname> environment variable.</para>
+          </listitem>
+
+          <listitem>
+            <para>Create a folder for external libraries, say
+            <filename>C:\Libraries</filename>.</para>
+          </listitem>
+
+          <listitem>
+            <para>Install FFTW3 (<uri
+            xlink:href="http://www.fftw.org/install/windows.html">http://www.fftw.org/install/windows.html</uri>)</para>
+
+            <para>Copy FFTW3 binaries to
+            <filename>C:\Libraries\FFTW3</filename></para>
+
+            <para>Create *.lib files, on the command line type:</para>
+
+            <programlisting>c:\Libraries\FFTW3>lib /machine:x64 /def:libfftw3f-3.def
+c:\Libraries\FFTW3>lib /machine:x64 /def:libfftw3-3.def
+c:\Libraries\FFTW3>lib /machine:x64 /def:libfftw3l-3.def
+</programlisting>
+
+            <para>Add <filename>C:\Libraries\FFTW3</filename> to your
+            <varname>PATH</varname> environment variable.</para>
+
+            <para>On 32 bit Windows remember to remove the /machine:x64
+            argument, the default is 32 bit.</para>
+          </listitem>
+
+          <listitem>
+            <para>Install ACE (<uri
+            xlink:href="http://download.dre.vanderbilt.edu/">http://download.dre.vanderbilt.edu/</uri>)</para>
+
+            <para>Unpack ACE into C:\Libraries\ACE-6.1.0\ACE_wrappers</para>
+
+            <para>Add <filename>config.h</filename> in
+            <filename>ACE_ROOT/ace/</filename> with the following
+            content:</para>
+
+            <programlisting>//We are on Windows
+#include "ace/config-win32.h" 
+
+//This ensures that INLINE settings 
+//do not vary between Debug and Release modes
+#define ACE_NO_INLINE </programlisting>
+
+            <para>Open the VS 2010 project in the source code archive</para>
+
+            <para>Set build type to Release/x64</para>
+
+            <para>Build (this takes a while)</para>
+
+            <para>Add to <varname>PATH</varname> environment variable:
+            <filename>C:\Libraries\ACE-6.1.0\ACE_wrappers\lib</filename></para>
+          </listitem>
+
+          <listitem>
+            <para>Install Python (optional).</para>
+
+            <para>Regrettably, the off-the-shelf Python header files cannot be
+            compiled in debug mode on Windows. This means the Gadgetron
+            framework can only be compiled in release mode if you enable the
+            Python components.</para>
+
+            <para>Install python-2.7.3.amd64 (<uri
+            xlink:href="http://www.python.org">http://www.python.org</uri>)</para>
+
+            <para>Add install folder (e.g. <filename>C:\Python27</filename>)
+            to PATH environment variable</para>
+
+            <para>Add <varname>PYTHON_ROOT</varname> environment
+            variable</para>
+
+            <para>From <uri
+            xlink:href="http://www.lfd.uci.edu/~gohlke/pythonlibs/">http://www.lfd.uci.edu/~gohlke/pythonlibs/</uri>
+            download and install the following (+ additional libraries that
+            you may need for your Python development):</para>
+
+            <para><itemizedlist>
+                <listitem>
+                  <para><filename>numpy-MKL-1.6.2.win-amd64-py2.7</filename></para>
+                </listitem>
+
+                <listitem>
+                  <para><filename>scipy-0.10.1.win-amd64-py2.7</filename></para>
+                </listitem>
+
+                <listitem>
+                  <para><filename>libxml2-python-2.7.8.win-amd64-py2.7</filename></para>
+                </listitem>
+              </itemizedlist></para>
+          </listitem>
+
+          <listitem>
+            <para>Install ACML (BLAS and LAPACK)</para>
+
+            <para>Download <filename>acml4.4.0-win64.exe</filename> from: <uri
+            xlink:href="http://developer.amd.com/downloads/acml4.4.0-win64.exe">http://developer.amd.com/downloads/acml4.4.0-win64.exe</uri></para>
+
+            <para>Install Library in say
+            <filename>C:\Libraries\acml4.4.0</filename></para>
+
+            <para>Add
+            <filename>C:\Libraries\acml4.4.0\win64\lib;C:\Libraries\acml4.4.0\win64_mp\lib</filename>
+            to your <varname>PATH</varname> environment variable.</para>
+
+            <para>Notice. Newer versions of the ACML-library are available
+            (version 5.2.0 at the time of preparing this manual) - however,
+            these libraries are distributed without required dependencies and
+            will not work out of the box. We recommend sticking to the earlier
+            version 4.4.0 until these issues have been resolved.</para>
+          </listitem>
+
+          <listitem>
+            <para>Install the newest Boost release (<uri
+            xlink:href="http://www.boost.org">http://www.boost.org</uri>)</para>
+
+            <para>We recommend using the precompiled binaries from BoostPro
+            (e.g. <uri
+            xlink:href="http://boostpro.com/download/x64/boost_1_51_setup.exe">http://boostpro.com/download/x64/boost_1_51_setup.exe</uri>)</para>
+
+            <para>Just install everything, you might need other components
+            later.</para>
+          </listitem>
+
+          <listitem>
+            <para>Install <application>git</application> (if you are using
+            source code management):</para>
+
+            <para>Run the newest installation package named something like
+            Git-*-preview*.exe from: <uri
+            xlink:href="http://code.google.com/p/msysgit/">http://code.google.com/p/msysgit/</uri></para>
+
+            <para>Use run in git bash only option</para>
+
+            <para>Use the "Checkout Windows-style, commit Unix-style line endings" option</para>
+          </listitem>
+
+          <listitem>
+            <para>Install CMake (<uri
+            xlink:href="http://www.cmake.org/cmake/resources/software.html">http://www.cmake.org/cmake/resources/software.html</uri>)</para>
+
+            <para>Install the latest release (e.g. <uri
+            xlink:href="http://www.cmake.org/files/v2.8/cmake-2.8.9-win32-x86.exe">http://www.cmake.org/files/v2.8/cmake-2.8.9-win32-x86.exe</uri>)</para>
+          </listitem>
+
+          <listitem>
+            <para>Install <application>HDF5</application>:</para>
+
+            <para>You will need the HDFView application to view data files
+            used by the Gadgetron. It can be downloaded from <uri
+            xlink:href="http://www.hdfgroup.org/HDF5/">http://www.hdfgroup.org/HDF5/</uri>;
+            install HDFView using
+            <filename>hdfview_install_win64.exe</filename>.</para>
+
+            <para>The precompiled binaries (e.g. <uri
+            xlink:href="http://www.hdfgroup.org/ftp/HDF5/current/bin/windows/HDF5189-win64-vs10-shared.zip">http://www.hdfgroup.org/ftp/HDF5/current/bin/windows/HDF5189-win64-vs10-shared.zip</uri>)
+            should work fine with the Gadgetron. Remember to add the path
+            (e.g. <filename>C:\Program Files\HDF
+            Group\HDF5\1.8.9\bin</filename>) to the HDF libraries to your
+            <varname>PATH</varname> environment variable.</para>
+          </listitem>
+
+          <listitem>
+            <para>Install CodeSynthesis XSD (<uri
+            xlink:href="http://www.codesynthesis.com/download/xsd/3.3/windows/i686/xsd-3.3.msi">http://www.codesynthesis.com/download/xsd/3.3/windows/i686/xsd-3.3.msi</uri>).</para>
+
+            <para>Remember to add the path to the XSD binaries to your
+            <varname>PATH</varname> environment variable. E.g.
+            <filename>C:\Program Files (x86)\CodeSynthesis XSD
+            3.3\bin\</filename> and <filename>C:\Program Files
+            (x86)\CodeSynthesis XSD 3.3\bin64\.</filename></para>
+          </listitem>
+
+          <listitem>
+            <para>Download, compile, and install the ISMRM Raw Data format.
+            Detailed instructions are available at (<uri
+            xlink:href="http://ismrmrd.sourceforge.net">http://ismrmrd.sourceforge.net</uri>).</para>
+
+            <para>From a git bash shell:</para>
+
+            <para><screen>git clone git://git.code.sf.net/p/ismrmrd/code ismrmrd-code
+cd ismrmrd-code/
+mkdir build
+cd build/
+cmake-gui.exe
+</screen>Last command will open CMake's graphical user interface. Hit the
+            configure button and deal with the dependencies that CMake is
+            unable to find. Hit configure again and repeat the process until
+            CMake has enough information to configure. Once the configuration
+            is complete, you can hit generate to generate a Visual Studio
+            project, which you can open and use to build ISMRMRD.</para>
+          </listitem>
+
+          <listitem>
+            <para>Download and unpack Gadgetron source code.</para>
+          </listitem>
+
+          <listitem>
+            <para>Create Visual Studio project (your process may vary):</para>
+
+            <para>Start <application>cmake-gui</application></para>
+
+            <para>Select source (<envar>$GADGETRON_HOME</envar>) and target
+            directories
+            (<envar>$GADGETRON_HOME</envar><filename>/build</filename>)</para>
+
+            <para>Hit configure (first time) -- "ok" the dialogue box.</para>
+
+            <para>Add PATH variable BOOST_ROOT to point to BOOST folder (use
+            GUI button "+Add Entry" to do this)</para>
+
+            <para>Hit configure (again)</para>
+
+            <para>Specify location of FFTW and FFTWf libraries</para>
+
+            <para>Hit configure (again)</para>
+
+            <para>Specify the locations of GLEW and GLUT:</para>
+
+            <para>Header files in <filename>C:\ProgramData\NVIDIA
+            Corporation\NVIDIA GPU COMPUTING SDK
+            4.1\shared\inc</filename></para>
+
+            <para>Library files in <filename>C:\ProgramData\NVIDIA
+            Corporation\NVIDIA GPU COMPUTING SDK 4.1\lib\x64</filename></para>
+
+            <para>Specify location of CULA (include path and core/lapack
+            library filepaths)</para>
+
+            <para>Specify location of ACE include directory</para>
+
+            <para>Hit configure (again)</para>
+
+            <para>Specify <varname>NUMPY_INCLUDE_DIRS</varname> =
+            <filename>C:/Python27/Lib/site-packages/numpy/core/include</filename></para>
+
+            <para>Hit configure (again)</para>
+
+            <para>Specify the following <envar>CMAKE</envar> FILEPATH
+            variables:</para>
+
+            <programlisting>
+BLAS_acml_LIBRARY= \
+  C:/Libraries/acml4.4.0/win64/lib/libacml_dll.lib
+
+BLAS_acml_mp_LIBRARY= \
+  C:/Libraries/acml4.4.0/win64_mp/lib/libacml_mp_dll.lib
+
+</programlisting>
+
+            <para>Hit configure</para>
+
+            <para>Make sure that the <envar>HDF5_C_LIBRARY</envar> and
+            <envar>HDF5_CXX_LIBRARY</envar> FILEPATH variables are set
+            correctly (we have observed that they might be incorrectly set to
+            point to the dll instead of the lib files by default).</para>
+
+            <para>Hit configure</para>
+
+            <para>Hit generate</para>
+          </listitem>
+
+          <listitem>
+            <para>You should now have a Visual Studio project that you can
+            open and build (try Release/x64 mode and try the install target).
+            If you are lacking sufficient write permission to install in the
+            default location, run Visual Studio as Administrator or change
+            <envar>CMAKE_INSTALL_PREFIX</envar> to a folder to which you have
+            write permissions. Notice that /gadgetron is automatically
+            appended to the path you specify.</para>
+          </listitem>
+
+          <listitem>
+            <para>After compiling and installing, please rename the file
+            <filename>GADGETRON_HOME/config/gadgetron.xml.example</filename>
+            to <filename>GADGETRON_HOME/config/gadgetron.xml</filename></para>
+          </listitem>
+        </itemizedlist>
+
+        <para>Before attempting to run any reconstructions, please set the
+        environment variable <varname>GADGETRON_HOME</varname> to point to the
+        installation folder of your Gadgetron installation and make sure that
+        the paths of all dependencies are in your <varname>PATH</varname>
+        environment variable.</para>
+
+        <para>You now have a working installation of the Gadgetron in Windows.
+        Follow the instructions below to run a simple reconstruction example
+        (<xref linkend="sect.simpleexample"/>).</para>
+      </sect2>
+    </sect1>
+
+    <sect1 xml:id="sect.simpleexample">
+      <title>Hello Gadgetron: Your First Image Reconstruction</title>
+
+      <para>Some basic sample datasets are available from the Sourceforge
+      website:</para>
+
+      <para><uri type="website"
+      xlink:href="https://sourceforge.net/projects/gadgetron/files/testdata/">https://sourceforge.net/projects/gadgetron/files/testdata/</uri></para>
+
+      <para>You will generally encounter two types of data in this manual: a)
+      Simple array format described in <xref linkend="simplearrayfiles"/> and
+      b) ISMRMRD HDF5 files which are described in more detail at <uri
+      xlink:href="http://ismrmrd.sourceforge.net">http://ismrmrd.sourceforge.net</uri>.
+      It is beyond the scope of this manual to explain the HDF5 file format,
+      but we have added a small introductory section in the appendix (<xref
+      linkend="section.hdf5"/>).</para>
+
+      <para>Download the file <filename>simple_gre.h5</filename> from the
+      website (on Linux simply type):</para>
+
+      <screen>wget http://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/simple_gre.h5</screen>
+
+      <para>Open two terminal windows to observe both client and Gadgetron
+      communication output. In the Gadgetron terminal window simply
+      type:</para>
+
+      <screen><prompt>user@mycomputer:~/temp/gadgetron_out$</prompt> <userinput>gadgetron</userinput>  </screen>
+
+      <para>In the client window (in the folder where you just downloaded the
+      data) type:</para>
+
+      <screen><prompt>user@mycomputer:~/temp/test_data$</prompt> <userinput><command>mriclient \
+    -d simple_gre.h5 \
+    -c default.xml</command></userinput>
+</screen>
+
+      <para>You should now see some logging information both in the Gadgetron
+      window and in the client window. Specifically, you should see that a
+      connection is being made and when the reconstruction is done the client
+      should shut down:</para>
+
+      <screen><prompt>user@mycomputer:~/temp/test_data$</prompt> <userinput>mriclient \
+    -d simple_gre.h5 \
+    -c default.xml</userinput>
+
+Gadgetron MRI Data Sender
+  -- host            :      localhost
+  -- port            :      9002
+  -- hdf5 file  in   :      gadgetron_testdata.h5
+  -- hdf5 group in   :      simple_gre
+  -- conf            :      default.xml
+  -- loop            :      1
+  -- hdf5 file out   :      ./out.h5
+  -- hdf5 group out  :      2012-05-11 12:52:14
+(31540|140170355443520) Connection from 127.0.0.1:9002
+31540, 81, GadgetronConnector, Close Message received
+(31540|140170283570944) Handling close...
+(31540|140170283570944) svc done...
+(31540|140170283570944) Handling close...
+</screen>
+
+      <para>The images are saved in the folder in which you started the
+      <application>mriclient</application>. The client appends the result to
+      an HDF5 file called <filename>out.h5</filename> (if no other file name
+      is specified). A group is created with the current time and date and the
+      images are stored in that group. If you run multiple reconstructions one
+      after another, the results will be added to the same file, but a new
+      group is created for each run. That makes it easy to compare results
+      from different reconstructions. The images are stored in a single
+      precision format as specified by the <filename>default.xml</filename>
+      configuration file. Please see <xref linkend="section.hdf5"/> for
+      details on how to read the output file. Briefly you could read and
+      display the data in Matlab with:</para>
+
+      <programlisting>images = h5read('out.h5','/<INSERT CORRECT DATE HERE>/image_0.img');
+imagesc(images(:,:,1,1));colormap(gray);</programlisting>
+    </sect1>
+  </chapter>
+
+  <chapter>
+    <title>Framework Overview</title>
+
+    <sect1>
+      <title>Gadgetron Streaming Architecture</title>
+
+      <para>The Gadgetron consists of a streaming processing architecture and
+      a set of toolboxes. The toolboxes are used within the streaming
+      components but come as individual shared libraries and can thus also be
+      used in standalone applications. The architecture is outlined in <xref
+      linkend="fig.gadgetron.architecture"/>.</para>
+
+      <figure xml:id="fig.gadgetron.architecture">
+        <title>Gadgetron Architecture</title>
+
+        <mediaobject>
+          <imageobject role="html">
+            <imagedata align="left" fileref="figs/architecture.png"
+                       format="PNG" width="10in"/>
+          </imageobject>
+
+          <imageobject role="fo">
+            <imagedata align="left" fileref="figs/architecture.png"
+                       format="PNG" width="5in"/>
+          </imageobject>
+
+          <textobject>
+            <phrase>Gadgetron Architecture</phrase>
+          </textobject>
+        </mediaobject>
+      </figure>
+
+      <para>The Gadgetron receives connections from clients through a TCP/IP
+      connection. A client can be any application from which you can open a
+      TCP/IP socket and send data. Once a connection to a client has been
+      established (see <xref linkend="sect.communicationprotocol"/>), the
+      Gadgetron will read data from the socket and pass it on down a chain of
+      processing steps. The responsibility of reading and writing packages on
+      the socket is dispatched to a set of Readers and Writers (see <xref
+      linkend="sect.readerswriters"/>). Each step in the processing chain is
+      implemented in a module or Gadget (see <xref linkend="sect.gadgets"/>).
+      A reconstruction process is defined by defining a chain of Gadgets. The
+      assembly of Gadgets is done dynamically at run-time (see <xref
+      linkend="sect.streamconfiguration"/>).</para>
+
+      <sect2 xml:id="sect.gadgets">
+        <title>Gadgets</title>
+
+        <para>A Gadget is the functional unit of the Gadgetron. You can think
+        of the Gadget as a device with an input and output. Data passes
+        through the device and is modified and/or transformed between input
+        and output. By wiring multiple Gadgets together you create a
+        reconstruction program. A schematic outline of a Gadget is seen in
+        <xref linkend="fig.gadgetron.gadget"/>.</para>
+
+        <figure xml:id="fig.gadgetron.gadget">
+          <title>Gadget</title>
+
+          <mediaobject>
+            <imageobject role="html">
+              <imagedata align="left" fileref="figs/gadget.png" format="PNG"
+                         width="6in"/>
+            </imageobject>
+
+            <imageobject role="fo">
+              <imagedata align="left" fileref="figs/gadget.png" format="PNG"
+                         width="3in"/>
+            </imageobject>
+
+            <textobject>
+              <phrase>Gadget</phrase>
+            </textobject>
+          </mediaobject>
+        </figure>
+
+        <para>The Gadget is an active object based on the
+        <classname>ACE_Task</classname> from the ACE library. It has its own
+        thread (or threads) of execution and an input queue where data is
+        placed for processing by either the Gadgetron framework or an upstream
+        Gadget.</para>
+
+        <para>The active thread(s) in the Gadget will pick up a data package
+        from the queue, and then pass it on to a virtual
+        <function>process</function>. An abbreviated version of the header
+        <filename>Gadget.h</filename> is seen below:</para>
+
+        <programlisting>class Gadget : public ACE_Task<ACE_MT_SYNCH>
+{
+
+public:
+   virtual int svc(void)
+   {
+      //Pick up package from queue
+     
+      //Call process
+      if (this->process(m) == -1) {
+         //Handle error
+      }
+      return 0;
+   }
+
+   //More functions (left out for simplicity)
+
+protected:
+   virtual int process(ACE_Message_Block * m) = 0;
+
+   virtual int process_config(ACE_Message_Block * m) {
+      return 0;
+   }
+
+};</programlisting>
+
+        <para>The data package used by the <classname>ACE_Task</classname> is
+        the <classname>ACE_Message_Block</classname>, which is a very basic
+        block of data (essentially just a byte array). To allow the Gadgets to
+        check if the data blocks on the message queue are of the expected
+        type, the Gadgetron uses a modified
+        <classname>ACE_Message_Block</classname> called
+        <classname>GadgetContainerMessage</classname>, which can contain any
+        class with a no-argument constructor. It is possible to check if the
+        <classname>GadgetContainerMessage</classname> contains a specific type
+        of data, and if so, access that object. Suppose we want to store a
+        class named <classname>MyClass</classname>:</para>
+
+        <programlisting>GadgetContainerMessage<MyClass>* m = 
+  new GadgetContainerMessage<MyClass>();
+
+MyClass* mc = m->getObjectPtr();
+
+//Do something with mc
+
+m->release(); //Delete the message block and containing data</programlisting>
+
+        <para>When a function receives an
+        <classname>ACE_Message_Block</classname> it is possible to check if it
+        is of a certain type:</para>
+
+        <programlisting>int process(ACE_Message_Block* mb)
+{
+  
+  GadgetContainerMessage<MyClass>* m = 
+    AsContainerMessage<MyClass>(mb);
+
+  if (m) {
+    MyClass* mc = m->getObjectPtr();
+    
+    //Do something with mc
+
+  } else {
+    //Something went wrong, deal with error
+    return -1;
+  }
+
+  mb->release();
+
+  return 0;
+}</programlisting>
+
+        <para>It is possible to chain more than one
+        <classname>ACE_Message_Block</classname> together using the
+        <function>cont</function> function. This effectively provides a way to
+        pass multiple arguments into a Gadget and to check if they have the
+        appropriate types:</para>
+
+        <programlisting>int process(ACE_Message_Block* mb)
+{
+  
+  GadgetContainerMessage<MyClass>* m1 = 
+    AsContainerMessage<MyClass>(mb);
+
+  GadgetContainerMessage<MyOtherClass>* m2 = 
+    AsContainerMessage<MyOtherClass>(mb->cont());
+
+  if (m1 && m2) {
+    MyClass* mc = m1->getObjectPtr();
+    MyOtherClass* moc = m2->getObjectPtr();
+    
+    //Do something with mc
+
+  } else {
+    //Something went wrong, deal with error
+    return -1;
+  }
+
+  mb->release(); //This deletes both message blocks
+
+  return 0;
+}</programlisting>
+
+        <para>It gets a bit tedious and error prone to repeat code like the
+        above in every Gadget. To overcome this, the Gadgetron comes with a
+        set of templated classes to automate the steps. If we would like to
+        make a Gadget which takes a single input argument, we would inherit
+        from <classname>Gadget1</classname>. If you need two arguments, you
+        inherit from <classname>Gadget2</classname>:</para>
+
+        <programlisting>template <class P1, class P2> class Gadget2 : public Gadget
+{
+protected:
+   int process(ACE_Message_Block* mb)
+   {
+     //Do type checking 
+   }
+
+   virtual int process(GadgetContainerMessage<P1>* m1, 
+     GadgetContainerMessage<P2>* m2) = 0;
+};</programlisting>
+
+        <para>The base class performs the type checking for you and only when
+        the arguments have been verified, it will call the virtual
+        <function>process</function> above. So, all you need to do in order to
+        implement a Gadget that takes two arguments is to implement this
+        function. As an example, let's look at a very simple Gadget, which
+        receives an image header (in ISMRM Raw Data format) and some image
+        data and does a Fourier transform of the first 3 dimensions. First the
+        header file <filename>FFTGadget.h</filename>:</para>
+
+        <programlisting>#include "gadgetroncore_export.h"
+#include "Gadget.h"
+#include "ismrmrd.h"
+#include "hoNDArray.h"
+#include <complex>
+
+class EXPORTGADGETSCORE FFTGadget : 
+public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+{
+ public:
+  GADGET_DECLARE(FFTGadget)
+
+ protected:
+  virtual int process( 
+     GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+     GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};</programlisting>
+
+        <para>Let us walk through the code step by step. The Gadget takes two
+        arguments: 1) <classname>ISMRMRD::ImageHeader</classname>, which is just
+        a struct with some image header information (it is defined in
+        <filename>ismrmrd.h</filename>), and 2) a
+        <classname>hoNDArray</classname>, which is a multidimensional array
+        (see <xref linkend="sect.ndarray"/>) storage container. In this case
+        the <classname>hoNDArray</classname> contains complex floating point
+        data.</para>
+
+        <para>There are a couple of other things to notice. One is the
+        <function>EXPORTGADGETSCORE</function> macro in the class definition.
+        This is needed to make things work properly on Windows. It is defined
+        in <filename>gadgetroncore_export.h</filename> and is used (on
+        Windows) to indicate if the class is being imported or exported from a
+        DLL. It translates into <function>__declspec(dllexport)</function> or
+        <function>__declspec(dllimport)</function> in Windows and is empty in
+        Linux/OSX. It is beyond the scope of this manual to go into why such a
+        declaration is needed, but keep this in mind when you start creating
+        your own Gadgets. Each shared library (DLL) has its own export
+        declaration macro.</para>
+
+        <para>The other thing to notice is the
+        <function>GADGET_DECLARE(FFTGadget)</function> macro. This macro is
+        required for Windows to correctly handle shared libraries and is
+        needed whenever you create a new Gadget to make things work properly
+        on Windows.</para>
+
+        <para>The actual implementation looks like this:</para>
+
+        <programlisting>#include "FFTGadget.h"
+#include "FFT.h"
+
+int FFTGadget::process( 
+   GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  FFT<float>::instance()->ifft(m2->getObjectPtr(),0);
+  FFT<float>::instance()->ifft(m2->getObjectPtr(),1);
+  FFT<float>::instance()->ifft(m2->getObjectPtr(),2);
+
+  if (this->next()->putq(m1) < 0) {
+     return GADGET_FAIL;
+  }
+
+  return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(FFTGadget)</programlisting>
+
+        <para>Once we are inside the <function>process</function> function,
+        the data has already been converted to the appropriate container
+        messages and we can start processing the data. This function uses an
+        FFT toolbox (more on toolboxes in <xref linkend="sect.toolboxes"/>).
+        After the data has been Fourier transformed along the first 3
+        dimensions it is placed on the next Gadget's queue. Remember the two
+        <classname>GadgetContainerMessage</classname> objects were originally
+        picked up from the message queue as a chain of
+        <classname>ACE_Message_Block</classname> objects. They are still
+        chained together, i.e. when passing <varname>m1</varname> on to the
+        next Gadget we are effectively passing on both arguments.</para>
+
+        <para>Another couple of macros to notice are the
+        <varname>GADGET_OK</varname> and <varname>GADGET_FAIL</varname>. They
+        are defined as 0 and -1 respectively. The convention in the Gadgetron
+        is to return 0 when a function succeeds and < 0 when it fails -
+        unless the function returns a pointer.</para>
+
+        <para>Last thing to notice is the
+        <function>GADGET_FACTORY_DECLARE(FFTGadget)</function> statement. This
+        is a macro which declares functions for loading a Gadget of this type
+        out of a shared library and destroying it again when we are done. It
+        ensures that we can load the Gadget on all platforms. When you create
+        your own gadgets you should use this macro to declare the factory
+        function for the Gadget.</para>
+
+        <para>For a tutorial on how to make your own Gadget library see <xref
+        linkend="sect.makingnewgadgetlibrary"/>.</para>
+
+        <sect3 xml:id="sect.xmlparameters">
+          <title>Gadget XML Configuration</title>
+
+          <para>In addition to defining a Gadget's behavior in response to a
+          data package, it is also possible for the Gadgets to receive
+          configuration information or parameters. The user can define the
+          Gadget's behavior in response to configuration information by
+          implementing the <function>process_config</function> function in the
+          Gadget header file. The configuration information or parameters are
+          typically transmitted at the beginning of the reconstruction process
+          from the client (see <xref linkend="sect.communicationprotocol"/>).
+          The configuration information can in principle be in any format (a
+          given application can use a binary format or a text format defined
+          for the specific purpose), but conventionally the parameters are
+          transmitted in XML format and for the MRI Gadgets, the XML
+          configuration is the XML header from the ISMRM Raw Data file. More
+          details on this format and how to easily parse it with the included
+          C++ XML data binding classes can be found at
+          <uri>http://ismrmrd.sourceforge.net</uri>.</para>
+
+          <para>An example of a parameter XML file for an MRI data set is
+          shown here:</para>
+
+          <programlisting><?xml version="1.0"?>
+
+<ismrmrdHeader xmlns="http://www.ismrm.org/ISMRMRD" 
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+  xmlns:xs="http://www.w3.org/2001/XMLSchema" 
+  xsi:schemaLocation="http://www.ismrm.org/ISMRMRD ismrmrd.xsd">
+
+  <subjectInformation>
+    <patientName>phantom</patientName>
+    <patientWeight_kg>72.5748</patientWeight_kg>
+  </subjectInformation>
+  <acquisitionSystemInformation>
+    <systemVendor>SIEMENS</systemVendor>
+    <systemModel>Avanto</systemModel>
+    <systemFieldStrength_T>1.494</systemFieldStrength_T>
+    <receiverChannels>32</receiverChannels>
+    <relativeReceiverNoiseBandwidth>0.79</relativeReceiverNoiseBandwidth>
+  </acquisitionSystemInformation>
+  <experimentalConditions>
+    <H1resonanceFrequency_Hz>63620740</H1resonanceFrequency_Hz>
+  </experimentalConditions>
+  <encoding>
+    <trajectory>cartesian</trajectory>
+    <encodedSpace>
+      <matrixSize>
+        <x>256</x>
+        <y>128</y>
+        <z>1</z>
+      </matrixSize>
+      <fieldOfView_mm>
+        <x>600</x>
+        <y>300</y>
+        <z>5</z>
+      </fieldOfView_mm>
+    </encodedSpace>
+    <reconSpace>
+      <matrixSize>
+        <x>128</x>
+        <y>128</y>
+        <z>1</z>
+      </matrixSize>
+      <fieldOfView_mm>
+        <x>300</x>
+        <y>300</y>
+        <z>5</z>
+      </fieldOfView_mm>
+    </reconSpace>
+    <encodingLimits>
+      <kspace_encoding_step_1>
+        <minimum>0</minimum>
+        <maximum>127</maximum>
+        <center>64</center>
+      </kspace_encoding_step_1>
+      <kspace_encoding_step_2>
+        <minimum>0</minimum>
+        <maximum>0</maximum>
+        <center>0</center>
+      </kspace_encoding_step_2>
+      <slice>
+        <minimum>0</minimum>
+        <maximum>0</maximum>
+        <center>0</center>
+      </slice>
+      <set>
+        <minimum>0</minimum>
+        <maximum>0</maximum>
+        <center>0</center>
+      </set>
+    </encodingLimits>
+  </encoding>
+  <sequenceTiming>
+    <TR>5.86</TR>
+    <TE>2.96</TE>
+  </sequenceTiming>
+</ismrmrdHeader></programlisting>
+
+          <para>The user/developer can use any XML parsing technique to
+          extract parameters from this XML header, but we encourage developers
+          to use the C++ XML Data Binding classes that are included with the
+          ISMRM Raw Data C++ library. For example, to parse encoding limits
+          (example from <filename>AccumulatorGadget.cpp</filename>):</para>
+
+          <programlisting>int AccumulatorGadget::process_config(ACE_Message_Block* mb)
+{
+ 
+ //Calling parsing convenience function found in GadgetIsmrmrdReadWrite.cpp
+ boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+ ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding(); 
+ if (e_seq.size() != 1) {
+  GADGET_DEBUG2("Number of encoding spaces: %d\n", e_seq.size());
+  GADGET_DEBUG1("Only supports one encoding space supported\n");
+  return GADGET_FAIL;
+ }
+
+ ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+ ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+ ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+ GADGET_DEBUG2("Matrix size: %d, %d, %d\n", 
+   e_space.matrixSize().x(), 
+   e_space.matrixSize().y(), 
+   e_space.matrixSize().z());
+
+ dimensions_.push_back(e_space.matrixSize().x());
+ dimensions_.push_back(e_space.matrixSize().y());
+ dimensions_.push_back(e_space.matrixSize().z());
+
+ slices_ = e_limits.slice().present() ? 
+   e_limits.slice().get().maximum()+1 : 1;
+
+  return GADGET_OK;
+}</programlisting>
+        </sect3>
+      </sect2>
+
+      <sect2 xml:id="sect.readerswriters">
+        <title>Readers and Writers</title>
+
+        <para>As illustrated in <xref linkend="fig.gadgetron.architecture"/>
+        the Gadgetron uses a set of Readers and Writers to deal with the
+        incoming communication on the TCP/IP socket. Readers are responsible
+        for deserialization of packages and Writers are responsible for
+        serialization of packages. All packages that arrive on the socket will
+        start with a message ID. Based on this ID, the Gadgetron delegates the
+        responsibility of reading the package off the socket to a particular
+        instance of a <classname>GadgetMessageReader</classname> defined by
+        the following abstract class:</para>
+
+        <programlisting>class GadgetMessageReader
+{
+ public:
+  virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) = 0;
+};</programlisting>
+
+        <para>In order to be able to read a specific type of data, the
+        <function>read</function> function must be implemented for that data
+        type. As an example here is the
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>, which
+        reads an MRI data acquisition from the socket.</para>
+
+        <programlisting>class GadgetIsmrmrdAcquisitionMessageReader 
+: public GadgetMessageReader
+{
+ public:
+  GADGETRON_READER_DECLARE(GadgetIsmrmrdAcquisitionMessageReader);
+  virtual ACE_Message_Block* read(ACE_SOCK_Stream* socket);
+};</programlisting>
+
+        <para>Note the
+        <function>GADGETRON_READER_DECLARE(GadgetIsmrmrdAcquisitionMessageReader)</function>
+        declaration. This is equivalent to the declaration needed for the
+        Gadgets (see <xref linkend="sect.gadgets"/>) in order to make them
+        load properly from shared libraries.</para>
+
+        <para>The implementation of this particular reader is as follows (this
+        is an abbreviated version without error checking, etc.):</para>
+
+        <programlisting>ACE_Message_Block* GadgetIsmrmrdAcquisitionMessageReader::read(ACE_SOCK_Stream* sock)
+{
+ GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1 =
+   new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+
+ GadgetContainerMessage<hoNDArray< std::complex<float> > >* m2 =
+   new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+ m1->cont(m2);
+
+ ssize_t recv_count = 0;
+
+ if ((recv_count = stream->recv_n(m1->getObjectPtr(), sizeof(ISMRMRD::AcquisitionHeader))) <= 0) {
+  m1->release();
+  return 0;
+ }
+
+ if (m1->getObjectPtr()->trajectory_dimensions) {
+  GadgetContainerMessage<hoNDArray< float > >* m3 =
+    new GadgetContainerMessage< hoNDArray< float > >();
+
+ m2->cont(m3);
+
+ std::vector<unsigned int> tdims;
+ tdims.push_back(m1->getObjectPtr()->trajectory_dimensions);
+ tdims.push_back(m1->getObjectPtr()->number_of_samples);
+
+ if (!m3->getObjectPtr()->create(&tdims)) {
+  m1->release();
+  return 0;
+ }
+
+ if ((recv_count =
+   stream->recv_n
+    (m3->getObjectPtr()->get_data_ptr(),
+     sizeof(float)*tdims[0]*tdims[1])) <= 0) {
+
+     m1->release();
+
+   return 0;
+ }
+
+ std::vector<unsigned int> adims;
+ adims.push_back(m1->getObjectPtr()->number_of_samples);
+ adims.push_back(m1->getObjectPtr()->active_channels);
+
+ if (!m2->getObjectPtr()->create(&adims)) {
+   m1->release();
+   return 0;
+ }
+
+ if ((recv_count =
+      stream->recv_n
+      (m2->getObjectPtr()->get_data_ptr(),
+      sizeof(std::complex<float>)*adims[0]*adims[1])) <= 0) {
+
+    m1->release();
+
+    return 0;
+ }
+
+return m1;
+}
+
+GADGETRON_READER_FACTORY_DECLARE(GadgetIsmrmrdAcquisitionMessageReader)</programlisting>
+
+        <para>The Reader allocates two
+        <classname>GadgetContainerMessage</classname> data blocks to contain
+        the incoming data. First an MRI acquisition header (defined in
+        <filename>GadgetMRIHeaders.h</filename>) is read. Based on this header, the
+        length of each acquisition (number of samples) and the number of
+        acquisition channels are determined. A
+        <classname>hoNDArray</classname> is allocated to store the data read
+        from the socket. Notice that the two
+        <classname>GadgetContainerMessage</classname> objects are chained together
+        using the <function>cont</function> function.</para>
+
+        <para>A final important statement to notice is:</para>
+
+        <programlisting>GADGETRON_READER_FACTORY_DECLARE(GadgetIsmrmrdAcquisitionMessageReader)</programlisting>
+
+        <para>This macro declares create and destroy functions to load the
+        reader from a shared library on all platforms supported.</para>
+
+        <para>Whereas the Readers are responsible for deserialization, the
+        <classname>GadgetMessageWriter</classname> is responsible for the
+        opposite operation (serialization). In practice, Gadgets that produce
+        an output for the client application can hand that data back to the
+        Gadgetron framework where it is placed on the output queue along with
+        a message ID. This is for instance done in this (abbreviated) code
+        from an <classname>ImageFinishGadget</classname>:</para>
+
+        <programlisting>template <typename T>
+int ImageFinishGadget<T>
+::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+   GadgetContainerMessage< hoNDArray< T > >* m2)
+{
+  if (!this->controller_) {
+    return -1;
+  }
+
+  GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+    new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+  switch (sizeof(T)) {
+  case 2: //Unsigned short
+   mb->getObjectPtr()->id = 
+      GADGET_MESSAGE_IMAGE_REAL_USHORT;
+   break;
+  case 4: //Float
+   mb->getObjectPtr()->id = 
+      GADGET_MESSAGE_IMAGE_REAL_FLOAT;
+   break;
+  case 8: //Complex float
+   mb->getObjectPtr()->id = 
+      GADGET_MESSAGE_IMAGE_CPLX_FLOAT;
+   break;
+  default:
+   GADGET_DEBUG2("Wrong data size detected: %d\n", sizeof(T));
+   mb->release();
+   m1->release();
+   return GADGET_FAIL;
+  }
+
+  mb->cont(m1);
+
+  int ret =  this->controller_->output_ready(mb);
+
+  if ( (ret < 0) ) {
+   GADGET_DEBUG1("Failed to return massage to controller\n");
+   return GADGET_FAIL;
+  }
+
+  return GADGET_OK;
+}</programlisting>
+
+        <para>Notice that the Gadget has a reference to the Gadgetron
+        framework through the <varname>controller_</varname> member variable,
+        which is set during initialization.</para>
+
+        <para>In the framework (more specifically in the
+        <classname>GadgetStreamController</classname>) there is an active
+        thread responsible for writing messages that are put on to the output
+        queue. This is done by investigating the message ID and then picking
+        the <classname>GadgetMessageWriter</classname> associated with this
+        ID. A Writer must implement the following abstract class:</para>
+
+        <programlisting>class GadgetMessageWriter
+{
+ public:
+  virtual int write(ACE_SOCK_Stream* stream, 
+                    ACE_Message_Block* mb) = 0;
+};</programlisting>
+
+        <para>The Writer is handed control of the socket along with the
+        message block. A Writer declaration could look like:</para>
+
+        <programlisting>class MRIImageWriter 
+  : public GadgetMessageWriter
+{
+
+public:
+   GADGETRON_WRITER_DECLARE(MRIImageWriter);
+   virtual int write(ACE_SOCK_Stream* sock, 
+                     ACE_Message_Block* mb);
+};</programlisting>
+
+        <para>Notice again the
+        <function>GADGETRON_WRITER_DECLARE(MRIImageWriter)</function> which
+        ensures proper run-time linking behavior. The implementation could
+        look like (abbreviated with no error checking, etc.):</para>
+
+        <programlisting>int MRIImageWriter
+     ::write(ACE_SOCK_Stream* sock, 
+             ACE_Message_Block* mb)
+{
+
+   GadgetContainerMessage<ISMRMRD::ImageHeader>* imagemb = 
+      AsContainerMessage<ISMRMRD::ImageHeader>(mb);
+  
+   GadgetContainerMessage< hoNDArray< float > >* datamb =
+      AsContainerMessage< hoNDArray< float > >(imagemb->cont());
+  
+   if (!datamb || !imagemb) {
+      //Deal with errors
+   }
+   
+   GadgetMessageIdentifier id;
+   //Example for real float image.
+   id.id = GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT; 
+ 
+   sock->send_n (&id, sizeof(GadgetMessageIdentifier));
+
+   sock->send_n (imagemb->getObjectPtr(), sizeof(ISMRMRD::ImageHeader));
+
+   sock->send_n (datamb->getObjectPtr()->get_data_ptr(), 
+      sizeof(float)*datamb->getObjectPtr()->get_number_of_elements());
+
+   return 0;
+}
+
+GADGETRON_WRITER_FACTORY_DECLARE(MRIImageWriter)</programlisting>
+
+        <para>Once again notice the required
+        <function>GADGETRON_WRITER_FACTORY_DECLARE(MRIImageWriter)</function>
+        macro. Also notice that the message ID is transmitted to the client.
+        The client is expected to follow the same communication model as the
+        Reader, but it is determined entirely by the Writer implementation how
+        the message is transmitted.</para>
+
+        <para>Readers and Writers are loaded dynamically at run-time along
+        with the Gadgets (see <xref linkend="sect.streamconfiguration"/>). The
+        input and output behaviour can be adapted by manipulating which
+        Readers and Writers are associated with which message IDs.</para>
+      </sect2>
+
+      <sect2 xml:id="sect.streamconfiguration">
+        <title>Stream Configuration</title>
+
+        <para>A Gadgetron reconstruction is made up of modules, i.e. Readers,
+        Writers, and Gadgets. New reconstruction programs can be created by
+        simply assembling existing components in a new way. The configuration
+        of the Gadgetron stream is done at run-time and new configuration
+        chains can be created without recompiling any of the underlying
+        Gadgets. More specifically, the configuration is specified in an XML
+        file that the Gadgetron will read before receiving data. The best way
+        to explain the format is by looking at a (simplified) example:</para>
+
+        <programlisting><?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration 
+  xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+  xmlns="http://gadgetron.sf.net/gadgetron"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetroncore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetroncore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetroncore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetroncore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetroncore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration></programlisting>
+
+        <para>The stream configuration XML layout is defined in the
+        <filename>GADGETRON_HOME/schema/gadgetron.xsd</filename>. A stream
+        configuration must conform to this schema definition or an error will
+        be generated when the Gadgetron attempts to load the
+        configuration.</para>
+
+        <para>The configuration file format contains 3 sections: 1) Readers,
+        2) Writers, 3) Stream (with Gadgets) corresponding to the 3 different
+        types of components that can be assembled in the Gadgetron.</para>
+
+        <para>In the example above, the Readers section contains only one
+        reader, which is the
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname> mentioned
+        previously. The message ID associated with this Reader is 1008. Every
+        time a message with ID 1008 arrives on the socket, responsibility for
+        reading the message will be delegated to the
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>. When the
+        Gadgetron configuration is loaded, the framework will load the
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname> from the
+        DLL (shared library) <filename>gadgetroncore</filename>. On the Linux
+        platform this would be a shared library called
+        <filename>libgadgetroncore.so</filename> and on the Windows platform
+        it would be called <filename>gadgetroncore.dll</filename>.</para>
+
+        <para>The Gadgetron framework knows how to load the components from
+        the DLLs assuming that they have been declared properly as described
+        in <xref linkend="sect.readerswriters"/> and <xref
+        linkend="sect.gadgets"/>.</para>
+
+        <para>The example Gadgetron configuration has three Writers, i.e. it is
+        capable of outputting three different types of data. Again the
+        declarations cause the Gadgetron framework to load specific instances
+        of <classname>GadgetMessageWriter</classname> and associate them with
+        specific ID numbers.</para>
+
+        <para>There are certain built-in Readers and Writers in addition to
+        those specified in the configuration file. As an example, there are
+        Readers for receiving configurations to be used by the Gadgetron and
+        for receiving the parameters that will be passed to all Gadgets (see
+        <xref linkend="sect.communicationprotocol"/>). If the Gadgetron
+        receives a message with an ID for which there is no associated Reader
+        or encounters a message on the output queue for which there is no
+        associated Writer an error will be generated, the Gadgetron stream
+        shuts down, and the connection to the client will be closed.</para>
+
+        <para>In the example above, we have 4 Gadgets in the reconstruction
+        chain. The first Gadget is an
+        <classname>AccumulatorGadget</classname>, which collects individual
+        lines and inserts them in k-space. When the k-space image is complete
+        it is sent to the next Gadget in the chain, the
+        <classname>FFTGadget</classname>, which is responsible for Fourier
+        transforming the data into image space. The next Gadget
+        (<classname>ExtractGadget</classname>) will extract the magnitude of
+        the complex image. Finally the last Gadget in the chain
+        (<classname>ImageFinishGadgetFLOAT</classname>) sends the
+        reconstructed image back to the Gadgetron framework where it is added
+        to the output queue.</para>
+
+        <para>It is also possible to send configuration parameters to Gadgets
+        using the XML file. For example, to set a parameter in a Gadget, one
+        could write:</para>
+
+        <programlisting>  <gadget>
+   <name>Accumulator</name>
+   <dll>gadgetroncore</dll>
+   <classname>AccumulatorGadget</classname>
+   <property><name>MyTestProperty</name>
+   <value>Blah Blah</value></property>
+   <property><name>MyTestProperty2</name>
+   <value>98776.862187</value></property>
+  </gadget>
+</programlisting>
+
+        <para>The two properties will now be accessible inside the Gadget
+        using the parameter access functions defined in
+        <filename>Gadget.h</filename>:</para>
+
+        <programlisting>class Gadget : public ACE_Task<ACE_MT_SYNCH>
+{
+
+//Other definitions
+
+int get_bool_value(const char* name);
+int get_int_value(const char* name);
+double get_double_value(const char* name);
+
+};</programlisting>
+
+        <para>Additionally it is also possible to specify how many active
+        threads there should be in a Gadget. This is specified with:</para>
+
+        <programlisting>  <gadget>
+   <name>Accumulator</name>
+   <dll>gadgetroncore</dll>
+   <classname>AccumulatorGadget</classname>
+   <property><name>threads</name><value>5</value></property>
+  </gadget></programlisting>
+
+        <para>Which would make the <classname>AccumulatorGadget</classname>
+        have 5 threads.</para>
+      </sect2>
+
+      <sect2 xml:id="sect.communicationprotocol">
+        <title>Communication Sequence</title>
+
+        <para>Communication between a client and the Gadgetron follows a
+        straightforward communication protocol. When the Gadgetron is started
+        it will be expecting a connection on a specific port (port 9002 is the
+        default). The communication sequence is as follows:</para>
+
+        <orderedlist>
+          <listitem>
+            <para>The client makes a connection.</para>
+          </listitem>
+
+          <listitem>
+            <para>The Gadgetron accepts the connection and creates a new
+            instance of a <classname>GadgetStreamController</classname> (see
+            <xref linkend="fig.gadgetron.architecture"/>). After creating the
+            <classname>GadgetStreamController</classname> the Gadgetron
+            returns to accept connections on the socket such that multiple
+            clients can be connected simultaneously.</para>
+          </listitem>
+
+          <listitem>
+            <para>The <classname>GadgetStreamController</classname> takes
+            control of the socket and expects to read a specific type of
+            message, which either contains the filename of a specific stream
+            configuration (see <xref linkend="sect.streamconfiguration"/>) or
+            alternatively it can receive the actual XML stream specification
+            directly on the socket. These two types of messages are read with
+            Readers that are always registered for the Gadgetron (see <xref
+            linkend="sect.readerswriters"/>). If the Gadgetron receives the
+            filename of a Gadget stream it expects to be able to find that
+            configuration file in the <filename>gadgetron/config</filename>
+            folder (see <xref linkend="sect.fileorganization"/>).</para>
+          </listitem>
+
+          <listitem>
+            <para>The <classname>GadgetStreamController</classname> is then
+            expecting to receive parameters that will be transmitted to each
+            individual Gadget. In principle the "parameters" is just a raw
+            buffer of characters that will be transmitted as such to each
+            individual Gadget. It is the convention however to send the
+            parameters in an XML format. It is up to each individual Gadget to
+            interpret the parameters. The user can implement any behavior in
+            response to the parameters by implementing the
+            <function>process_config</function> function (see <xref
+            linkend="sect.gadgets"/>). The client can send parameters at any
+            time during a reconstruction and they will always be transmitted
+            to all Gadgets through the <function>process_config</function>
+            function.</para>
+          </listitem>
+
+          <listitem>
+            <para>The client then starts transmitting data packages that the
+            Gadgetron processes. Images are returned to the client.</para>
+          </listitem>
+
+          <listitem>
+            <para>When the client has no more data it will send a closure
+            package. This package causes all Gadgets (in order) to process all
+            remaining data on their input queue and then shut down.</para>
+          </listitem>
+
+          <listitem>
+            <para>Once the final Gadget has shut down, the connection with the
+            client is terminated.</para>
+          </listitem>
+        </orderedlist>
+
+        <para>To make it easier to create a new client, the Gadgetron comes
+        with a <classname>GadgetronConnector</classname> class:</para>
+
+        <programlisting>class GadgetronConnector: 
+  public ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_MT_SYNCH> {
+
+public:
+
+ int open (std::string hostname, std::string port);   
+ int putq  (ACE_Message_Block * mb ,  
+     ACE_Time_Value *  timeout = 0);
+
+ int register_reader(unsigned int slot, 
+     GadgetMessageReader* reader);
+
+ int register_writer(unsigned int slot, 
+     GadgetMessageWriter* writer);
+
+ int send_gadgetron_configuration_file(std::string config_xml_name);   
+ int send_gadgetron_configuration_script(std::string config_xml_name);
+ int send_gadgetron_parameters(std::string xml_string);
+};</programlisting>
+
+        <para>This class can be used to create simple clients that open a
+        connection with the Gadgetron using the <function>open</function>
+        function and then communicate with the Gadgetron through the Readers
+        and Writers registered with the connector. See the
+        <application>mriclient</application> example application
+        (<filename>gadgetron/apps/clients/mriclient</filename> in the source
+        code archive) for a simple example of how to build a Gadgetron
+        client.</para>
+      </sect2>
+
+      <sect2 xml:id="sect.fileorganization">
+        <title>File Organization</title>
+
+        <para>This section provides a brief overview of the file organization
+        in the Gadgetron installation. Once you have compiled the Gadgetron
+        and installed it (see <xref linkend="sect.installation"/>), it will
+        reside in its designated installation folder
+        (<varname>GADGETRON_HOME</varname>). For the purposes of this
+        description, we will assume that the Gadgetron was installed in
+        <filename>/usr/local/gadgetron</filename>.</para>
+
+        <para>In <varname>GADGETRON_HOME</varname> you should find the
+        following folders:</para>
+
+        <para><itemizedlist>
+            <listitem>
+              <para><filename>bin</filename>: Contains all executables from
+              the Gadgetron framework including the
+              <application>gadgetron</application> executable itself and all
+              clients and standalone applications.</para>
+            </listitem>
+
+            <listitem>
+              <para><filename>config</filename>: Contains Gadgetron XML
+              configuration files (see <xref
+              linkend="sect.streamconfiguration"/>). This is where the
+              Gadgetron searches for the configurations requested by the
+              clients during initialization of the Gadget chain (see <xref
+              linkend="sect.communicationprotocol"/>). It also contains the
+              global <filename>gadgetron.xml</filename> configuration file,
+              which is used to set global configuration parameters such as the
+              port number for the Gadgetron.</para>
+            </listitem>
+
+            <listitem>
+              <para><filename>lib</filename>: Contains all shared libraries
+              (Gadgets and toolboxes). Additionally, this is the default path
+              where Python Gadgets look for Python modules.</para>
+            </listitem>
+
+            <listitem>
+              <para><filename>include</filename>: Contains all header files
+              for the Gadgets and Toolboxes in order that they can be linked
+              into external applications and Gadget libraries compiled outside
+              the Gadgetron source tree.</para>
+            </listitem>
+
+            <listitem>
+              <para><filename>schema</filename>: Contains all the XML schema
+              definitions used by the Gadgetron (e.g.
+              <filename>gadgetron.xsd</filename>) and also serves as a
+              container for schema files used by client applications and
+              copied to this folder during installation.</para>
+            </listitem>
+
+            <listitem>
+              <para><filename>cmake</filename>: Contains a set of helpful
+              CMake scripts that can be used if you wish to build applications
+              or Gadget libraries outside the Gadgetron source tree. Among
+              other things it contains a
+              <filename>FindGadgetron.cmake</filename> script, which can be
+              used to localize and set paths for the Gadgetron using
+              CMake.</para>
+            </listitem>
+          </itemizedlist></para>
+      </sect2>
+    </sect1>
+
+    <sect1 xml:id="sect.toolboxes">
+      <title>Gadgetron Toolboxes</title>
+
+      <para>The core reconstruction data structures and algorithms are made
+      available through a set of toolboxes in shared libraries. The toolboxes
+      implement the functionality of the various Gadgets, but they can also be
+      used in standalone applications. A non-exhaustive overview of key
+      functionality is covered in the following sections.</para>
+
+      <sect2 xml:id="sect.ndarray">
+        <title><classname>NDArray</classname></title>
+
+        <para>Most image processing operations involve multi-dimensional
+        arrays. Although the Gadgetron framework does not impose any specific
+        array structure on the user, it does come with an abstract
+        multi-dimensional array used throughout: the
+        <classname>NDArray</classname>. It has a specific implementation for
+        the CPU (<classname>hoNDArray</classname>) and GPU
+        (<classname>cuNDArray</classname>). The abstract class definition
+        looks like (abbreviated version):</para>
+
+        <programlisting>template <class T> class NDArray
+{
+ public:
+  
+  NDArray ();
+
+  virtual ~NDArray();
+  
+  virtual T* create(std::vector<unsigned int> *dimensions); 
+
+  virtual T* create(std::vector<unsigned int> *dimensions, 
+                    T* data, bool delete_data_on_destruct = false);
+
+  virtual int permute(std::vector<unsigned int> *dim_order,
+                      NDArray<T> *out = 0, int shift_mode = 0);
+  
+  inline unsigned int get_number_of_dimensions() const {
+    return dimensions_->size();
+  }
+
+  unsigned int get_size(unsigned int dimension);
+
+  boost::shared_ptr< std::vector<unsigned int> > get_dimensions();
+  
+  inline T* get_data_ptr() const { 
+    return data_; 
+  }
+  
+  inline unsigned long int get_number_of_elements() const {
+    return elements_;
+  }
+
+  //Other public functions...
+
+protected:
+
+  virtual int allocate_memory() = 0;
+  virtual int deallocate_memory() = 0;
+
+  //Other private functions
+  
+};</programlisting>
+
+        <para>The CPU (host) definition would look like (abbreviated):</para>
+
+        <programlisting>template <class T> class hoNDArray : public NDArray<T>
+{
+
+public:
+   //Public functions...
+
+protected:
+   virtual int allocate_memory();
+   virtual int deallocate_memory();
+};</programlisting>
+
+        <para>As is seen from the <classname>NDArray</classname> header file,
+        this class has a no-argument constructor, which makes it suited for
+        encapsulating in the <classname>GadgetContainerMessage</classname>
+        mentioned in <xref linkend="sect.gadgets"/>. The procedure for
+        creating an array with complex float values would look something like
+        this:</para>
+
+        <programlisting>#include <hoNDArray.h>
+#include <complex>
+
+hoNDArray< std::complex<float> > myArray;
+
+std::vector<unsigned int> dimensions;
+dimensions.push_back(128);
+dimensions.push_back(128);
+
+if(!myArray.create(&dimensions)) {
+   //Deal with errors
+}
+
+//process data</programlisting>
+
+        <para>To create an <classname>NDArray</classname> contained in a
+        <classname>GadgetContainerMessage</classname> would look something
+        like this:</para>
+
+        <programlisting>GadgetContainerMessage< hoNDArray< std::complex<float> > >* m = 
+  new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+std::vector<unsigned int> dimensions;
+dimensions.push_back(128);
+dimensions.push_back(128);
+
+if(!m->getObjectPtr()->create(&dimensions)) {
+   //Deal with errors
+}
+
+//Process data or pass on to other Gadget, etc. 
+
+m->release(); //Delete the message block and containing data
+</programlisting>
+
+        <para>As mentioned in <xref linkend="sect.gadgets"/>, the
+        <classname>GadgetContainerMessage</classname> is a specialized version
+        of the <classname>ACE_Message_Block</classname> class from the ACE
+        framework. Data is passed between Gadgets in the form of
+        <classname>ACE_Message_Block</classname>s and Gadgets have access to
+        utility functions that allow them to test if a given
+        <classname>ACE_Message_Block</classname> is in fact a particular type
+        of <classname>GadgetContainerMessage</classname>.</para>
+
+        <sect3>
+          <title>GPU Support</title>
+
+          <para>The <classname>NDArray</classname> data structure also has a
+          GPU implementation (abbreviated version of header below):</para>
+
+          <programlisting>template <class T> class cuNDArray : public NDArray<T>
+{
+ public:
+  cuNDArray();
+
+  cuNDArray(const cuNDArray<T>& a);
+
+  // Constructor from hoNDArray
+  cuNDArray(hoNDArray<T> *a);
+
+  // Assignment operator
+  cuNDArray& operator=(const cuNDArray<T>& rhs);
+  
+  virtual ~cuNDArray();
+
+  virtual T* create(std::vector<unsigned int> *dimensions);
+
+  virtual T* create(std::vector<unsigned int> *dimensions, 
+                    int device_no);
+
+  virtual T* create(std::vector<unsigned int> *dimensions, 
+                    T* data, bool delete_data_on_destruct = false);
+
+  virtual boost::shared_ptr< hoNDArray<T> > to_host() const;
+  
+  virtual int set_device(int device_no);
+  inline int get_device() { return device_; }
+  
+ protected:
+  
+  int device_; 
+  virtual int allocate_memory();
+  virtual int deallocate_memory();
+  
+};</programlisting>
+
+          <para>It has a few extra <function>create</function> functions
+          compared to the host (CPU) version of this array. Specifically, it
+          is possible to provide the array with the device number that the
+          array should be allocated on. This is important when working on
+          systems with multiple GPU processors. The default is to allocate it
+          on the current device (device 0 unless specifically set otherwise).
+          It is possible to query on which device the data is allocated and to
+          effectively move the data from one device to another through
+          operators. Similarly, one copy constructor takes a
+          <classname>hoNDArray</classname> and transparently copies the host
+          data to the GPU.</para>
+        </sect3>
+      </sect2>
+
+      <sect2>
+        <title><classname>vector_td</classname></title>
+
+        <para>The class <classname>vector_td</classname> provides a basic
+        representation of one-, two-, three-, or four-dimensional vectors
+        (positions). It is templatized with the datatype <varname>T</varname>
+        and dimensionality <varname>D</varname>. For convenience we provide a
+        set of typedefs to commonly encountered instances. A subset of the
+        definitions provided in <filename>vector_td.h</filename> is provided
+        here (users should check the actual file e.g. for additional often
+        used constructors):</para>
+
+        <programlisting>
+template< class T, unsigned int D > class vector_td
+{
+public:
+
+  T vec[D];
+
+  __inline__ __host__ __device__ T& operator[](const int i) 
+  {
+    return vec[i];
+  }
+
+  __inline__ __host__ __device__ const T& operator[](const int i) const
+  { 
+    return vec[i];
+  }
+};
+
+
+// Some typedefs for convenience
+
+template< class REAL, unsigned int D > struct reald{
+  typedef vector_td< REAL, D > Type;
+};
+
+template< unsigned int D > struct intd{
+  typedef vector_td< int, D > Type;
+};
+
+template< unsigned int D > struct uint64d{
+  typedef vector_td< unsigned int, D > Type;
+};
+
+template< unsigned int D > struct floatd{
+ typedef typename reald< float, D >::Type Type;
+};
+
+template< unsigned int D > struct doubled{
+  typedef typename reald< double, D >::Type Type;
+};
+
+template< class T > struct complext{
+  typedef vector_td< T, 2 > Type;
+};
+
+</programlisting>
+
+        <para>A number of arithmetic and conditional operators on the
+        <classname>vector_td</classname> are defined in
+        <filename>vector_td_operators.h</filename>. Similarly, the header
+        <filename>vector_td_utilities.h</filename> wraps common math
+        functionality for the <classname>vector_td</classname> class. Many
+        common operations that take one or more
+        <classname>cuNDArray</classname> instances with element type
+        <classname>vector_td</classname> are defined in
+        <filename>ndarray_vector_utilities.h</filename>. We encourage the
+        reader to explore these utilities on his own.</para>
+
+        <para>The <classname>vector_td</classname> can be used in both host
+        and device code. As an example of use it is contained in the interface
+        of the non-Cartesian FFT described in <xref
+        linkend="sect.NFFT"/>.</para>
+      </sect2>
+
+      <sect2>
+        <title>complext</title>
+
+        <para>A complex number class that can be used in both host and device
+        code is found in <filename>complext.h</filename>. It contains a
+        substantial set of useful operators and functions.</para>
+      </sect2>
+
+      <sect2 xml:id="sect.ffttoolbox">
+        <title>Fourier Transforms</title>
+
+        <sect3>
+          <title>Cartesian FFT</title>
+
+          <sect4>
+            <title>FFT of a <classname>hoNDArray</classname></title>
+
+            <para>The Gadgetron uses the FFTW library for Fourier transform of
+            <classname>hoNDArray</classname> structures. Users can call the
+            FFTW directly from their code, but to make things a little easier,
+            we provide a simple wrapper class defined in
+            <filename>toolboxes/ndarray/FFT.h</filename>. Here is an
+            abbreviated version:</para>
+
+            <programlisting>template <typename T> class EXPORTNDARRAY FFT
+{
+
+public:
+ static FFT<T>* instance(); 
+
+ void fft(hoNDArray< std::complex<T> >* input, 
+          unsigned int dim_to_transform);
+
+ void ifft(hoNDArray< std::complex<T> >* input, 
+          unsigned int dim_to_transform);
+
+ void fft(hoNDArray< std::complex<T> >* input);
+
+ void ifft(hoNDArray< std::complex<T> >* input);
+
+protected:
+ FFT();
+ virtual ~FFT();
+};</programlisting>
+
+            <para>The <classname>FFT</classname> class provides simple wrapper
+            functionality to perform FFTs of <classname>hoNDArray</classname>s
+            along a specific dimension or along all dimensions. It performs
+            <emphasis>in place</emphasis> FFTs and works on complex arrays of
+            single or double precision.</para>
+
+            <para>An important feature of this class is that it is a process
+            wide singleton for the Gadgetron. As outlined in the definition
+            above, the constructor and destructor are protected and it is not
+            possible to allocate a new <classname>FFT</classname> object. The
+            way to use the class is through the
+            <function>instance</function> function:</para>
+
+            <programlisting>#include "FFT.h"
+
+FFT<float>::instance()->fft(...);</programlisting>
+
+            <para>The reason for this is that the FFTW planning routines are
+            not thread safe. Multiple Gadgets (that each have their own thread
+            of execution) may need to use FFTs and consequently the planning
+            routines need to be protected with a mutex. All of this is handled
+            inside the <classname>FFT</classname> class and since it is a
+            singleton only one thread can run the planning routines at any
+            given time.</para>
+
+            <para>As mentioned it is possible for the users to call FFTW
+            routines directly, and there may be some performance reasons for
+            doing so (as opposed to using this wrapper), but please be aware
+            of this thread safety issue when you design your Gadgets. If you
+            want to be on the safe side, use the wrapper.</para>
+          </sect4>
+
+          <sect4>
+            <title>FFT of a <classname>cuNDArray</classname></title>
+
+            <para>Cartesian Fast Fourier Transform on the GPU is supported by
+            wrapping Cuda's FFT routines as defined in
+            <filename>cuNDFFT.h</filename>.</para>
+
+            <programlisting>template<class T> class EXPORTGPUCORE cuNDFFT
+{
+ public:
+
+  cuNDFFT() {}
+  virtual ~cuNDFFT() {}
+
+  int fft ( cuNDArray<T> *input, 
+    std::vector<unsigned int> *dims_to_transform );
+
+  int ifft( cuNDArray<T> *input, 
+    std::vector<unsigned int> *dims_to_transform, 
+    bool do_scale = true );
+
+  int fft ( cuNDArray<T> *input, 
+    unsigned int dim_to_transform);
+
+  int ifft( cuNDArray<T> *input, 
+    unsigned int dim_to_transform, 
+    bool do_scale = true );
+
+  int fft ( cuNDArray<T> *input );
+  int ifft( cuNDArray<T> *input, 
+    bool do_scale = true );
+
+ protected:
+  int fft_int( cuNDArray<T> *input, 
+    std::vector<unsigned int> *dims_to_transform, 
+    int direction, 
+    bool do_scale = true );
+};
+ </programlisting>
+
+            <para>The interface defines forwards and inverse transforms of a
+            single array dimension, all dimensions of the array, or a subset
+            of dimensions.</para>
+          </sect4>
+        </sect3>
+
+        <sect3 xml:id="sect.NFFT">
+          <title>Non-Cartesian FFT</title>
+
+          <para>A dedicated GPU implementation of the NUFFT - often referred
+          to as gridding - is provided. The interface is defined in
+          <filename>NFFT.h</filename>, provided below in abbreviated
+          form:</para>
+
+          <programlisting>template< class REAL, unsigned int D, bool ATOMICS=false > 
+class EXPORTGPUNFFT NFFT_plan
+{
+ public: // Main interface
+    
+  // Constructors
+  NFFT_plan();
+  NFFT_plan( typename uint64d<D>::Type matrix_size, 
+             typename uint64d<D>::Type matrix_size_os, 
+             REAL W, int device = -1 );
+
+  // Destructor
+  virtual ~NFFT_plan();
+
+  // Clear internal storage in plan
+  enum NFFT_wipe_mode { NFFT_WIPE_ALL, NFFT_WIPE_PREPROCESSING };
+  bool wipe( NFFT_wipe_mode mode );
+
+  // Replan 
+  bool setup( typename uint64d<D>::Type matrix_size, 
+              typename uint64d<D>::Type matrix_size_os, 
+              REAL W, int device = -1 );
+    
+  // Preprocess trajectory
+  // Cartesian to non-Cartesian / non-Cartesian to Cartesian / both
+  enum NFFT_prep_mode { NFFT_PREP_C2NC, 
+                        NFFT_PREP_NC2C, 
+                        NFFT_PREP_ALL };
+
+  bool preprocess
+    ( cuNDArray<typename reald<REAL,D>::Type> *trajectory, 
+      NFFT_prep_mode mode );
+    
+  // Execute NFFT 
+  // ( Cartesian to non-Cartesian or non-Cartesian to Cartesian)  
+  enum NFFT_comp_mode { NFFT_FORWARDS_C2NC, 
+                        NFFT_FORWARDS_NC2C, 
+                        NFFT_BACKWARDS_C2NC, 
+                        NFFT_BACKWARDS_NC2C };
+
+  bool compute( cuNDArray<complext<REAL> > *in, 
+                cuNDArray<complext<REAL> > *out, 
+                cuNDArray<REAL> *dcw, NFFT_comp_mode mode );
+
+  // Execute NFFT iteration 
+  // (Cartesian to non-Cartesian and back to Cartesian space)
+  bool mult_MH_M( cuNDArray<complext<REAL> > *in, 
+                  cuNDArray<complext<REAL> > *out, 
+                  cuNDArray<REAL> *dcw, 
+                  std::vector<unsigned int> halfway_dims );
+  
+ public: // Utilities
+  
+  // NFFT convolution 
+  // (Cartesian to non-Cartesian or non-Cartesian to Cartesian)
+  enum NFFT_conv_mode { NFFT_CONV_C2NC, NFFT_CONV_NC2C };
+  bool convolve( cuNDArray<complext<REAL> > *in, 
+                 cuNDArray<complext<REAL> > *out, 
+                 cuNDArray<REAL> *dcw, 
+                 NFFT_conv_mode mode, bool accumulate = false );
+    
+  // NFFT FFT
+  enum NFFT_fft_mode { NFFT_FORWARDS, NFFT_BACKWARDS };
+  bool fft( cuNDArray<complext<REAL> > *data, 
+            NFFT_fft_mode mode, bool do_scale = true );
+  
+  // NFFT deapodization
+  bool deapodize( cuNDArray<complext<REAL> > *image );
+
+ public: // Setup queries
+
+  typename uint64d<D>::Type get_matrix_size();
+  typename uint64d<D>::Type get_matrix_size_os();
+  REAL get_W();
+  unsigned int get_device();
+  
+...
+};</programlisting>
+
+          <para>After a <classname>NFFT_plan</classname> is constructed the
+          <function>preprocess</function> function should be called with the
+          desired trajectory. In the special case of radial sampling the
+          header <filename>radial_utilities.h</filename> defines some
+          convenient functions to compute radial trajectories and
+          corresponding density compensation weights. After preprocessing the
+          NFFT can be executed through the <function>compute</function>
+          function. The individual building blocks of the NFFT - convolution,
+          FFT, and deapodization - are exposed in the public interface and
+          hence available for use in custom algorithms.</para>
+
+          <para>It is often required to perform the NFFT on a number of
+          different inputs. Particularly in 1D and 2D the best performance is
+          obtained if many transforms are executed concurrently in order to
+          keep the device fully occupied. Two strategies can be
+          combined:<itemizedlist>
+              <listitem>
+                <para>The trajectory passed to the preprocess method is
+                normally a one-dimensional cuNDArray containing normalized (to
+                the range [-0.5;0.5]) non-Cartesian positions as
+                <classname>reald<REAL,D></classname> elements of
+                precision <classname>REAL</classname> and dimensionality
+                <classname>D</classname>. However, if the cuNDArray is
+                two-dimensional, the second dimension specifies that we wish
+                to transform a number of frames with different trajectories
+                concurrently.</para>
+              </listitem>
+
+              <listitem>
+                <para>If a number of transformations with identical
+                trajectories are to be performed, the input and output
+                arrays to the compute method can be any multiple of the
+                Cartesian and non-Cartesian dimensions configured from the
+                <function>setup</function> and
+                <function>preprocess</function> methods. The images provided
+                are consequently batch transformed.</para>
+              </listitem>
+            </itemizedlist></para>
+
+          <para><remark>Please note</remark>. The NFFT performs significantly
+          better on GPUs supporting Cuda's shader model 2.0 or newer compared
+          to devices supporting only shader models 1.x. The reason is that
+          we rely on the inherent caching of global memory - available only on
+          hardware supporting at least shader model 2.0.</para>
+
+          <para>As of the Gadgetron release 1.1 a version of the NFFT
+          implemented using atomic operations is available. It is enabled
+          through the ATOMICS boolean template argument (and defaults to
+          false, i.e. disabled). At the time of the Gadgetron 1.1 release the
+          current generation hardware showed inferior performance using the
+          atomic version compared to the non-atomic version. However, using the
+          atomic version significantly reduces the memory requirements, and
+          could thus be the only viable option, particularly for three- or
+          four-dimensional reconstructions, on GPUs lacking sufficient
+          memory.</para>
+        </sect3>
+      </sect2>
+
+      <sect2 xml:id="sect.matrix_operators">
+        <title>Linear (Matrix) Operators</title>
+
+        <para>A fundamental building block of most image reconstruction
+        algorithms is the abstract class
+        <classname>linearOperator</classname>. A range of linear imaging and
+        regularization operators are inherited from this pure virtual base
+        class (abbreviated):<programlisting>template < class REAL, class ARRAY_TYPE > class linearOperator
+{
+ public:
+
+  linearOperator() { weight_ = REAL(1); }
+
+  virtual ~linearOperator() {}
+
+  virtual void set_weight( REAL weight ){ weight_ = weight; }
+  virtual REAL get_weight(){ return weight_; }
+
+  virtual bool set_domain_dimensions
+    ( std::vector<unsigned int> *dims ) { ... }
+  virtual bool set_codomain_dimensions
+    ( std::vector<unsigned int> *dims ) { ... }
+
+  virtual boost::shared_ptr< std::vector<unsigned int> > 
+    get_domain_dimensions() { ... }
+
+  virtual boost::shared_ptr< std::vector<unsigned int> > 
+    get_codomain_dimensions() { ... }
+
+  virtual int mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, 
+                      bool accumulate = false) = 0;
+  virtual int mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, 
+                       bool accumulate = false) = 0;
+  virtual int mult_MH_M( ARRAY_TYPE* in, ARRAY_TYPE* out, 
+                       bool accumulate = false)
+  {
+    // Perform mult_M followed by mult_MH
+    ...
+  }
+  
+  virtual boost::shared_ptr< 
+    linearOperator< REAL, ARRAY_TYPE > > clone() = 0;
+
+  ...
+};</programlisting>The <classname>linearOperator</classname> is templated by
+        two arguments: 1) the basic precision <classname>REAL</classname>
+        (e.g. <classname>float</classname> or <classname>double</classname>)
+        and 2) the <classname>ARRAY_TYPE</classname> (e.g.
+        <classname>hoNDArray<T></classname> or
+        <classname>cuNDArray<T></classname>) representing the expected
+        vector format for the matrix-vector multiplication the operator
+        implements.</para>
+
+        <para>Every <classname>linearOperator</classname> has an associated
+        weight that is used to balance multiple matrix terms when added to a
+        cost function (see <xref linkend="sect.linear_solvers"/>).</para>
+
+        <para>The main functionality is provided in the two pure virtual
+        functions <function>mult_M</function> and <function>mult_MH</function>
+        denoting multiplication with the matrix operator
+        (<varname>M</varname>) and multiplication with the adjoint (i.e.
+        conjugate transpose) of the matrix operator
+        (<varname>M<superscript>H</superscript></varname>) respectively. The
+        default implementation of <function>mult_MH_M</function> computes an
+        "iteration" of the two
+        (<varname>M<superscript>H</superscript>M</varname>) by invoking
+        <function>mult_M</function> and <function>mult_MH</function> in turn.
+        Specialized operators can redefine the virtual
+        <function>mult_MH_M</function> to increase performance when
+        appropriate.</para>
+
+        <para>The <function>clone</function> method is required by some
+        solvers to make a clone (copy) of a given
+        <classname>linearOperator</classname>. Similarly, some solvers require
+        knowledge of the <varname>domain</varname> and
+        <varname>codomain</varname> dimensions on which the operator can be
+        applied. The <function>mult_M</function> method converts the input
+        vector of <varname>domain_size</varname> to one of
+        <varname>codomain_size</varname> - and vice versa for
+        <function>mult_MH</function>.</para>
+
+        <para>The <classname>linearOperator</classname> is used to model a
+        linear imaging modality's encoding operation (Fourier transform for
+        MRI, Radon transform for CT, convolution for microscopy etc.) but also
+        common regularization operators such as the identity matrix, the partial
+        derivatives etc.</para>
+
+        <para>Here follows a list that briefly describes the linear operators
+        that are used for the reconstruction examples discussed later in this
+        document (<xref linkend="sect.exampleapplications"/>, <xref
+        linkend="sect.standalone_applications"/>).</para>
+
+        <sect3>
+          <title>List of linear operators</title>
+
+          <para>This section provides a non-exhaustive list of available linear
+          operators in Gadgetron toolboxes.</para>
+
+          <para>A two-level implementation strategy is used for most of the
+          operators the Gadgetron provides. We first derive a class, say
+          <classname>identityOperator</classname>, from the
+          <classname>linearOperator</classname> base class. In this derived
+          class we implement the pure virtual functions of the base class,
+          e.g. <function>mult_M</function>, <function>mult_MH</function>, and
+          <function>mult_MH_M</function>. The overall algorithm and
+          functionality of the operator is implemented at this level. Like its
+          superclass, the <classname>identityOperator</classname> is however
+          templated on the underlying <classname>ARRAY_TYPE</classname> and
+          thus cannot contain dedicated implementation code to a specific
+          array implementation. The implementation of
+          <function>mult_M</function>, <function>mult_MH</function>, and
+          <function>mult_MH_M</function> is consequently based on a new set of
+          pure virtual functions of the templated
+          <classname>ARRAY_TYPE</classname>. We provide another level of
+          inheritance, e.g. <classname>cuIdentityOperator</classname>, which
+          in this case provides the <classname>cuNDArray</classname>-specific
+          implementation of the pure virtual function in
+          <classname>identityOperator</classname>. This hierarchy has the
+          desired design goal, that the core algorithm implementation is
+          shared in the base class of the operator. Only the host/device
+          specific sub-components are defined individually. It is thus fairly
+          straightforward to derive both a <classname>cuNDArray</classname>
+          and an <classname>hoNDArray</classname> version of an
+          operator.</para>
+
+          <para>As an example we provide a simplified declaration of the
+          <classname>identityOperator</classname> and
+          <classname>cuIdentityOperator</classname> below. Although it is not
+          mentioned explicitly for the subsequent operators, many of them
+          follow a similar inheritance hierarchy.</para>
+
+          <itemizedlist>
+            <listitem>
+              <para><classname>identityOperator</classname></para>
+
+              <para>Implements multiplication of a vector with the identity
+              matrix.</para>
+
+              <para><programlisting>// Notice: simplified code without error-checking
+
+template <class REAL, class ARRAY_TYPE> class identityOperator
+ : public linearOperator<REAL, ARRAY_TYPE>
+{
+ public:
+
+  identityOperator() : linearOperator<REAL, ARRAY_TYPE>() {}
+  virtual ~identityOperator() {}
+  
+  // operator_xpy computes "x+y" and stores the result in y
+  virtual bool operator_xpy( ARRAY_TYPE *x, ARRAY_TYPE *y ) = 0;
+
+  virtual int mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, 
+                      bool accumulate = false )
+  {
+    if( accumulate )
+      operator_xpy( in, out );
+    else 
+      *out = *in;
+  }
+
+  ... // Similar code for mult_MH and mult_MH_M
+};
+
+</programlisting><parameter>The Cuda specific
+              implementation:</parameter><programlisting>// Notice: 
+// Simplified code without error checking and multi-device support
+
+template <class REAL, class T> 
+class cuIdentityOperator 
+: public identityOperator< REAL, cuNDArray<T> >
+{
+ public:
+
+  cuIdentityOperator() : 
+    identityOperator< REAL, cuNDArray<T> >() {}
+  
+  virtual ~cuIdentityOperator() {}
+  
+  virtual bool operator_xpy( cuNDArray<T> *x, cuNDArray<T> *y )
+  { 
+    return cuNDA_axpy( T(1), x, y );
+  }
+
+ ...
+};
+
+</programlisting>Notice that the template arguments to the
+              <classname>cuIdentityOperator</classname> differ from its base
+              class. <classname>REAL</classname> specifies the desired
+              precision (<classname>float</classname> or
+              <classname>double</classname>) and <classname>T</classname>
+              specifies the desired type - which could be identical to
+              <classname>REAL</classname> or e.g. a
+              <classname>complext<REAL></classname>. Also notice how the
+              <classname>cuIdentityOperator</classname> class definition
+              directly specifies the ARRAY_TYPE of its superclass (in this
+              case to be of type
+              <classname>cuNDArray<T></classname>).</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>partialDerivativeOperator</classname></para>
+
+              <para>Provides the partial derivative of an image in a given
+              spatial dimension.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>laplaceOperator</classname></para>
+
+              <para>Computes the Laplacian of an image.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>imageOperator</classname></para>
+
+              <para>Performs multiplication with a diagonal matrix of the
+              element-wise reciprocal of a given image.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>convolutionOperator</classname></para>
+
+              <para>Performs convolution of an image with a given
+              kernel.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>nfftOperator</classname></para>
+
+              <para>Implements the non-Cartesian Fast Fourier Transform</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>senseOperator</classname></para>
+
+              <para>Implements the encoding operator for the parallel MRI
+              imaging technique Sense. Comes in two flavours for 1) Cartesian
+              and 2) non-Cartesian reconstruction.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>multiplicationOperatorContainer</classname></para>
+
+              <para>An operator can often be considered the result of
+              multiplicative concatenation of a sequence of simpler linear
+              operators. The
+              <classname>multiplicationOperatorContainer</classname> defines a
+              convenient interface to ease the construction of such
+              concatenations.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>encodingOperatorContainer</classname></para>
+
+              <para>As we require exactly one encoding operator (but allow
+              multiple regularization operators) to be added to our solvers
+              (see <xref linkend="sect.linear_solvers"/> below), this operator
+              acts as a container when multiple encoding operators are
+              desired. For example: The cost function right below (<xref
+              linkend="sect.cg_solver"/>) has two terms in its general form.
+              Most often the vector <emphasis role="bold">p</emphasis> is
+              <emphasis role="bold">0</emphasis> and consequently the operator
+              <emphasis role="bold">R</emphasis> is considered a
+              regularization operator while the operator <emphasis
+              role="bold">E</emphasis> the single encoding operator. However,
+              if <emphasis role="bold">p</emphasis> is non-zero, both
+              <emphasis role="bold">E</emphasis> and <emphasis
+              role="bold">R</emphasis> must be added to an
+              <classname>encodingOperatorContainer</classname> that takes in
+              both <emphasis role="bold">m</emphasis> and <emphasis
+              role="bold">p</emphasis> during multiplication. A single
+              <classname>encodingOperatorContainer</classname> is then added
+              to the solver.</para>
+            </listitem>
+          </itemizedlist>
+        </sect3>
+      </sect2>
+
+      <sect2 xml:id="sect.linear_solvers">
+        <title>Linear Solvers</title>
+
+        <para>The Gadgetron's solvers toolbox contains both a generic
+        conjugate gradient solver to solve linear least squares reconstruction
+        problems (see <xref linkend="sect.cg_solver"/>) and two
+        flavors of a Split Bregman solver for non-linear problems using
+        l1-norms for regularization (see <xref
+        linkend="sect.nonlinear_solvers"/>). More solvers can be expected in
+        upcoming releases.</para>
+
+        <sect3 xml:id="sect.cg_solver">
+          <title>Conjugate Gradient Method for Linear Least Squares</title>
+
+          <para>The conjugate gradient solver is used to reconstruct an image
+          posed as a minimizer to an l2-based optimization problem:</para>
+
+          <informalequation>
+            <mediaobject>
+              <imageobject role="html">
+                <imagedata fileref="figs/math/lls.jpg" format="JPEG"
+                           width="3in"/>
+              </imageobject>
+
+              <imageobject role="fo">
+                <imagedata fileref="figs/math/lls.jpg" format="JPEG"
+                           width="3in"/>
+              </imageobject>
+            </mediaobject>
+          </informalequation>
+
+          <para>The unknown image to be reconstructed is denoted here by
+          <emphasis role="bold">u</emphasis> and the measured data by
+          <emphasis role="bold">m</emphasis>. <emphasis
+          role="bold">E</emphasis> is a linear operator modelling the encoding
+          of the imaging modality (e.g. a Fourier transform for MRI, a Radon
+          transform for CT etc.). <emphasis role="bold">R</emphasis> is a
+          regularization operator often required to ensure uniqueness of the
+          solution. Lambda is a scalar weight (with a default value of one)
+          associated to each matrix operator and used to balance the various
+          terms in the cost function. Finally <emphasis
+          role="bold">p</emphasis> denotes some (possibly blank) prior image
+          in the regularization term. Any number of terms can be added.</para>
+
+          <para>The closed form solution to the optimization problem is given
+          by the linear system of equations:</para>
+
+          <informalequation>
+            <mediaobject>
+              <imageobject role="html">
+                <imagedata fileref="figs/math/lls_form.jpg" format="JPEG"
+                           width="3in"/>
+              </imageobject>
+
+              <imageobject role="fo">
+                <imagedata fileref="figs/math/lls_form.jpg" format="JPEG"
+                           width="3in"/>
+              </imageobject>
+            </mediaobject>
+          </informalequation>
+
+          <para>Put very briefly: you set up and run a solver by 1) adding
+          the corresponding linear operators to the solver, and 2) invoking
+          the <function>solve</function> function in the solver providing
+          <emphasis role="bold">m</emphasis> (and <emphasis
+          role="bold">p</emphasis> if non-zero) as input arguments.</para>
+
+          <para>An abbreviated version of the interface to the conjugate
+          gradient solver is shown here</para>
+
+          <para><programlisting>// Defined in solver.h
+
+template <class ARRAY_TYPE_IN, class ARRAY_TYPE_OUT> 
+class solver
+{
+public:
+
+  // Constructor/destructor
+  //
+
+  solver() { output_mode_ = OUTPUT_SILENT; }
+  virtual ~solver() {}
+  
+  // Output modes
+  //
+
+  enum solverOutputModes { OUTPUT_SILENT = 0, 
+                           OUTPUT_WARNINGS = 1, 
+                           OUTPUT_VERBOSE = 2, 
+                           OUTPUT_MAX = 3 };
+  
+  // Set/get output mode
+  //
+
+  virtual int get_output_mode() { return output_mode_; }
+
+  virtual void set_output_mode( int output_mode ) {
+      output_mode_ = output_mode;
+  }
+  
+  // Set/get starting solution/estimate for solver
+  //
+
+  virtual void set_x0( boost::shared_ptr<ARRAY_TYPE_OUT> x0 )
+    { x0_ = x0; }
+
+  virtual boost::shared_ptr<ARRAY_TYPE_OUT> get_x0()
+    { return x0_; }
+
+  // Default error output
+  //
+
+  virtual void solver_error( std::string msg ) { ... }
+
+  // Default warning output
+  //
+
+  virtual void solver_warning( std::string msg ) { ... }
+
+  // Invoke solver
+  //
+
+  virtual boost::shared_ptr<ARRAY_TYPE_OUT> solve
+    ( ARRAY_TYPE_IN* ) = 0;
+ 
+protected:
+  int output_mode_;
+  boost::shared_ptr<ARRAY_TYPE_OUT> x0_;
+};</programlisting></para>
+
+          <para>The abstract <classname>cgSolver</classname> class:</para>
+
+          <para><programlisting>// Defined in cgSolver.h
+
+template <class REAL, 
+          class ELEMENT_TYPE, 
+          class ARRAY_TYPE> 
+class cgSolver : public linearSolver
+  <REAL, ELEMENT_TYPE, ARRAY_TYPE>
+{
+public:
+
+  // Class defining the termination criterium
+  //
+
+  friend class cgTerminationCallback
+    <REAL, ELEMENT_TYPE, ARRAY_TYPE>;
+
+  // Constructor / destructor
+  //
+
+  cgSolver() : linearSolver<REAL, ELEMENT_TYPE, ARRAY_TYPE>() 
+   {...}
+ 
+  virtual ~cgSolver() {}
+
+  // Set preconditioner
+  //
+
+  virtual void set_preconditioner( 
+    boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+  }
+  
+  // Set termination callback
+  //
+
+  virtual void set_termination_callback(
+    boost::shared_ptr< cgTerminationCallback
+      <REAL, ELEMENT_TYPE, ARRAY_TYPE> > cb ){
+      cb_ = cb;
+  }
+
+  // Set/get maximally allowed number of iterations
+  //
+
+  virtual void set_max_iterations( unsigned int iterations ) { 
+    iterations_ = iterations; }
+
+  virtual unsigned int get_max_iterations() { return iterations_; }  
+
+  // Set/get tolerance threshold for termination criterium
+  //
+
+  virtual void set_tc_tolerance( REAL tolerance ) 
+    { tc_tolerance_ = tolerance; }
+
+  virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+  
+  // Pre/post solver callbacks
+  //
+
+  virtual bool pre_solve( ARRAY_TYPE** ) { return true; }
+  virtual bool post_solve( boost::shared_ptr<ARRAY_TYPE>& ) 
+    { return true; }
+
+  // Pure virtual functions defining core solver functionality
+  // Implemented on the host/device respectively in a derived class
+  //
+
+  virtual ELEMENT_TYPE solver_dot( ARRAY_TYPE*, ARRAY_TYPE* ) = 0;
+  virtual bool solver_clear( ARRAY_TYPE* ) = 0;
+  virtual bool solver_scal( ELEMENT_TYPE, ARRAY_TYPE* ) = 0;
+  virtual bool solver_dump( ARRAY_TYPE* ) { return true; }
+  virtual bool solver_axpy
+    ( ELEMENT_TYPE, ARRAY_TYPE*, ARRAY_TYPE* ) = 0;
+
+  //
+  // Main solver interfaces
+  //
+
+  virtual boost::shared_ptr<ARRAY_TYPE> solve( ARRAY_TYPE *_d ) 
+    { ... }
+
+  virtual boost::shared_ptr<ARRAY_TYPE> solve_from_rhs
+    ( ARRAY_TYPE *_rhs ) { ... }
+
+  ...
+};
+</programlisting><parameter>The Cuda specific
+          implementation:</parameter><programlisting>// Defined in cuCGSolver.h
+
+template <class REAL, class T> class cuCgSolver 
+  : public cgSolver< REAL, T, cuNDArray<T> >
+{
+public:
+
+  cuCgSolver() : cgSolver< REAL, T, cuNDArray<T> >() { ... }
+  virtual ~cuCgSolver() {}
+
+  virtual bool pre_solve(cuNDArray<T>**)
+   { ... }
+
+  virtual bool post_solve(cuNDArray<T>**)
+   { ... }
+
+  virtual void solver_error( std::string err )
+   { ... }
+
+  virtual T solver_dot( cuNDArray<T> *x, 
+   cuNDArray<T> *y ){ ... }
+
+  virtual bool solver_clear( cuNDArray<T> *x )
+   { ... }
+
+  virtual bool solver_scal( T a, 
+   cuNDArray<T> *x ){ ... }
+
+  virtual bool solver_axpy( T a, cuNDArray<T> *x, 
+   cuNDArray<T> *y ){ ... }
+
+  ...
+};</programlisting>The overall inheritance hierarchy is modelled and
+          implemented similarly to the <classname>linearOperator</classname>
+          class hierarchy described above (see <xref
+          linkend="sect.matrix_operators"/>). To use the solver the user
+          creates an instance of the solver for either the host or device
+          (e.g. the <classname>cuCgSolver</classname> above for a GPU-based
+          solver). The solver is configured using the functions in the
+          <classname>cgSolver</classname> base class. The core solve function
+          itself is found in the root of the hierarchy; the
+          <classname>solver</classname>.</para>
+
+          <para>Note that any number of terms (linear operators) can be added
+          to the solver (or cost function).</para>
+
+          <para>The following code listing provides a short example of how to
+          define a conjugate gradient solver for GPU-based image deblurring
+          given an image and an estimate of the point spread function that
+          degraded the image. It uses the
+          <classname>convolutionOperator</classname> to model the blurring and
+          a <classname>partialDerivativeOperator</classname> in each spatial
+          dimension for regularization. The full code can be found in
+          <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp</filename>.<programlisting>{
+  << Code that parses the command line 
+     and loads the image and kernel from disk >>
+
+  // Define the desired precision
+  typedef float _real; 
+  typedef complext<_real>::Type _complext;
+
+  // Upload host data to device
+  cuNDArray<_complext> data(host_data.get());
+  cuNDArray<_complext> kernel(host_kernel.get());
+    
+  // Setup regularization operators
+
+  boost::shared_ptr
+    < cuPartialDerivativeOperator<_real,_complext,2> > 
+      Rx( new cuPartialDerivativeOperator<_real,_complext,2>(0) ); 
+
+  boost::shared_ptr
+    < cuPartialDerivativeOperator<_real,_complext,2> > 
+      Ry( new cuPartialDerivativeOperator<_real,_complext,2>(1) ); 
+
+  Rx->set_weight( lambda );
+  Ry->set_weight( lambda );
+     
+  //
+  // Setup conjugate gradients solver
+  //
+
+  // Define encoding matrix
+  boost::shared_ptr< cuConvolutionOperator<_real,2> > 
+    E( new cuConvolutionOperator<_real,2>() );
+
+  E->set_kernel( &kernel );
+  E->set_domain_dimensions(data.get_dimensions().get());
+    
+  // Setup conjugate gradient solver
+  cuCGSolver<_real, _complext> cg;
+
+  // encoding matrix
+  cg.set_encoding_operator( E );
+
+  // regularization matrix                   
+  if( kappa>0.0 ) cg.add_regularization_operator( Rx );
+  
+  // regularization matrix
+  if( kappa>0.0 ) cg.add_regularization_operator( Ry ); 
+
+  cg.set_max_iterations( num_iterations );
+  cg.set_tc_tolerance( 1e-8 );
+  cg.set_output_mode( cuCGSolver<_real, _complext>::OUTPUT_VERBOSE );
+                
+  //
+  // Conjugate gradient solver
+  //
+  
+  boost::shared_ptr< cuNDArray<_complext> > 
+    cgresult = cg.solve(&data);
+
+  // All done, write out the result
+  
+  boost::shared_ptr< hoNDArray<_complext> > 
+    host_result = cgresult->to_host();
+  
+  write_nd_array<_complext>(host_result.get(), 
+    (char*)parms.get_parameter('r')->get_string_value());
+}</programlisting></para>
+
+          <para>For an overview of the various standalone applications the
+          Gadgetron provides - and instructions on how to run them - we refer
+          to <xref linkend="sect.standalone_applications"/>.</para>
+        </sect3>
+      </sect2>
+
+      <sect2 xml:id="sect.nonlinear_solvers">
+        <title>Non-linear Solvers</title>
+
+        <sect3>
+          <title>Split Bregman Solver for L1-regularized Problems</title>
+
+          <para>The Gadgetron includes two Split Bregman solvers to solve
+          respectively</para>
+
+          <informalequation>
+            <mediaobject>
+              <imageobject role="html">
+                <imagedata fileref="figs/math/sb.jpg" format="JPEG"
+                           width="3in"/>
+              </imageobject>
+
+              <imageobject role="fo">
+                <imagedata fileref="figs/math/sb.jpg" format="JPEG"
+                           width="3in"/>
+              </imageobject>
+            </mediaobject>
+          </informalequation>
+
+          <para>where |.|<subscript>TV</subscript> denotes the Total Variation
+          norm. The solver for the upper (unconstrained) optimization problem is
+          defined in <filename>sbSolver.h</filename> while the solver for the
+          latter constrained problem is declared in
+          <filename>sbcSolver.h</filename>. The Split Bregman solver was
+          chosen as it integrates nicely with the linear conjugate gradient solver
+          described above (<xref linkend="sect.linear_solvers"/>). In fact,
+          most of the work in the two Split Bregman solvers is performed by a
+          linear inner solver (e.g. a conjugate gradient solver), but the
+          input (right hand side) to the inner solver varies from iteration to
+          iteration.</para>
+
+          <para>The interface to the unconstrained Split Bregman solver is
+          given here. We have seen the overall inheritance hierarchy several
+          times already, so it should suffice to provide only very abbreviated
+          headers here:<programlisting>// Defined in sbSolver.h
+
+
+template< class REAL, 
+          class ELEMENT_TYPE, 
+          class ARRAY_TYPE_REAL, 
+          class ARRAY_TYPE_ELEMENT, 
+          class INNER_SOLVER,
+          class OPERATOR_CONTAINER > class sbSolver 
+
+ : public linearSolver<REAL, ELEMENT_TYPE, ARRAY_TYPE_ELEMENT>
+{
+public:
+
+  // Constructor
+  //
+
+  sbSolver() : linearSolver<REAL, ELEMENT_TYPE, ARRAY_TYPE_ELEMENT>() 
+   { ... }
+  
+  // Destructor
+  //
+
+  virtual ~sbSolver() {}
+   
+
+  // Add regularization group operator 
+  // (isotropic, multiple operators per group allowed)
+  //
+
+  virtual bool add_regularization_group_operator( 
+    boost::shared_ptr< linearOperator<REAL, ARRAY_TYPE_ELEMENT> > op ) 
+  { ... }
+
+  // Add isotropic regularization group (multiple groups allowed)
+  //
+
+  virtual bool add_group() { ... }
+
+  // Get regularization group operator
+  //
+ 
+  < omitted for brevity>
+  
+  // Set/get prior image (PICCS style). 
+  // I.e. for every regularization operator (group) 
+  // R that is added we minimize:
+  // alpha|R(x-prior)|_{l1} + (1-alpha)|R(x)|_{l1}
+  //
+
+  virtual bool set_prior_image( 
+    boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior, REAL alpha )
+  { ... }
+ 
+  // Get the prior image and corresponding weighing factor
+  //
+  
+  virtual boost::shared_ptr<ARRAY_TYPE_ELEMENT> get_prior_image() 
+    { return prior_; }
+
+  virtual REAL get_prior_alpha() { return alpha_; }
+
+
+  // Set termination criterium tolerance
+  //
+
+  virtual void set_tc_tolerance( REAL tolerance ) 
+  { ... }
+
+  // Set/get maximum number of outer Split-Bregman iterations
+  //
+
+  virtual void set_max_outer_iterations( 
+    unsigned int iterations ) { outer_iterations_ = iterations; }
+ 
+  virtual unsigned int get_max_outer_iterations() { 
+    return outer_iterations_; }
+
+  // Set/get maximum number of inner Split-Bregman iterations
+  //
+
+  virtual void set_max_inner_iterations( 
+    unsigned int iterations ) { inner_iterations_ = iterations; }
+
+  virtual unsigned int get_max_inner_iterations() 
+   { return inner_iterations_; }
+
+  // Get the inner solver
+  //
+
+  virtual boost::shared_ptr<INNER_SOLVER> get_inner_solver() 
+   { return inner_solver_; }
+  
+
+  // Core solver functionality to be implemented
+  // in a derived class (host/device specific implementations)
+  //
+
+  virtual bool solver_clear_real( ARRAY_TYPE_REAL* ) = 0;
+
+  virtual bool solver_clear_element( ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual bool solver_sqrt( ARRAY_TYPE_REAL* ) = 0;
+
+  virtual bool solver_scal( ELEMENT_TYPE, 
+                            ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual bool solver_axpy_real( REAL, ARRAY_TYPE_REAL*, 
+                                 ARRAY_TYPE_REAL* ) = 0;
+
+  virtual bool solver_axpy_element( ELEMENT_TYPE, 
+                                    ARRAY_TYPE_ELEMENT*, 
+                                    ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual REAL solver_asum( ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual boost::shared_ptr<ARRAY_TYPE_REAL> solver_abs
+    ( ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual boost::shared_ptr<ARRAY_TYPE_REAL> solver_norm
+    ( ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual bool solver_shrink1( REAL, ARRAY_TYPE_ELEMENT*, 
+                               ARRAY_TYPE_ELEMENT* ) = 0;
+
+  virtual bool solver_shrinkd( REAL, ARRAY_TYPE_REAL*, 
+                               ARRAY_TYPE_ELEMENT*, 
+                               ARRAY_TYPE_ELEMENT* ) = 0;
+
+  //
+  // Main solver interface
+  //
+
+  virtual boost::shared_ptr<ARRAY_TYPE_ELEMENT> solve
+   ( ARRAY_TYPE_ELEMENT *f ) { ... }
+
+ ...
+};
+</programlisting><programlisting>// Defined in cuSbCgSolver.h
+
+template <class REAL, class T> class cuSbCgSolver 
+  : public sbSolver< REAL, T, cuNDArray<REAL>, 
+                     cuNDArray<T>, cuCgSolver<REAL,T>, 
+                     cuEncodingOperatorContainer<REAL,T> >
+{
+public:
+  
+  cuSbCgSolver() : sbSolver< REAL, T, cuNDArray<REAL>, 
+                             cuNDArray<T>, cuCgSolver<REAL,T>, 
+                             cuEncodingOperatorContainer<REAL,T> >() 
+  { ... }
+  
+  virtual ~cuSbCgSolver() {}
+
+  // Implementation of pure virtual functions
+  ...
+};</programlisting></para>
+
+          <para>To run the algorithm on the GPU the user would create an
+          instance of a <classname>cuSbCgSolver</classname> providing the two
+          template arguments: the desired precision and data type. Prior to
+          running the <function>solve</function> function with the measured
+          data <emphasis role="bold">m</emphasis>, the user should provide 1)
+          the encoding operator, 2) the regularization operators, and 3) the
+          desired domain and codomain dimensions as these cannot in general be
+          deduced from the measured data.</para>
+
+          <para>We outline the code required to set up the solver for TV-based
+          image denoising. The full code can be found in
+          <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/denoising/2d/denoise_TV.cpp</filename>.<programlisting>{
+  << Command line parsing and data loading >>
+  
+  //
+  // Setup regularization operators
+  // 
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_real,_real,2> > 
+    Rx( new cuPartialDerivativeOperator<_real,_real,2>(0) ); 
+
+  boost::shared_ptr< cuPartialDerivativeOperator<_real,_real,2> > 
+    Ry( new cuPartialDerivativeOperator<_real,_real,2>(1) ); 
+
+  Rx->set_weight( lambda );
+  Rx->set_domain_dimensions(data.get_dimensions().get());
+  Rx->set_codomain_dimensions(data.get_dimensions().get());
+
+  Ry->set_weight( lambda );
+  Ry->set_domain_dimensions(data.get_dimensions().get());
+  Ry->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Define encoding operator (identity)
+  boost::shared_ptr< cuIdentityOperator<_real,_real> > 
+    E( new cuIdentityOperator<_real,_real>() );
+
+  E->set_weight( mu );
+  E->set_domain_dimensions(data.get_dimensions().get());
+  E->set_codomain_dimensions(data.get_dimensions().get());
+
+  // Setup split-Bregman solver
+  //
+
+  cuSbCgSolver<_real,_real> sb;
+
+  sb.set_encoding_operator( E );
+
+  sb.add_regularization_group_operator( Rx );
+  sb.add_regularization_group_operator( Ry);
+  sb.add_group();
+
+  sb.set_max_outer_iterations(num_outer_iterations);
+  sb.set_max_inner_iterations(num_inner_iterations);
+
+  sb.set_output_mode( cuCgSolver<_real,_real>::OUTPUT_VERBOSE );
+  
+  // Setup inner conjugate gradient solver
+  //
+
+  sb.get_inner_solver()->set_max_iterations
+   ( num_cg_iterations );
+  sb.get_inner_solver()->set_tc_tolerance( 1e-4 );
+  sb.get_inner_solver()->set_output_mode
+   ( cuCgSolver<_real,_real>::OUTPUT_WARNINGS );  
+
+  //
+  // Run split-Bregman solver
+  //
+
+  boost::shared_ptr< cuNDArray<_real> > 
+   sbresult = sb.solve(&data);
+
+  << do something with the result >>
+}</programlisting></para>
+
+          <para>The constrained Split Bregman solver inherits from the
+          unconstrained Split Bregman solver and is thus defined with an
+          identical interface.</para>
+        </sect3>
+      </sect2>
+    </sect1>
+
+    <sect1>
+      <title>Gadgetron Gadgets</title>
+
+      <para>Gadgets wrap the functionality of the toolboxes and provide
+      generic building blocks for configuring the streaming reconstruction in
+      the Gadgetron.</para>
+
+      <sect2 xml:id="sect.mrigadgets">
+        <title>MRI Gadgets</title>
+
+        <para>One of the original motivations for creating the Gadgetron was
+        to make a high throughput MRI reconstruction engine that could be
+        interfaced to different MRI vendor systems. Consequently, a lot of the
+        functionality present in the initial release toolboxes and Gadgets is
+        focused on MRI reconstruction. In this section we review the basic
+        data structures used to describe MRI data and list some of the MRI
+        Gadgets that are available. These Gadgets are used in several of the
+        example applications in <xref
+        linkend="sect.exampleapplications"/>.</para>
+
+        <sect3 xml:id="sect.mridatastructures">
+          <title>MRI Data Structures</title>
+
+          <para>MRI data is processed in two different phases. In the first
+          phase individual data (k-space) acquisitions are processed while in
+          the second phase these acquisitions have been combined into images
+          (which may still be in k-space). Correspondingly, there are two
+          different types of Gadgets that dominate the MRI Gadgets: those that
+          operate on individual acquisitions and those that operate on images.
+          Naturally, there are also transitional Gadgets that operate on
+          acquisitions but output images.</para>
+
+          <para>The data header structures used by these MRI Gadgets are
+          defined by the ISMRM Raw Data format (<uri
+          xlink:href="http://ismrmrd.sourceforge.net">http://ismrmrd.sourceforge.net</uri>).</para>
+
+          <para>Most MRI Gadgets inherit from <classname>Gadget2</classname>
+          as described in <xref linkend="sect.gadgets"/>, i.e. they operate on
+          two argument types. The two main base classes used are:</para>
+
+          <programlisting>Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > ></programlisting>
+
+          <para>As seen, they take a data array (which is typically of complex
+          float type) and a header describing either the acquisition or the
+          image. These headers are defined in <filename>ismrmrd.h</filename>
+          (from the ISMRM Raw Data format). The definition of
+          <classname>ISMRMRD::AcquisitionHeader</classname> looks like
+          (abbreviated):</para>
+
+          <programlisting>struct EncodingCounters {
+ uint16_t kspace_encode_step_1; 
+ uint16_t kspace_encode_step_2; 
+ uint16_t average;              
+ uint16_t slice;                
+ uint16_t contrast;             
+ uint16_t phase;                
+ uint16_t repetition;           
+ uint16_t set;                  
+ uint16_t segment;              
+ uint16_t user[8];              
+};
+
+struct AcquisitionHeader
+{
+ uint16_t           version;                        
+ uint64_t           flags;                          
+ uint32_t           measurement_uid;                
+ uint32_t           scan_counter;                   
+ uint32_t           acquisition_time_stamp;         
+ uint32_t           physiology_time_stamp[3];       
+ uint16_t           number_of_samples;              
+ uint16_t           available_channels;             
+ uint16_t           active_channels;                
+ uint64_t           channel_mask[16];               
+ uint16_t           discard_pre;                    
+ uint16_t           discard_post;                   
+ uint16_t           center_sample;                  
+ uint16_t           encoding_space_ref;             
+ uint16_t           trajectory_dimensions;          
+ float              sample_time_us;                 
+ float              position[3];                    
+ float              read_dir[3];                    
+ float              phase_dir[3];                    
+ float              slice_dir[3];                    
+ float              patient_table_position[3];      
+ EncodingCounters   idx;                            
+ int32_t            user_int[8];                    
+ float              user_float[8];                 
+};</programlisting>
+
+          <para>It is a simple struct, which mainly serves the purpose of
+          keeping track of a) the encoding properties of a given acquisition
+          (phase encoding number, etc.) and b) the spatial position and
+          orientation that the data was acquired from. Different MRI systems
+          have different conventions for how to label data, but in most cases
+          one would be able to convert to this format.</para>
+
+          <para>The <classname>ISMRMRD::ImageHeader</classname> data structure
+          is also just a struct for keeping track of image labels, position,
+          and orientation:</para>
+
+          <programlisting>struct ImageHeader
+{
+uint16_t            version;                        
+ uint64_t            flags;                         
+ uint32_t            measurement_uid;               
+ uint16_t            matrix_size[3];                
+ float               field_of_view[3];              
+ uint16_t            channels;                      
+ float               position[3];                   
+ float               read_dir[3];                    
+ float               phase_dir[3];                    
+ float               slice_dir[3];                    
+ float               patient_table_position[3];     
+ uint16_t            average;                       
+ uint16_t            slice;                         
+ uint16_t            contrast;                      
+ uint16_t            phase;                         
+ uint16_t            repetition;                    
+ uint16_t            set;                           
+ uint32_t            acquisition_time_stamp;        
+ uint32_t            physiology_time_stamp[3];      
+ uint16_t            image_data_type;               
+ uint16_t            image_type;                    
+ uint16_t            image_index;  
+ uint16_t            image_series_index;
+ int32_t             user_int[8];       
+ float               user_float[8];     
+};</programlisting>
+
+          <para/>
+        </sect3>
+
+        <sect3>
+          <title>List of available MRI Gadgets</title>
+
+          <para>This section contains a non-exhaustive list of available MRI
+          Gadgets with a few brief comments on their function. The purpose is
+          to make it easier to read the XML configuration files provided with
+          the Gadgetron and to give some ideas of what modules can be reused
+          in new reconstruction programs.</para>
+
+          <itemizedlist>
+            <listitem>
+              <para><classname>AccumulatorGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>Simple Gadget for accumulating k-space profiles in an
+              array and passing it on to next Gadget. Used for simple
+              Cartesian FT MRI reconstructions.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>AutoScaleGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>Does simple histogram analysis of floating point images
+              passing through and scales them. This is typically used upstream
+              of conversion from floating point to unsigned short
+              images.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>CoilReductionGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>Used to reduce the number of coils in a dataset. Typically
+              used to tune the performance of a given reconstruction by
+              eliminating data. This Gadget is commonly used in conjunction
+              with the <classname>PCACoilGadget</classname> which generates
+              virtual coils based on principal component analysis. The coil
+              reduction can be specified with either a mask or the number of
+              target coils as illustrated below</para>
+
+              <programlisting><gadget>
+ <name>CoilReduction</name>
+ <dll>gadgetroncore</dll>
+ <class>CoilReductionGadget</class>
+ <!-- Keep a max of 16 coils -->
+ <property><name>coils_out</name><value>16</value></property>
+</gadget>
+
+<gadget>
+ <name>CoilReduction</name>
+ <dll>gadgetroncore</dll>
+ <class>CoilReductionGadget</class>
+ <!-- Keep only coil 2,3,4 and discard the rest-->
+ <property>
+  <name>coil_mask</name>
+  <value>0 1 1 1 0 0 0 0</value>
+ </property>
+</gadget>
+</programlisting>
+            </listitem>
+
+            <listitem>
+              <para><classname>CropAndCombineGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>This Gadget is used to do a simple RMS coil combination in
+              the image domain and remove 2x oversampling in the first
+              dimension of the image as is commonly used in MRI. This Gadget
+              is intended to be used after FFT of the data.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>ExtractGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>This Gadget is used to extract a given component
+              (magnitude, real, imaginary, phase) from complex images, i.e. it
+              converts complex images to real images containing specific
+              components. The Gadget can be used to extract multiple
+              components using a mask. The bit fields used to define the
+              components are defined as:</para>
+
+              <programlisting>#define GADGET_EXTRACT_MAGNITUDE              (1 << 0) //1
+#define GADGET_EXTRACT_REAL                   (1 << 1) //2
+#define GADGET_EXTRACT_IMAG                   (1 << 2) //4
+#define GADGET_EXTRACT_PHASE                  (1 << 3) //8
+</programlisting>
+
+              <para>To specify the components, you just specify the mask, for
+              example, the following specification would extract magnitude (1)
+              and phase (8):</para>
+
+              <programlisting><gadget>
+ <name>Extract</name>
+ <dll>gadgetroncore</dll>
+ <class>ExtractGadget</class>
+ <property><name>extract_mask</name><value>9</value></property>
+</gadget>
+</programlisting>
+
+              <para>Default behavior is to extract magnitude.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>FFTGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>This Gadget Fourier transforms along the first 3
+              dimensions of the dataset (frequency, phase, partition encoding
+              directions) and passes on the data to the next Gadget.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>FloatToUShortGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>Converts floating point images to unsigned short images.
+              This Gadget would often be used in conjunction with a scaling
+              step (e.g. <classname>AutoScaleGadget</classname>) upstream to
+              ensure that the values will not get clipped or overflow during
+              the conversion to unsigned short. This Gadget does not make any
+              attempt to scale the data, it is assumed to be scaled upon
+              entry.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>GPUCGGoldenRadial</classname>,
+              <classname>GPUCGFixedRadial</classname>
+              (<filename>gadgetroncgsense</filename>):</para>
+
+              <para>These Gadgets perform conjugate gradient based
+              non-Cartesian SENSE reconstruction (<xref
+              linkend="sect.cgsense"/>). The reconstruction behavior can be
+              controlled with a number of properties:</para>
+
+              <programlisting><gadget>
+ <name>GPUCGRadial0</name>
+ <dll>gadgetroncgsense</dll>
+ <classname>GPUCGGoldenRadialGadget</classname>
+
+ <property>
+  <name>deviceno</name>
+  <value>0</value>
+ </property>
+ 
+ <property>
+  <name>sliceno</name>
+  <value>0</value>
+ </property>
+ 
+ <property>
+  <name>profiles_per_frame</name>
+  <value>32</value>
+ </property>
+ 
+ <property>
+  <name>shared_profiles</name>
+  <value>0</value>
+ </property>
+
+ <property>
+  <name>number_of_iterations</name>
+  <value>10</value>
+ </property>
+
+ <property>
+  <name>cg_limit</name>
+  <value>1e-6</value>
+ </property>
+
+ <property>
+  <name>oversampling</name>
+  <value>1.5</value>
+ </property>
+
+ <property>
+  <name>kernel_width</name>
+  <value>5.5</value>
+ </property>
+
+ <property>
+  <name>kappa</name>
+  <value>0.1</value>
+ </property>
+
+ <property>
+  <name>pass_on_undesired_data</name>
+  <value>true</value>
+ </property>
+
+</gadget>
+</programlisting>
+            </listitem>
+
+            <listitem>
+              <para><classname>GrappaGadget</classname>,
+              <classname>GrappaUnmixingGadget</classname>
+              (<filename>gadgetrongrappa</filename>):</para>
+
+              <para>These Gadgets are used together to perform 2D Cartesian
+              parallel imaging on the GPU. The
+              <classname>GrappaGadget</classname> is responsible for
+              calculating GRAPPA coefficients and the
+              <classname>GrappaUnmixingGadget</classname> Fourier transforms
+              the raw data and applies the coefficients. The
+              <classname>GrappaGadget</classname> has the ability to use
+              target channel compression, i.e. it can reconstruct using fewer
+              target channels than input channels to improve performance. See
+              <xref linkend="sect.grappa"/> for details. The target channel
+              compression is specified like this:</para>
+
+              <programlisting><gadget>
+ <name>Grappa</name>
+ <dll>gadgetrongrappa</dll>
+ <class>GrappaGadget</class>
+ <property><name>target_coils</name><value>8</value></property>
+</gadget>
+</programlisting>
+            </listitem>
+
+            <listitem>
+              <para><classname>ImageFinishGadgetSHORT</classname>,
+              <classname>ImageFinishFLOAT</classname>,
+              <classname>ImageFinishCPLX</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>These 3 Gadgets are all template instances of the same
+              <classname>ImageFinishGadget</classname>. The only difference
+              between them is that they operate on different types of image
+              data types as indicated by their names. Their purpose is to
+              return the reconstructed images to the output queue of the
+              Gadgetron so that they can be returned to the client.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>MRINoiseAdjustGadget</classname>
+              (<filename>gadgetronmricore</filename>):</para>
+
+              <para>The Gadgetron has two noise pre-whitening Gadgets with
+              similar names <classname>MRINoiseAdjustGadget</classname> and
+              <classname>NoiseAdjustGadget</classname>. They both perform the
+              same operation, which is to collect noise adjustment data when
+              present, calculate the noise decorrelation matrix, and perform
+              noise decorrelation (when the noise adjustment data is
+              available). The difference between the two Gadgets is that
+              <classname>MRINoiseAdjustGadget</classname> uses BLAS and LAPACK
+              routines to perform the operation, which makes it much faster
+              than the <classname>NoiseAdjustGadget</classname>. The latter
+              Gadget is provided to enable reconstruction on systems where
+              those libraries are not available.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>NoiseAdjustGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>See description of
+              <classname>MRINoiseAdjustGadget</classname>.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>PCACoilGadget</classname>
+              (<filename>gadgetronmricore</filename>):</para>
+
+              <para>This Gadget is used to create virtual channels based on
+              principal component analysis of a portion of the data.
+              Specifically, data is accumulated for the first frame (for each
+              location, i.e. slice) and a principal component analysis is done
+              of this data. Once the PCA coefficients are available, all
+              subsequent data will be transformed into the virtual channel
+              domain and passed on down the Gadget chain. This Gadget is often
+              combined with the
+              <classname>CoilReductionGadget</classname>.</para>
+            </listitem>
+
+            <listitem>
+              <para><classname>RemoveROOversamplingGadget</classname>
+              (<filename>gadgetroncore</filename>):</para>
+
+              <para>Removes the 2x oversampling often used in the readout
+              direction for (Cartesian) MRI.</para>
+            </listitem>
+          </itemizedlist>
+        </sect3>
+      </sect2>
+
+      <sect2>
+        <title>Python Gadgets</title>
+
+        <para>The Gadgetron provides a mechanism to do prototype development
+        in Python. Again, we use MRI as the example application.</para>
+
+        <para>The Python layer is accessed through a set of Python Gadgets
+        that can encapsulate a Python module. This is seen in <xref
+        linkend="fig.pythonoverview"/>, which illustrates a part of a Gadget
+        chain with two Python Gadgets and one C/C++ Gadget. A Gadget chain can
+        have any number of Python Gadgets and Python Gadgets can be mixed with
+        C++ Gadgets.</para>
+
+        <figure xml:id="fig.pythonoverview">
+          <title>Overview of Python Prototyping</title>
+
+          <mediaobject>
+            <imageobject>
+              <imagedata fileref="figs/python.png" format="PNG" width="5in"/>
+            </imageobject>
+          </mediaobject>
+        </figure>
+
+        <para>The Python modules that are encapsulated in the Python Gadgets
+        are expected to have certain characteristics. Specifically, the
+        modules must have at least 3 functions and these functions will be
+        called by the Gadgetron framework at certain specific times:</para>
+
+        <orderedlist>
+          <listitem>
+            <para><emphasis>Gadget reference function</emphasis>. A specific
+            function will be called when the Python Gadget is created. This
+            function is expected to receive a
+            <classname>GadgetReference</classname> which is a class (wrapped
+            in a Python module), which holds a reference to the Gadget, which
+            owns the Python module. The purpose of passing this reference is
+            to allow the Python module to return data to the Gadget when
+            reconstruction outputs are ready. See below for details.</para>
+          </listitem>
+
+          <listitem>
+            <para><emphasis>Configuration function</emphasis>. This function
+            is used to receive the configuration (usually in XML format), when
+            it is passed to the Gadget, i.e. it is the Python equivalent of
+            <function>process_config</function> in the Gadget (see <xref
+            linkend="sect.gadgets"/>).</para>
+          </listitem>
+
+          <listitem>
+            <para><emphasis>Reconstruction function</emphasis>. This function
+            is called when the Gadget receives data, i.e. it is the Python
+            equivalent of the <function>process</function> function in the
+            Gadget (see <xref linkend="sect.gadgets"/>).</para>
+          </listitem>
+        </orderedlist>
+
+        <para>The user can choose the names of these functions freely in the
+        Python module, but the function names must be specified when the
+        Gadget is inserted in the XML configuration:</para>
+
+        <programlisting><gadget>
+ <name>AccReconPython</name>
+ <dll>gadgetronpython</dll>
+ <class>AcquisitionPythonGadget</class>
+
+ <property>
+  <name>python_path</name>
+  <value>/home/myuser/scripts/python</value>
+ </property>
+
+ <property>
+  <name>python_module</name>
+  <value>accumulate_and_recon</value>
+ </property>
+
+ <property>
+  <name>gadget_reference_function</name>
+  <value>set_gadget_reference</value>
+ </property>
+
+ <property>
+  <name>input_function</name>
+  <value>recon_function</value>
+ </property>
+
+ <property>
+  <name>config_function</name>
+  <value>config_function</value>
+ </property>
+</gadget>
+
+</programlisting>
+
+        <para>Notice how the 3 function names are specified through the
+        <varname>gadget_reference_function</varname>,
+        <varname>input_function</varname>, and
+        <varname>config_function</varname> parameter names. Also notice that
+        it is possible to specify a <varname>python_path</varname> to let the
+        Python interpreter know where to search for scripts. By default, the
+        <filename>gadgetron/lib</filename> is added to the search path.
+        Multiple pathnames can be added by separating the paths with
+        "<filename>;</filename>".</para>
+
+        <para>The Python script referenced in the XML configuration above
+        could look like this:</para>
+
+        <programlisting>import numpy as np
+import GadgetronPythonMRI as g
+import kspaceandimage as ki
+import libxml2
+
+myLocalGadgetReference = g.GadgetReference()
+myBuffer = 0
+myParameters = 0
+myCounter = 1;
+mySeries = 1;
+
+def set_gadget_reference(gadref):
+    global myLocalGadgetReference
+    myLocalGadgetReference = gadref
+
+def config_function(conf):
+    global myBuffer
+    global myParameters
+
+    myParameters = dict()
+
+    doc = libxml2.parseDoc(str(conf))
+    context = doc.xpathNewContext()
+    context.xpathRegisterNs("ismrm", "http://www.ismrm.org/ISMRMRD")
+    myParameters["matrix_x"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodedSpace/ismrm:matrixSize/ismrm:x")[0]).content)
+    myParameters["matrix_y"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodedSpace/ismrm:matrixSize/ismrm:y")[0]).content)
+    myParameters["matrix_z"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodedSpace/ismrm:matrixSize/ismrm:z")[0]).content)
+    myParameters["channels"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:acquisitionSystemInformation/ismrm:receiverChannels")[0]).content)
+    myParameters["slices"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodingLimits/ismrm:slice/ismrm:maximum")[0]).content)+1
+    myParameters["center_line"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodingLimits/ismrm:kspace_encoding_step_1/ismrm:center")[0]).content)
+
+    myBuffer = (np.zeros((myParameters["channels"],myParameters["slices"],myParameters["matrix_z"],myParameters["matrix_y"],(myParameters["matrix_x"]>>1)))).astype('complex64')
+
+def recon_function(acq, data):
+    global myLocalGadgetReference
+    global myBuffer
+    global myParameters
+    global myCounter
+    global mySeries
+
+    line_offset = (myParameters["matrix_y"]>>1)-myParameters["center_line"];
+    myBuffer[:,acq.idx.slice,acq.idx.kspace_encode_step_2,acq.idx.kspace_encode_step_1+line_offset,:] = data
+    
+    if (acq.flags & (1<<7)): #Is this the last scan in slice
+        image = ki.ktoi(myBuffer,(2,3,4))
+        image = image * np.product(image.shape)*100 #Scaling for the scanner
+        #Create a new image header and transfer value
+        img_head = g.ImageHeader()
+        img_head.channels = acq.active_channels
+        img_head.slice = acq.idx.slice
+        g.img_set_matrix_size(img_head, 0, myBuffer.shape[4])
+        g.img_set_matrix_size(img_head, 1, myBuffer.shape[3])
+        g.img_set_matrix_size(img_head, 2, myBuffer.shape[2])
+        g.img_set_position(img_head, 0,g.acq_get_position(acq,0))
+        g.img_set_position(img_head, 1,g.acq_get_position(acq,1))
+        g.img_set_position(img_head, 2,g.acq_get_position(acq,2))
+        g.img_set_read_dir(img_head, 0,g.acq_get_read_dir(acq,0))
+        g.img_set_read_dir(img_head, 1,g.acq_get_read_dir(acq,1))
+        g.img_set_read_dir(img_head, 2,g.acq_get_read_dir(acq,2))
+        g.img_set_phase_dir(img_head, 0,g.acq_get_phase_dir(acq,0))
+        g.img_set_phase_dir(img_head, 1,g.acq_get_phase_dir(acq,1))
+        g.img_set_phase_dir(img_head, 2,g.acq_get_phase_dir(acq,2))
+        g.img_set_slice_dir(img_head, 0,g.acq_get_slice_dir(acq,0))
+        g.img_set_slice_dir(img_head, 1,g.acq_get_slice_dir(acq,1))
+        g.img_set_slice_dir(img_head, 2,g.acq_get_slice_dir(acq,2))
+        g.img_set_patient_table_position(img_head, 0, g.acq_get_patient_table_position(acq,0))
+        g.img_set_patient_table_position(img_head, 1, g.acq_get_patient_table_position(acq,1))
+        g.img_set_patient_table_position(img_head, 2, g.acq_get_patient_table_position(acq,2))
+        img_head.acquisition_time_stamp = acq.acquisition_time_stamp
+        img_head.image_index = myCounter;
+        img_head.image_series_index = mySeries;
+
+        myCounter = myCounter + 1
+        if (myCounter > 5):
+            mySeries = mySeries + 1
+            myCounter = 1
+
+        #Return image to Gadgetron
+        return myLocalGadgetReference.return_image(img_head,image.astype('complex64'))
+
+        #print "Returning to Gadgetron"
+        return 0 #Everything OK
+
+</programlisting>
+
+        <para>There is a lot going on in this script. Let us walk through the
+        different parts and add some explanation. First look at the
+        imports:</para>
+
+        <programlisting>import numpy as np
+import GadgetronPythonMRI as g
+import GadgetronXML
+import kspaceandimage as ki</programlisting>
+
+        <para>All the Python Gadget modules must include
+        <filename>numpy</filename>. The arrays
+        (<classname>NDArray</classname>) are passed to the Python module as
+        <filename>numpy</filename> arrays. The second module
+        <filename>GadgetronPythonMRI</filename> is a Python wrapped version of
+        some of the data structures used in the MRI part of the Gadgetron (see
+        <xref linkend="sect.mrigadgets"/>). Specifically, the
+        <classname>ISMRMRD::AcquisitionHeader</classname> and
+        <classname>ISMRMRD::ImageHeader</classname> headers are wrapped as
+        Python types (using Boost Python). The
+        <filename>GadgetronPythonMRI</filename> also contains a wrapped
+        version of the <classname>GadgetReference</classname> class:</para>
+
+        <programlisting>class GadgetReference
+{
+
+ public:
+  GadgetReference();
+  ~GadgetReference();
+  
+  int set_gadget(Gadget* g)
+  {
+    gadget_ = g;
+    return 0;
+  }
+
+  template<class T> int return_data(T header, 
+          boost::python::numeric::array arr);
+
+  int return_acquisition(ISMRMRD::AcquisitionHeader acq, 
+          boost::python::numeric::array arr);
+
+  int return_image(ISMRMRD::ImageHeader img, 
+          boost::python::numeric::array arr);
+
+ protected:
+  Gadget* gadget_;
+
+};
+</programlisting>
+
+        <para>Using the return functions in this class interface, it is
+        possible for the Python module to return data to the Gadget.
+        <filename>GadgetronXML</filename> is a Python module provided with the
+        Gadgetron, which contains some XML helper functions that can (it is
+        not a requirement) be used to parse the XML parameters that the module
+        will receive from <function>process_config</function>.
+        <filename>kspaceandimage</filename> is also a python module provided
+        with the Gadgetron, it contains some simple wrapper functions for
+        performing Fourier transforms (to and from k-space) of MRI data. The
+        following section contains some initialization of global variables in
+        the Python module:</para>
+
+        <programlisting>myRef = g.GadgetReference()
+myBuffer = 0
+myParameters = 0
+myCounter = 1;
+mySeries = 1;</programlisting>
+
+        <para>As described above, each Python module must contain at least 3
+        functions corresponding to the 3 entry points from the Gadgetron
+        framework. The first one of these functions captures the
+        <classname>GadgetReference</classname>:</para>
+
+        <programlisting>def set_gadget_reference(gadref):
+    global myLocalGadgetReference
+    myLocalGadgetReference = gadref
+</programlisting>
+
+        <para>Using this reference, the Python module will be able to return
+        images (or acquisitions) to the Gadget. The next function
+        (<function>config_function</function>) processes the configuration data
+        and finally, the <function>recon_function</function> simply takes the
+        data as it comes in and stores it in a buffer. Based on the
+        <varname>flags</varname> field in the header, it is determined when
+        the last acquisition in each slice has arrived. As this happens the
+        buffer is Fourier transformed, an image header is populated, and the
+        result is returned (via the <classname>GadgetReference</classname>) to
+        the Gadgetron where it will be processed by the next Gadget in the
+        chain.</para>
+
+        <para>The Gadgetron distribution comes with a simple Python-based 2D
+        FT MRI reconstruction. The Gadget chain configuration for this
+        reconstruction can be found in
+        <filename>gadgets/python/python.xml</filename>.</para>
+      </sect2>
+
+      <sect2 xml:id="sect.makingnewgadgetlibrary">
+        <title>Making a new Gadget Library</title>
+
+        <para>The easiest way to get started making a new Gadget library is to
+        follow an example. In this example we create a new Gadget library
+        containing a single Gadget: <classname>ThresholdGadget</classname>.
+        Its purpose is to set all values below a certain fraction of the max
+        value to zero.</para>
+
+        <para>New Gadget libraries can either be created in the Gadgetron
+        source tree, which allows easy access to all the other files in the
+        Gadgetron, or they can be made as external libraries that link against
+        an installed Gadgetron system. In this example we do the latter since
+        this creates a new library that does not "taint" the Gadgetron source
+        tree. It is trivial to move the library inside the Gadgetron source
+        tree at some later point in time if desired. We assume that the
+        Gadgetron is installed on the machine that you are working on. The
+        command line entries, etc. correspond to a Linux console. If you are
+        using Windows you have to adjust a bit.</para>
+
+        <para>Start by creating a new folder for the library:</para>
+
+        <screen><prompt>user at mycomputer:~/temp$</prompt> <userinput>mkdir gadgetron_examplelib</userinput>
+user at mycomputer:~/temp$ <userinput>cd gadgetron_examplelib</userinput></screen>
+
+        <para>We start by creating the class
+        <classname>ThresholdGadget</classname>. Create the following 3 files:
+        <filename>ThresholdGadget.h</filename>,
+        <filename>ThresholdGadget.cpp</filename>,
+        <filename>examplelib_export.h</filename> (the last file is just to
+        help us make sure that things work on Windows) with the following
+        content:</para>
+
+        <programlisting>//ThresholdGadget.h
+
+#ifndef THRESHOLDGADGET_H
+#define THRESHOLDGADGET_H
+
+#include "examplelib_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include <complex>
+
+class EXPORTGADGETSEXAMPLE ThresholdGadget : 
+public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+{
+ public:
+  GADGET_DECLARE(ThresholdGadget)
+
+ protected:
+  virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+       GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+  virtual int process_config(ACE_Message_Block* mb);
+
+  float threshold_level_;
+
+};
+
+#endif //THRESHOLDGADGET_H</programlisting>
+
+        <programlisting>//ThresholdGadget.cpp
+
+#include "ThresholdGadget.h"
+
+int ThresholdGadget::process_config(ACE_Message_Block* mb) 
+{
+  threshold_level_ = get_double_value("level");
+  if (threshold_level_ == 0.0) {
+    threshold_level_ = 1.0;
+  }
+
+  return GADGET_OK;
+}
+
+int ThresholdGadget::process( 
+   GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+  std::complex<float>* d = 
+    m2->getObjectPtr()->get_data_ptr();
+
+  unsigned long int elements =  
+    m2->getObjectPtr()->get_number_of_elements();
+
+  //First find max
+  float max = 0.0;
+  for (unsigned long int i = 0; i < elements; i++) {
+    if (abs(d[i]) > max) {
+      max = abs(d[i]);
+    }
+  }
+
+  //Now threshold
+  for (unsigned long int i = 0; i < elements; i++) {
+    if (abs(d[i]) < threshold_level_*max) {
+      d[i] = std::complex<float>(0.0,0.0);
+    }
+  }
+
+  //Now pass on image
+  if (this->next()->putq(m1) < 0) {
+     return GADGET_FAIL;
+  }
+
+  return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(ThresholdGadget)</programlisting>
+
+        <programlisting>//examplelib_export.h
+
+#ifndef EXAMPLE_EXPORT_H_
+#define EXAMPLE_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (gadgetronexamplelib_EXPORTS)
+#define EXPORTGADGETSEXAMPLE __declspec(dllexport)
+#else
+#define EXPORTGADGETSEXAMPLE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSEXAMPLE
+#endif
+
+#endif /* EXAMPLE_EXPORT_H_ */
+</programlisting>
+
+        <para>Now that we have the files for the Gadget we need to set up the
+        build environment. In the folder
+        <filename>gadgetron_examplelib</filename> create a file called
+        <filename>CMakeLists.txt</filename> with the following content:</para>
+
+        <programlisting>cmake_minimum_required(VERSION 2.6)
+
+project(examplelib)
+
+if (WIN32)
+ADD_DEFINITIONS(-DWIN32 -D_WIN32 -D_WINDOWS)
+ADD_DEFINITIONS(-DUNICODE -D_UNICODE)
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3")
+endif (WIN32)
+
+###############################################################
+#Bootstrap search for libraries 
+# (We need to find cmake modules in Gadgetron)
+###############################################################
+find_path(GADGETRON_CMAKE_MODULES FindGadgetron.cmake HINTS
+$ENV{GADGETRON_HOME}/cmake
+/usr/local/gadgetron)
+
+if (NOT GADGETRON_CMAKE_MODULES)
+  MESSAGE(FATAL_ERROR "GADGETRON_CMAKE_MODULES cannot be found. 
+   Try to set GADGETRON_HOME environment variable.")
+endif(NOT GADGETRON_CMAKE_MODULES)
+
+set(CMAKE_MODULE_PATH ${GADGETRON_CMAKE_MODULES})
+###############################################################
+
+find_package(Gadgetron REQUIRED)
+find_package(Boost REQUIRED)
+find_package(ACE REQUIRED)
+
+set(CMAKE_INSTALL_PREFIX ${GADGETRON_HOME})
+
+INCLUDE_DIRECTORIES(${ACE_INCLUDE_DIR} 
+     ${Boost_INCLUDE_DIR}
+     ${GADGETRON_INCLUDE_DIR})
+
+LINK_DIRECTORIES(${GADGETRON_LIB_DIR})
+
+ADD_LIBRARY(gadgetronexamplelib SHARED ThresholdGadget.cpp)
+
+TARGET_LINK_LIBRARIES(gadgetronexamplelib 
+                      hondarray 
+                      optimized ${ACE_LIBRARIES} 
+                      debug ${ACE_DEBUG_LIBRARY})
+
+INSTALL (FILES ThresholdGadget.h
+         examplelib_export.h
+         DESTINATION include)
+
+INSTALL(TARGETS gadgetronexamplelib DESTINATION lib)
+
+INSTALL(FILES threshold.xml DESTINATION config)
+</programlisting>
+
+        <para>The last thing we need is the XML configuration file to use when
+        running our new <classname>ThresholdGadget</classname>. In the same
+        folder create the <filename>threshold.xml</filename> file:</para>
+
+        <programlisting><?xml version="1.0" ?>
+<gadgetronStreamConfiguration 
+  xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+  xmlns="http://gadgetron.sf.net/gadgetron"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetroncore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetroncore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetroncore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetroncore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+
+    <!-- This is where we insert our new Gadget -->
+    <gadget>
+      <name>Threshold</name>
+      <dll>gadgetronexamplelib</dll>
+      <classname>ThresholdGadget</classname>
+      <property><name>level</name><value>0.25</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetroncore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetroncore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
+</programlisting>
+
+        <para>Check that you have 5 files in your folder:</para>
+
+        <screen><prompt>user at mycomputer:gadgetron_examplelib$</prompt> <userinput>ls</userinput>
+CMakeLists.txt
+ThresholdGadget.cpp
+ThresholdGadget.h
+examplelib_export.h
+threshold.xml
+</screen>
+
+        <para>Next, let us create a <filename>build</filename> directory and
+        compile:</para>
+
+        <screen><prompt>user at mycomputer:gadgetron_examplelib$</prompt> <userinput>mkdir build; cd build</userinput></screen>
+
+        <para>In the <filename>build</filename> folder</para>
+
+        <screen><prompt>user at mycomputer:build$</prompt> <userinput>cmake ../</userinput></screen>
+
+        <para>Assuming the <application>cmake</application> process was
+        successful:</para>
+
+        <screen><prompt>user at mycomputer:build$</prompt> <userinput>make</userinput> 
+Scanning dependencies of target gadgetronexamplelib
+
+[100%] Building CXX object \
+    CMakeFiles/gadgetronexamplelib.dir/ThresholdGadget.cpp.o
+
+Linking CXX shared library libgadgetronexamplelib.so
+[100%] Built target gadgetronexamplelib
+
+<prompt>user at mycomputer:build$</prompt> <userinput>make install</userinput>
+[100%] Built target gadgetronexamplelib
+Install the project...
+-- Install configuration: ""
+-- Up-to-date: /usr/local/gadgetron/include/ThresholdGadget.h
+-- Up-to-date: /usr/local/gadgetron/include/examplelib_export.h
+-- Installing: /usr/local/gadgetron/lib/libgadgetronexamplelib.so
+-- Up-to-date: /usr/local/gadgetron/config/threshold.xml</screen>
+
+        <para>You may have to use <application>sudo</application> for the
+        <command>make install</command> command depending on your
+        setup.</para>
+
+        <para>You should now be able to run a reconstruction using your new
+        reconstruction chain. Follow the instructions in <xref
+        linkend="sect.simpleexample"/> if you have not yet tried to run a
+        simple reconstruction. After having started up the Gadgetron, run the
+        <application>mriclient</application>:</para>
+
+        <screen>user at mycomputer:~/temp/test_data$ <userinput>mriclient \
+    -d gadgetron_testdata.h5 \
+    -c threshold.xml</userinput>
+
+Gadgetron MRI Data Sender
+  -- host            :      localhost
+  -- port            :      9002
+  -- hdf5 file  in   :      gadgetron_testdata.h5
+  -- hdf5 group in   :      simple_gre
+  -- conf            :      threshold.xml
+  -- loop            :      1
+  -- hdf5 file out   :      ./out.h5
+  -- hdf5 group out  :      2012-05-11 12:52:14
+(31540|140170355443520) Connection from 127.0.0.1:9002
+31540, 81, GadgetronConnector, Close Message received
+(31540|140170283570944) Handling close...
+(31540|140170283570944) svc done...
+(31540|140170283570944) Handling close...</screen>
+
+        <para>If you run it again with the <varname>level</varname> parameter
+        set to 0.00000001 (remember to re-install the
+        <filename>threshold.xml</filename> file in
+        <filename>gadgetron/config</filename> by running <command>make
+        install</command>):</para>
+
+        <programlisting>    <gadget>
+      <name>Threshold</name>
+      <dll>gadgetronexamplelib</dll>
+      <classname>ThresholdGadget</classname>
+      <property><name>level</name><value>0.00000001</value></property>
+    </gadget>
+</programlisting>
+
+        <para>You should get two different results that look something like
+        <xref linkend="fig.examplelib"/>.</para>
+
+        <figure xml:id="fig.examplelib">
+          <title>Result from <classname>ThresholdGadget</classname>
+          experiment</title>
+
+          <mediaobject>
+            <imageobject>
+              <imagedata fileref="figs/examplelibresult.png" format="PNG"
+                         width="6in"/>
+            </imageobject>
+          </mediaobject>
+        </figure>
+
+        <para>If you create interesting Gadget libraries please consider
+        publishing them online to the benefit of the reconstruction community.
+        An easy way to do this is by sending them to the Gadgetron team for us
+        to publish right away on the web and possibly include in a future
+        release of the Gadgetron.</para>
+      </sect2>
+    </sect1>
+
+    <sect1>
+      <title>Gadgetron Clients</title>
+
+      <sect2>
+        <title>Available Clients</title>
+
+        <para>The purpose of this section is to maintain a list of the
+        available clients that are included in the Gadgetron distribution. The
+        currently available clients are:</para>
+
+        <itemizedlist>
+          <listitem>
+            <para><application>mriclient</application>:</para>
+
+            <para>This is the standard client for sending MRI data to the
+            Gadgetron using the ISMRM Raw Data format. In order to get usage
+            information for the client, simply run the client with no
+            arguments.</para>
+          </listitem>
+        </itemizedlist>
+      </sect2>
+
+      <sect2>
+        <title>Making a new Client</title>
+
+        <para>The Gadgetron distribution comes with a
+        <classname>GadgetronConnector</classname> class, which can be used to
+        create clients. An example <filename>main.cpp</filename> file for a
+        client could look like:</para>
+
+        <programlisting>
+#include "GadgetMessageInterface.h"
+#include "GadgetronConnector.h"
+
+int main(int argc, char** argv)
+{
+
+  std::string host_name("localhost");
+  std::string port("9002");
+  std::string config_file("threshold.xml");
+  std::string xml_config;
+
+  //Generate some XML configuration in xml_config
+
+  GadgetronConnector con;
+
+  //Register Readers and Writers
+  con.register_writer(....);
+  con.register_reader(....);
+  con.register_reader(....);
+
+  //Open a connection with the gadgetron
+  if (con.open(host_name, port) != 0) {
+    //Deal with errors
+  }
+
+  //Tell Gadgetron which XML configuration to run.
+  if (con.send_gadgetron_configuration_file(config_file) != 0) {
+    //Deal with errors
+  }
+
+  if (con.send_gadgetron_parameters(xml_config) != 0) {
+     //Deal with errors
+  }
+
+
+  //Send data
+  while ( .... ) { //some condition
+    GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+      new GadgetContainerMessage<GadgetMessageIdentifier>();
+      
+      //Create data and add to m1
+
+      if (con.putq(m1) == -1) {
+         //Deal with errors
+      }
+  }
+
+  //Put a close package on the queue
+
+  GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+    new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+  m1->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+  if (con.putq(m1) == -1) {
+   //Deal with errors
+  }
+
+  con.wait(); //Wait for recon to finish
+
+  return 0;
+}</programlisting>
+
+        <para>To compile this client, create a
+        <application>cmake</application> file:</para>
+
+        <programlisting>cmake_minimum_required(VERSION 2.6)
+
+project(exampleclient)
+
+if (WIN32)
+ADD_DEFINITIONS(-DWIN32 -D_WIN32 -D_WINDOWS)
+ADD_DEFINITIONS(-DUNICODE -D_UNICODE)
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W3")
+endif (WIN32)
+
+###############################################################
+#Bootstrap search for libraries 
+# (We need to find cmake modules in Gadgetron)
+###############################################################
+find_path(GADGETRON_CMAKE_MODULES FindGadgetron.cmake HINTS
+$ENV{GADGETRON_HOME}/cmake
+/usr/local/gadgetron)
+
+if (NOT GADGETRON_CMAKE_MODULES)
+  MESSAGE(FATAL_ERROR "GADGETRON_CMAKE_MODULES cannot be found. 
+   Try to set GADGETRON_HOME environment variable.")
+endif(NOT GADGETRON_CMAKE_MODULES)
+
+set(CMAKE_MODULE_PATH ${GADGETRON_CMAKE_MODULES})
+###############################################################
+
+find_package(Gadgetron REQUIRED)
+find_package(Boost REQUIRED)
+find_package(ACE REQUIRED)
+
+set(CMAKE_INSTALL_PREFIX ${GADGETRON_HOME})
+
+INCLUDE_DIRECTORIES(${ACE_INCLUDE_DIR} 
+     ${Boost_INCLUDE_DIR}
+     ${GADGETRON_INCLUDE_DIR})
+
+LINK_DIRECTORIES(${GADGETRON_LIB_DIR})
+
+add_executable(mygadgetronclient main.cpp)
+
+target_link_libraries(mygadgetronclient 
+      optimized gadgettools debug gadgettools${CMAKE_DEBUG_SUFFIX}
+      tinyxml 
+      optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY})
+
+install(TARGETS mygadgetronclient DESTINATION bin)
+</programlisting>
+
+        <para>Run <application>cmake</application> and follow the normal
+        <command>make</command> and <command>make install</command>
+        instructions (see <xref
+        linkend="sect.makingnewgadgetlibrary"/>).</para>
+      </sect2>
+    </sect1>
+  </chapter>
+
+  <chapter xml:id="sect.exampleapplications">
+    <title>Gadgetron Applications</title>
+
+    <sect1 xml:id="sect.2dftexample">
+      <title>Basic 2D FFT MRI</title>
+
+      <para>A basic example application in the Gadgetron is a simple 2D FT MRI
+      reconstruction. It receives 2D MRI data, collects it into k-space
+      arrays, performs FFT of the data, combines channels (if there are
+      multiple), and returns the images to the client. This example is
+      included in the Gadgetron for testing and demonstration purposes only.
+      It was not intended to be fast or otherwise optimal in any sense.</para>
+
+      <para>The Gadgets for this reconstruction are in the
+      <filename>core</filename> folder and the configuration file to use to
+      run this reconstruction is <filename>default.xml</filename>. The section
+      <xref linkend="sect.simpleexample"/> describes how to run a simple
+      reconstruction using this Gadget chain and how to download data to test
+      it.</para>
+
+      <para>In this section we will take a closer look at the Gadgets in this
+      chain and how they are implemented. The Gadgetron XML configuration file
+      (<filename>default.xml</filename>) looks like this:</para>
+
+      <programlisting><?xml version="1.0" ?>  
+<gadgetronStreamConfiguration 
+  xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+  xmlns="http://gadgetron.sf.net/gadgetron"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetroncore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetroncore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetroncore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetroncore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetroncore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetroncore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetroncore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
+</programlisting>
+
+      <para>The resulting Gadget chain is illustrated in <xref
+      linkend="fig.simple2dft"/>. As described in <xref
+      linkend="sect.streamconfiguration"/> the Gadgetron configuration
+      contains 3 sections: Readers, Writers, and the Stream. In this
+      particular case, there is only one Reader, which receives MRI
+      Acquisitions. This data format is described in <xref
+      linkend="sect.mrigadgets"/>. There are 3 Writers registered with this
+      configuration. They are all used to write MRI images, but responsible
+      for the different data types (complex float, float, or unsigned short).
+      In principle this means that this reconstruction is capable of returning
+      3 different types of images, but as is seen from the stream
+      configuration, the only output from this reconstruction will be float
+      format images. However, many reconstructions will have all 3 Writers
+      registered to make it easy to switch formats, i.e. it would be trivial
+      to turn this reconstruction into one that outputs unsigned short images
+      (have a look at the file <filename>default_short.xml</filename> for an
+      example of how this is done).</para>
+
+      <figure xml:id="fig.simple2dft">
+        <title>Simple 2D FT Reconstruction Chain</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/simple2dft.png" width="4in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <para>As is seen in the Gadgets section of the configuration, this
+      reconstruction uses 5 Gadgets. The first Gadget is responsible for
+      accumulating MRI acquisitions. To accomplish this, it uses an
+      accumulation buffer. When a k-space line arrives at the Gadget, it will
+      be inserted into the k-space buffer and when the last acquisition in a
+      slice/repetition has arrived, it will copy the entire buffer and pass it
+      on to the next Gadget.</para>
+
+      <para>Let's have a look at the definition of the
+      <classname>AccumulatorGadget</classname> class:</para>
+
+      <programlisting>class EXPORTGADGETSCORE AccumulatorGadget : 
+public Gadget2< ISMRMRD::AcquisitionHeader, 
+                hoNDArray< std::complex<float> > >
+{
+  
+ public:
+  GADGET_DECLARE(AccumulatorGadget);
+
+  AccumulatorGadget();
+  ~AccumulatorGadget();
+
+ protected:
+  virtual int process_config(ACE_Message_Block* mb);
+  virtual int process(
+    GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+    GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+
+  hoNDArray< std::complex<float> >* buffer_;
+  std::vector<unsigned int> dimensions_;
+
+  int image_counter_;
+  int image_series_;
+
+};
+</programlisting>
+
+      <para>There are a few member variables to help us keep track of the
+      buffer and the data dimensions and the core functionality is implemented
+      in two functions: <function>process_config</function>, which is used to set up
+      the buffer, and <function>process</function>, which is responsible for
+      the accumulation of data. Let us examine the
+      <function>process_config</function> function (abbreviated):</para>
+
+      <programlisting linenumbering="numbered">int AccumulatorGadget::process_config(ACE_Message_Block* mb)
+{
+ boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = 
+    parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+ ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+
+ ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+ ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+ ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+ GADGET_DEBUG2("Matrix size: %d, %d, %d\n", 
+                e_space.matrixSize().x(), 
+                e_space.matrixSize().y(), 
+                e_space.matrixSize().z());
+
+ dimensions_.push_back(e_space.matrixSize().x());
+ dimensions_.push_back(e_space.matrixSize().y());
+ dimensions_.push_back(e_space.matrixSize().z());
+
+ slices_ = e_limits.slice().present() ? 
+             e_limits.slice().get().maximum()+1 : 1;
+
+  return GADGET_OK;
+}</programlisting>
+
+      <para>The main purpose of this function is to pull parameters out of the
+      XML portion of the ISMRM Raw Data header in order to set up the buffer.
+      As mentioned in <xref linkend="sect.xmlparameters"/>, the convention is
+      to pass parameters into the Gadgets in XML format. To enable convenient
+      parsing of these parameters, the ISMRMRD library includes a C++ class
+      representation of the header. See
+      <uri>http://ismrmrd.sourceforge.net</uri> for more details.</para>
+
+      <para>Now we are ready to receive and buffer data, which is done by the
+      <function>process</function> function:</para>
+
+      <programlisting>int AccumulatorGadget::
+process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+ GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+  if (!buffer_) {
+   dimensions_.push_back(m1->getObjectPtr()->active_channels);
+   dimensions_.push_back(slices_);
+
+   if (!(buffer_ = new hoNDArray< std::complex<float> >())) {
+    GADGET_DEBUG1("Failed create buffer\n");
+    return GADGET_FAIL;
+   }
+
+   if (!buffer_->create(&dimensions_)) {
+    GADGET_DEBUG1("Failed allocate buffer array\n");
+    return GADGET_FAIL;
+   }
+
+   image_series_ = this->get_int_value("image_series");
+
+  }
+
+
+  std::complex<float>* b =
+    buffer_->get_data_ptr();
+
+  std::complex<float>* d =
+    m2->getObjectPtr()->get_data_ptr();
+
+  int samples =  m1->getObjectPtr()->number_of_samples;
+  int line = m1->getObjectPtr()->idx.kspace_encode_step_1;
+  int partition = m1->getObjectPtr()->idx.kspace_encode_step_2;
+  int slice = m1->getObjectPtr()->idx.slice;
+
+  if (samples > static_cast<int>(dimensions_[0])) {
+   GADGET_DEBUG1("Wrong number of samples received\n");
+   return GADGET_FAIL;
+  }
+
+  size_t offset= 0;
+  //Copy the data for all the channels
+  for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+    offset = 
+      slice*dimensions_[0]*dimensions_[1]*dimensions_[2]*dimensions_[3] +
+      c*dimensions_[0]*dimensions_[1]*dimensions_[2] +
+      partition*dimensions_[0]*dimensions_[1] +
+      line*dimensions_[0] + (dimensions_[0]>>1)-m1->getObjectPtr()->center_sample;
+    
+    memcpy(b+offset,
+     d+c*samples,
+     sizeof(std::complex<float>)*samples);
+  }
+  
+  bool is_last_scan_in_slice = 
+     ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+  
+  if (is_last_scan_in_slice) {
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = 
+      new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+    
+    cm1->getObjectPtr()->flags = 0;
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >* cm2 = 
+      new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+    
+    cm1->cont(cm2);
+    
+    std::vector<unsigned int> img_dims(4);
+    img_dims[0] = dimensions_[0];
+    img_dims[1] = dimensions_[1];
+    img_dims[2] = dimensions_[2];
+    img_dims[3] = dimensions_[3];
+    
+    if (!cm2->getObjectPtr()->create(&img_dims)) {
+      GADGET_DEBUG1("Unable to allocate new image array\n");
+      cm1->release();
+      return -1;
+    }
+    
+    size_t data_length = dimensions_[0]*dimensions_[1]*
+      dimensions_[2]*dimensions_[3];
+    
+    offset = slice*data_length;
+    
+    memcpy(cm2->getObjectPtr()->get_data_ptr(),b+offset,
+    sizeof(std::complex<float>)*data_length);
+    
+    cm1->getObjectPtr()->matrix_size[0]     = img_dims[0];
+    cm1->getObjectPtr()->matrix_size[1]     = img_dims[1];
+    cm1->getObjectPtr()->matrix_size[2]     = img_dims[2];
+    cm1->getObjectPtr()->channels           = img_dims[3];
+    cm1->getObjectPtr()->slice   = m1->getObjectPtr()->idx.slice;
+
+    memcpy(cm1->getObjectPtr()->position,
+      m1->getObjectPtr()->position,
+    sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->read_dir,
+      m1->getObjectPtr()->read_dir,
+    sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->phase_dir,
+      m1->getObjectPtr()->phase_dir,
+    sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->slice_dir,
+      m1->getObjectPtr()->slice_dir,
+    sizeof(float)*3);
+ 
+    memcpy(cm1->getObjectPtr()->patient_table_position,
+      m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+
+    cm1->getObjectPtr()->image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+    cm1->getObjectPtr()->image_index = ++image_counter_;
+    cm1->getObjectPtr()->image_series_index = image_series_;
+
+    if (this->next()->putq(cm1) < 0) {
+     return GADGET_FAIL;
+    }
+  } 
+
+  m1->release();
+  return GADGET_OK;
+}
+</programlisting>
+
+      <para>This function has two basic tasks: insert data into the buffer and,
+      when enough data is present, copy the buffer and pass it on to the next
+      gadget. Additionally, the data buffer is created in this function if it
+      is not already allocated. In this example we choose to allocate the
+      buffer after the first data elements arrive. This allows us to respond
+      to changes in data sizes introduced by upstream Gadgets, e.g. readout
+      downsampling, coil reduction, etc.</para>
+
+      <para>In this case the copying of data is done with a very simple
+      <function>memcpy</function> command. There is a basic check for the
+      image dimensions, but a more robust application may have more checks of
+      the incoming data.</para>
+
+      <para>Once the data is in the buffer, we check to see if we should put
+      out an image. This is done with the <varname>flags</varname> field on
+      the acquisition. Specifically we check if a specific bit
+      (<varname>ISMRMRD::ACQ_LAST_IN_SLICE</varname>) is set.</para>
+
+      <para>If it is determined that this is the last acquisition for this
+      slice, we create a copy of the buffer and pass it on to the next Gadget.
+      Instead of a <classname>ISMRMRD::AcquisitionHeader</classname> we now
+      need an ISMRMRD::ImageHeader to pass along with the data. This header
+      structure is created and populated with fields (orientation, etc.) from
+      the acquisition header before it is passed on to the Gadget in the
+      stream.</para>
+
+      <para>The next Gadget is the <classname>FFTGadget</classname>. Since the
+      k-space buffering has been taken care of, the Fourier transform is a
+      relatively simple task. The <function>process</function> function uses
+      the FFTW wrapper class (<xref linkend="sect.ffttoolbox"/>) to perform
+      the inverse FFT along the first 3 dimensions of the array:</para>
+
+      <programlisting>int FFTGadget::process( 
+GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  FFT<float>::instance()->ifft(m2->getObjectPtr(),0);
+  FFT<float>::instance()->ifft(m2->getObjectPtr(),1);
+  FFT<float>::instance()->ifft(m2->getObjectPtr(),2);
+
+  if (this->next()->putq(m1) < 0) {
+     return GADGET_FAIL;
+  }
+
+  return GADGET_OK;
+}</programlisting>
+
+      <para>Now that the images have been Fourier transformed, we need to
+      remove the oversampling that is done in the readout dimensions and we
+      need to combine the receiver channels. In this case, we are making some
+      assumptions, i.e. we assume two-fold oversampling in the readout and we
+      are doing a simple RMS coil combination to obtain combined magnitude
+      images. We will not repeat the source code here, it can be found in
+      <filename>gadgets/core/CropAndCombineGadget.cpp</filename>.</para>
+
+      <para>The last two remaining steps after the coil combination are to extract
+      the magnitude of the data and return the floating point images to the
+      Gadgetron so that they can be returned to the client. This is
+      accomplished in the <classname>ExtractGadget</classname> and the
+      <classname>ImageFinishGadgetFLOAT</classname>. Both of these Gadgets are
+      described in <xref linkend="sect.mrigadgets"/>.</para>
+    </sect1>
+
+    <sect1 xml:id="sect.grappa">
+      <title>Cartesian 2D Parallel MRI (GRAPPA)</title>
+
+      <para>The Gadgetron contains a high-throughput real-time 2D Cartesian
+      parallel imaging reconstruction (GRAPPA) implemented on the GPU. It is
+      beyond the scope of this manual to review all the algorithmic details of
+      this application, but we will give an overview here as an example of a
+      more complicated reconstruction chain.</para>
+
+      <para>The Gadget chain is defined in the <filename>grappa.xml</filename>
+      and the resulting chain is illustrated in <xref
+      linkend="fig.grappachain"/>.</para>
+
+      <para>To test this configuration, please download the GRAPPA test
+      datasets from <uri type="website"
+      xlink:href="https://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/">https://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/</uri>,
+      where you will find the dataset <filename>grappa_rate2.h5</filename>. It
+      is a Cartesian parallel imaging dataset with a rate 2 TSENSE type
+      acquisition. Data were acquired with a 32 channel coil.</para>
+
+      <para>In order to run the GRAPPA reconstruction you have to have a CUDA
+      enabled GPU on your system and your Gadgetron distribution should be
+      compiled with CUDA and CULA enabled. Please see <xref
+      linkend="sect.installation"/> for details for your specific
+      platform.</para>
+
+      <para>To run the reconstruction, start up your Gadgetron (in its own
+      terminal window) and use the <application>mriclient</application> to
+      send the data from another terminal:</para>
+
+      <screen>user at host:~/temp$ wget http://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/grappa_rate2.h5
+
+user at host:~/temp$ mriclient \
+    -d grappa_rate2.h5 \
+    -c grappa.xml
+Gadgetron MRI Data Sender
+  -- host            :      localhost
+  -- port            :      9002
+  -- hdf5 file  in   :      gadgetron_testdata.h5
+  -- hdf5 group in   :      gre_tgrappa_rate4
+  -- conf            :      grappa.xml
+  -- loop            :      1
+  -- hdf5 file out   :      ./out.h5
+  -- hdf5 group out  :      2012-05-11 15:43:03
+(32580|140398140757824) Connection from 127.0.0.1:9002
+32580, 81, GadgetronConnector, Close Message received
+(32580|140398068885248) Handling close...
+(32580|140398068885248) svc done...
+(32580|140398068885248) Handling close...
+</screen>
+
+      <para>You should get example images that look similar to the ones in
+      <xref linkend="fig.examplegrapparesult"/>.</para>
+
+      <figure xml:id="fig.grappachain">
+        <title>GRAPPA Reconstruction Chain</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/grappa.png" width="4in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <figure xml:id="fig.examplegrapparesult">
+        <title>GRAPPA Reconstruction Results</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/examplegrapparesult.png" format="PNG"
+                       width="5in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <para>Let's take a closer look at some of the components of this
+      reconstruction application.</para>
+
+      <para>The first Gadget is the <classname>NoiseAdjustGadget</classname>.
+      As described in <xref linkend="sect.mrigadgets"/>, the purpose of this
+      Gadget is to decorrelate the noise in the receiver channels. This
+      improves the parallel imaging performance, especially in cases where
+      there is a large amount of noise in just a few receiver elements. There
+      are two versions of this Gadget, one that uses the BLAS/LAPACK routines
+      for performance improvements and one that implements the same
+      functionality without these optimizations. When you call the included
+      <filename>grappa.xml</filename> configuration, you will use the
+      optimized version. If you do not have BLAS and LAPACK on your system,
+      you can modify the XML configuration to use the one from the
+      <filename>gadgets/core</filename> library.</para>
+
+      <para>The second step is removing the oversampling. This step could also be
+      performed after the reconstruction (as it is done in <xref
+      linkend="sect.2dftexample"/>), but here we opt to remove this excess
+      data to improve downstream performance.</para>
+
+      <para>The purpose of the next two Gadgets
+      (<classname>PCAGadget</classname> and
+      <classname>CoilReductionGadget</classname>) is to a) transform the
+      receiver coils into PCA virtual coils ordered by their information
+      content and b) remove some of the coils to improve downstream
+      performance. The first step is achieved by buffering the first frame of
+      data and then performing a principal component analysis (PCA) on the
+      first frame of data. Based on the determined PCA transformation all data
+      is then subsequently transformed into virtual coils. In the coil
+      reduction gadget we can now simply eliminate the channels that are above
+      a certain number. See <xref linkend="sect.mrigadgets"/> for details on
+      how to control the channel compression.</para>
+
+      <para>The next two Gadgets are responsible for the actual GRAPPA
+      reconstruction. The <classname>GrappaGadget</classname> calculates the
+      GRAPPA coefficients and <classname>GrappaUnmixingGadget</classname>
+      performs the Fourier transform of the raw data and applies the GRAPPA
+      coefficients to the aliased images to obtain unaliased images.</para>
+
+      <para>In general it is assumed that the data is acquired in such a way
+      that a set of neighboring frames can be averaged to yield a fully
+      sampled k-space; the data is acquired with a time-interleaved sampling
+      pattern. When enough calibration data is available to calculate GRAPPA
+      coefficients, i.e. when a fully sampled region of k-space is available,
+      the calibration data is sent to a grappa coefficient calculation object
+      (<classname>GrappaWeightsCalculator</classname>).</para>
+
+      <para>The <classname>GrappaWeightsCalculator</classname> is an active
+      object, which picks up weight calculation jobs from an input queue and
+      passes them on to the GPU where it uses toolbox functions to calculate
+      GRAPPA unmixing coefficients. These coefficients are Fourier transformed
+      to image space where they are combined for all coils and stored in a
+      <classname>GrappaWeights</classname> object.</para>
+
+      <para>When the <classname>GrappaGadget</classname> passes on the raw
+      data to the <classname>GrappaUnmixingGadget</classname> it passes a
+      reference to the <classname>GrappaWeights</classname> object which is to
+      be used when performing the unmixing operation. Let's have a closer look
+      at the <filename>GrappaUnmixingGadget.h</filename> file:</para>
+
+      <programlisting>struct GrappaUnmixingJob
+{
+ boost::shared_ptr< GrappaWeights<float> > weights_;
+};
+
+class GrappaUnmixingGadget: 
+public Gadget3<GrappaUnmixingJob, 
+               ISMRMRD::ImageHeader, 
+               hoNDArray<std::complex<float> > > 
+{
+public:
+ GADGET_DECLARE(GrappaUnmixingGadget);
+
+ GrappaUnmixingGadget();
+ virtual ~GrappaUnmixingGadget();
+protected:
+ virtual int process(GadgetContainerMessage<GrappaUnmixingJob>* m1,
+   GadgetContainerMessage<ISMRMRD::ImageHeader>* m2, 
+   GadgetContainerMessage<hoNDArray<std::complex<float> > >* m3);
+
+};
+</programlisting>
+
+      <para>We can see that the <classname>GrappaUnmixingGadget</classname> is
+      an example of a Gadget, which takes 3 arguments and the additional
+      argument in this case holds a reference to the unmixing
+      coefficients.</para>
+
+      <para>The <classname>GrappaWeightsCalculator</classname> will update the
+      coefficients as often as it is instructed to do so and the
+      <classname>GrappaGadget</classname> is in charge of determining when an
+      update should be done. Specifically, it monitors the incoming data and
+      when the slice orientation changes, a job will be submitted to update
+      the coefficients. If the slice is not changing, it is in principle OK to
+      continue with the current coefficients, but if data is available and the
+      <classname>GrappaWeightsCalculator</classname> is idle (the queue is
+      empty) a job will be submitted.</para>
+
+      <para>With this design, the data passes through the
+      <classname>GrappaGadget</classname> very quickly and the
+      <classname>GrappaUnmixingGadget</classname> can reconstruct the
+      images very quickly, i.e. it is simply a Fourier transform and an
+      element wise multiplication and sum over the coils. It is in other words
+      designed for very high throughput.</para>
+
+      <para>If the slice orientation changes, new coefficients will be
+      calculated, but this calculation will not be done by the time the data
+      reaches the <classname>GrappaUnmixingGadget</classname> and
+      consequently, the images will be reconstructed with the "old"
+      coefficients until the coefficients are ready. This design ensures low
+      latency, but when the slice changes, aliasing may occur for a few frames
+      until coefficients are updated.</para>
+
+      <para>After the unmixing, the images are scaled and magnitude is
+      extracted before returning images to the client. The
+      <classname>AutoScaleGadget</classname> has been added in this case to
+      ensure that images are in a reasonable range before converting to
+      unsigned short as the output in this case. Automatic image scaling can
+      be problematic, especially when doing quantitative imaging, but it was
+      added in this case to make the reconstruction more robust to data from
+      different sources. A better solution is to only use data where noise
+      calibration data is available and reconstruct SNR scaled images. Based
+      on typical SNR values for MRI images, it is fairly trivial to keep the
+      images in the appropriate range and perform a proper conversion to
+      unsigned short.</para>
+
+      <para>A final comment about the GRAPPA reconstruction is that it allows
+      a second step of channel compression. More specifically, it is possible
+      to reconstruct to a limited number of target channels to further improve
+      performance. Between the upstream and downstream channel compression
+      steps, it is possible to tune the performance of the reconstruction to
+      enable real-time reconstruction on the available hardware.</para>
+    </sect1>
+
+    <sect1 xml:id="sect.cgsense">
+      <title>Non-Cartesian 2D Parallel MRI (SENSE)</title>
+
+      <para>The Gadgetron includes an implementation of a GPU-based
+      real-time non-Cartesian Sense reconstruction published in <citation
+      linkend="sorensen09"><xref linkend="sorensen09"/></citation>. One of the
+      keys to obtaining real-time performance is an efficient GPU
+      implementation of the non-Cartesian Fast Fourier Transform
+      <citation><xref linkend="sorensen08"/></citation>. The application
+      reuses several of the gadgets we have seen in use already for the
+      Cartesian Grappa implementation above (<xref linkend="sect.grappa"/>).
+      An overview of the non-Cartesian Sense gadget chain is given in
+      <xref linkend="fig.cgsense"/>. <figure xml:id="fig.cgsense">
+          <title>Gadgetron Chain for Non-Cartesian Sense</title>
+
+          <mediaobject>
+            <imageobject role="html">
+              <imagedata align="left" fileref="figs/cgsense.png" format="PNG"
+                         width="3in"/>
+            </imageobject>
+
+            <imageobject role="fo">
+              <imagedata align="left" fileref="figs/cgsense.png" format="PNG"
+                         width="3in"/>
+            </imageobject>
+
+            <textobject>
+              <phrase>Gadgetron chain for non-Cartesian Sense</phrase>
+            </textobject>
+          </mediaobject>
+        </figure></para>
+
+      <para>The <classname>CGSenseGadget</classname> implements the
+      non-Cartesian Sense reconstruction. It contains a conjugate gradient
+      solver (<xref linkend="sect.linear_solvers"/>) set up with a
+      <classname>nonCartesianSense</classname> image encoding matrix and an
+      <classname>imageOperator</classname> for regularization. Internally it
+      maintains a cyclic buffer of a few seconds of imaging data. It uses this
+      buffer to maintain a fully sampled (i.e. unaliased but blurred) k-space
+      image from which coil sensitivities and regularization images are
+      dynamically estimated. The combination of parallel imaging and image
+      regularization operators allows for alias-suppressed image
+      reconstruction using significant undersampling, thereby achieving
+      real-time data acquisition rates per frame. The conjugate gradient
+      solver is able to reconstruct faster than the acquisition time, e.g. a
+      192x192 image from 32 coils using 10 solver iterations on newer graphics
+      hardware.</para>
+
+      <para>To test this configuration use the 32 channel radial MRI test
+      dataset (<filename>golden_angle.h5</filename>), which you can download
+      from <uri
+      xlink:href="https://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/">
+      https://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/</uri>.
+      We assume that you have added <envar>$(GADGETRON_HOME)/bin</envar> to
+      your <envar>PATH</envar> environment variable. You need a CUDA enabled
+      GPU on your system and your Gadgetron distribution should be compiled
+      with CUDA and CULA enabled. Please see <xref
+      linkend="sect.installation"/> for details for your specific
+      platform.</para>
+
+      <para>To run the reconstruction, start up
+      <application>gadgetron</application> (in its own terminal window) and
+      use the <application>mriclient</application> to send the data from
+      another terminal. First start
+      <application>gadgetron</application>:</para>
+
+      <screen>user at host$ <userinput>gadgetron</userinput> 
+Configuring services</screen>
+
+      <para>If asked, allow the gadgetron application to accept incoming
+      network connections. Next start the
+      <application>mriclient</application>:</para>
+
+      <screen>user at host:~/temp$ wget http://sourceforge.net/projects/gadgetron/files/testdata/ismrmrd/golden_angle.h5
+
+user at host:~/temp$ mriclient \
+       -d golden_angle.h5 \
+       -c radial_single.xml
+
+Gadgetron MRI Data Sender
+  -- host            :      localhost
+  -- port            :      9002
+  -- hdf5 file  in   :      gadgetron_testdata.h5
+  -- hdf5 group in   :      gre_golden_angle
+  -- conf            :      radial_single.xml
+  -- loop            :      1
+  -- hdf5 file out   :      ./out.h5
+  -- hdf5 group out  :      2012-05-11 15:47:22
+(32608|139797448419136) Connection from 127.0.0.1:9002
+32608, 81, GadgetronConnector, Close Message received
+(32608|139797376546560) Handling close...
+(32608|139797376546560) svc done...
+(32608|139797376546560) Handling close...
+
+</screen>
+
+      <para>Your current folder now holds the reconstructed images in the
+      <filename>out.h5</filename> HDF5 file. They will look something like the
+      one depicted in <xref linkend="fig.examplecgsenseresult"/>. <figure
+          xml:id="fig.examplecgsenseresult">
+          <title>Non-Cartesian Sense Reconstruction Results</title>
+
+          <mediaobject>
+            <imageobject>
+              <imagedata fileref="figs/examplecgsenseresult.png" format="PNG"
+                         width="3in"/>
+            </imageobject>
+          </mediaobject>
+        </figure></para>
+    </sect1>
+  </chapter>
+
+  <chapter xml:id="sect.standalone_applications">
+    <title>Standalone Applications</title>
+
+    <para>This chapter demonstrates through a few examples how to use the
+    Gadgetron toolboxes (<xref linkend="sect.toolboxes"/>) to build standalone
+    applications outside the streaming framework. You need a CUDA enabled GPU
+    on your system and your Gadgetron distribution should be compiled with
+    CUDA (and CULA) enabled. Then the examples are automatically built with
+    the Gadgetron and binaries should consequently be available in
+    <envar>$GADGETRON_HOME/bin</envar>.</para>
+
+    <sect1 xml:id="sect.image_denoising">
+      <title>Image Denoising</title>
+
+      <para>This example uses the unconstrained Split Bregman solver for total
+      variation based 2D image denoising. The encoding matrix is defined as an
+      <classname>identityOperator</classname> and a
+      <classname>partialDerivativeOperator</classname> is used for each of the
+      two spatial directions to implement the total variation regularization
+      term. The two partial derivatives are added as a "group" of
+      regularization operators to implement isotropic denoising.
+      Alternatively, by changing a few lines of code they can be added as
+      individual regularization operators instead to implement anisotropic
+      denoising.</para>
+
+      <para>The full source code for the example can be found at
+      <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/denoising/2d/denoise_TV.cpp</filename>.</para>
+
+      <para>You can download some noisy Shepp-Logan phantom test datasets from
+      <uri
+      xlink:href="https://sourceforge.net/projects/gadgetron/files/testdata/phantom/shepp_logan/shepp.tar.gz">https://sourceforge.net/projects/gadgetron/files/testdata/phantom/shepp_logan/shepp.tar.gz</uri></para>
+
+      <para>In a terminal, go to the folder in which you unpacked the data. We
+      assume that you have added <envar>$(GADGETRON_HOME)/bin</envar> to your
+      <envar>PATH</envar> environment variable.</para>
+
+      <para>Try</para>
+
+      <screen>user at host$ <userinput>denoise_TV -d shepp_logan_256_256_med_noise.real -O 250 -m 1</userinput>
+Running denoising with the following parameters: 
+---------------------------------------------------- 
+  Noisy image file name (.real)  : shepp_logan_256_256_med_noise.real 
+  Result file name               : denoised_image_TV.real 
+  Number of cg iterations        : 20 
+  Number of sb inner iterations  : 1 
+  Number of sb outer iterations  : 250 
+  Regularization weight (mu)     : 1 
+---------------------------------------------------- 
+...
+user at host$</screen>
+
+      <para>which runs 250 iterations of the solver with a regularization
+      weight of 1. The output is saved in the current folder in the file
+      <filename>denoised_image_TV.real</filename>.</para>
+
+      <para>The noisy and denoised phantoms are depicted below.</para>
+
+      <figure xml:id="fig.noisy2d">
+        <title>A noisy version of the Shepp-Logan phantom</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/shepp_noisy.png" format="PNG" width="2in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <figure>
+        <title>Result after total variation denoising</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/shepp_denoised.png" format="PNG"
+                       width="2in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <para>Running <application>denoise_TV</application> with no arguments
+      prints out a brief usage description. We leave it as an exercise to run
+      the algorithm with various settings. The data file you downloaded
+      contains two further datasets (with lower and higher noise levels
+      respectively) to try out as well.</para>
+    </sect1>
+
+    <sect1>
+      <title>Image Deblurring</title>
+
+      <para>This example uses 1) the linear least squares solver, and 2) the
+      constrained Split Bregman solver for image deblurring. The encoding
+      matrix is defined as a <classname>convolutionOperator</classname>. A
+      <classname>partialDerivativeOperator</classname> is added for each of
+      the two spatial directions as regularization terms.</para>
+
+      <para>We reuse the Shepp-Logan data from the image denoising experiment
+      above (<xref linkend="sect.image_denoising"/>).</para>
+
+      <para>First we generate a blurry Shepp-Logan phantom by convolution with
+      a Gaussian kernel. This is easily achieved using the method
+      <function>mult_M</function> in the
+      <classname>convolutionOperator</classname>. Source code is provided at
+      <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/deblurring/2d/blur_2d.cpp</filename></para>
+
+      <para>In a terminal, go to the folder in which you unpacked the
+      Shepp-Logan phantom.</para>
+
+      <para>Try</para>
+
+      <screen>user at host$ <userinput>blur_2d -d shepp_logan_256_256_no_noise.real</userinput></screen>
+
+      <para>which generates two complex images;
+      <filename>blurred_image.cplx</filename> and
+      <filename>kernel_image.cplx</filename>. For convenience a corresponding
+      magnitude image is also saved as
+      <filename>blurred_image.real</filename>.</para>
+
+      <para>Next run the conjugate gradient solver. The source code for the
+      example can be found in
+      <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/deblurring/2d/deblur_2d_cg.cpp</filename>.</para>
+
+      <screen>user at host$ <userinput>deblur_2d_cg -K 1e-4</userinput>
+ Running deblurring with the following parameters: 
+---------------------------------------------------- 
+  Blurred image file name (.cplx)  : blurred_image.cplx 
+  Kernel image file name (.cplx)   : kernel_image.cplx 
+  Result file name                 : cg_deblurred_image.cplx 
+  Number of iterations             : 25 
+  Regularization weight            : 1e-4 
+---------------------------------------------------- 
+Iterating...
+...
+user at host$</screen>
+
+      <para>The result is saved in the current folder in the file
+      <filename>cg_deblurred_image.cplx</filename>. A magnitude image is also
+      saved as <filename>cg_deblurred_image.real</filename>.</para>
+
+      <para>Next run the constrained Split Bregman solver. The source code for
+      the example can be found in
+      <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/deblurring/2d/deblur_2d_sb.cpp</filename>.</para>
+
+      <screen>user at host$ <userinput>deblur_2d_sb -O 100 -L 0.5 -M 0.5</userinput>
+ Running deblurring with the following parameters: 
+---------------------------------------------------- 
+  Blurred image file name (.cplx)  : blurred_image.cplx 
+  Kernel image file name (.cplx)   : kernel_image.cplx 
+  Result file name                 : sb_deblurred_image.cplx 
+  Number of cg iterations          : 20 
+  Number of sb inner iterations    : 1 
+  Number of sb outer iterations    : 100 
+  Mu                               : 0.5 
+  Lambda                           : 0.5 
+---------------------------------------------------- 
+...
+user at host$</screen>
+
+      <para>The result is saved as
+      <filename>sb_deblurred_image.cplx</filename>. A magnitude image is also
+      saved as <filename>sb_deblurred_image.real</filename>.</para>
+
+      <para>The blurred and deblurred phantoms are depicted below.</para>
+
+      <figure>
+        <title>Blurry Shepp-Logan phantom</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/shepp_blurred.png" format="PNG"
+                       width="2in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <figure>
+        <title>Deblurred phantom from the Conjugate Gradient solver</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/shepp_deblurred_cg.png" format="PNG"
+                       width="2in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <figure>
+        <title>Deblurred phantom from the constrained Split Bregman
+        solver</title>
+
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/shepp_deblurred_sb.png" format="PNG"
+                       width="2in"/>
+          </imageobject>
+        </mediaobject>
+      </figure>
+
+      <para>In the present examples no noise was added to the blurred images
+      before the deconvolution. Consequently for the conjugate gradient
+      solver, a very low weight of the regularization term was "sufficient".
+      We leave it as an exercise to run the algorithms with various settings.
+      In particular, try to add noise to the blurred image before the
+      deconvolution to observe the very ill-posed nature of the
+      problem.</para>
+
+      <para><remark>Notice</remark>. If the dimensions of the provided
+      convolution kernel are exactly double those of the provided image, the
+      convolution operator zero-pads the image before the convolution and
+      removes the padding again after. As the convolution operator utilizes
+      FFTs in its implementation, this oversampling is a way of avoiding
+      cyclic boundary conditions during the convolution.</para>
+    </sect1>
+
+    <sect1>
+      <title>Non-Cartesian FFT</title>
+
+      <para>This example shows how to use the forward and adjoint
+      non-Cartesian Fast Fourier Transform (NFFT and
+      NFFT<superscript>H</superscript> respectively) on a 2D image. The source
+      code can be found at
+      <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/MRI/nfft/2d/main_nfft.cpp</filename>
+      and
+      <envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/MRI/nfft/2d/main_nffth.cpp</filename>.</para>
+
+      <para>We reuse the Shepp-Logan data downloaded for the previous
+      experiments (<xref linkend="sect.image_denoising"/>).</para>
+
+      <para>In the following we run the NFFT followed by the
+      NFFT<superscript>H</superscript>. The image matrix size is
+      256<superscript>2</superscript>. We use an oversampled matrix size of
+      384<superscript>2</superscript>, 128 profiles in k-space (<emphasis
+      role="underline">undersampling</emphasis>) with 384 samples each. The
+      NFFT Kaiser-Bessel convolution kernel width is set to
+      5.5<superscript>2</superscript> (see <xref
+      linkend="sorensen08"/>).</para>
+
+      <screen>user at host$ <userinput>nfft -d shepp_logan_256_256_no_noise.real \ 
+   -o 384 -p 128 -s 384 -k 5.5
+</userinput>
+ Running reconstruction with the following parameters: 
+---------------------------------------------------- 
+  Input image file name (.real)  : shepp_logan_256_256_no_noise.real 
+  Result file name               : samples.cplx 
+  Oversampled matrix size        : 384 
+  Number of profiles             : 128 
+  Samples per profiles           : 384 
+  Kernel width                   : 5.5 
+---------------------------------------------------- 
+Loading image from disk
+Uploading, normalizing and converting to complex
+Initializing plan
+Computing golden ratio radial trajectories
+NFFT preprocessing
+Computing density compensation weights
+Computing nfft (inverse gridding)
+Output result to disk
+user at host$</screen>
+
+      <screen>user at host$ <userinput>nffth -d samples.cplx -m 256 -o 384 -k 5.5</userinput>
+ Running reconstruction with the following parameters: 
+---------------------------------------------------- 
+  Input samples file name (.cplx)  : samples.cplx 
+  Output image file name (.cplx)   : result.cplx 
+  Matrix size                      : 256 
+  Oversampled matrix size          : 384 
+  Kernel width                     : 5.5 
+---------------------------------------------------- 
+Loading samples from disk
+Uploading samples to device
+Initializing plan
+Computing golden ratio radial trajectories
+NFFT preprocessing
+Computing density compensation weights
+Computing nffth (gridding)
+Output result to disk
+user at host$</screen>
+
+      <para>The result is saved in file <filename>result.cplx</filename>. A
+      magnitude image is saved as <filename>result.real</filename>. As an
+      exercise, experiment with the settings to reduce (or increase) the
+      aliasing.</para>
+
+      <para>The <envar>
+      $(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/MRI/nfft/2d/</filename>
+      folder also contains examples of using the
+      <classname>nfftOperator</classname> in a Conjugate Gradient solver and a
+      Split Bregman solver respectively.</para>
+    </sect1>
+
+    <sect1>
+      <title>Non-Cartesian parallel MRI (SENSE)</title>
+
+      <para>This section demonstrates how to run a standalone non-Cartesian
+      parallel MRI reconstruction similar to the one that was previously shown
+      using the streaming framework infrastructure in section <xref
+      linkend="sect.cgsense"/>. More details can be found in <xref
+      linkend="sorensen09"/>.</para>
+
+      <para>In addition to a regularized linear least squares solution to the
+      reconstruction problem, we furthermore use the Split Bregman solver to
+      obtain the solution with minimum total variation subject to the
+      constraint of the encoding operator (compressed sensing).</para>
+
+      <para>Download a free-breathing cardiac MRI sample dataset from <uri
+      xlink:href="https://sourceforge.net/projects/gadgetron/files/testdata/mri/fb_data.zip">http://sourceforge.net/projects/gadgetron/files/testdata/mri/fb_data.zip</uri></para>
+
+      <para>Source code is found in the files <filename>main_cg.cpp</filename>
+      and <filename>main_sbc.cpp</filename> in directory</para>
+
+      <para><envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/MRI/sense/noncartesian/radial/2d_golden_ratio</filename>.</para>
+
+      <para>Both command lines below produce a 2D image sequence, each image
+      with a matrix size of 192<superscript>2</superscript>
+      (<varname>-m</varname>). 32 projections are used for each frame
+      (<varname>-p</varname>) for a frame rate of roughly 32 profiles/frame *
+      2.5 ms/profile = 80 ms/frame or 12.5 frames/s. The reconstruction
+      results are written out in both complex and magnitude data as
+      <filename>result.cplx</filename> and <filename>result.real</filename>
+      respectively.</para>
+
+      <para>If sufficient device memory is available on your GPU (i.e. you are
+      in possession of a high-end card) all frames in the sequence can be
+      reconstructed concurrently (as a 3D volume). On systems that do not hold
+      enough device memory to reconstruct all frames in parallel, they can
+      instead be reconstructed in several batches. The <varname>-f</varname>
+      option to the command lines below indicates the number of frames that are
+      reconstructed per batch. A negative value indicates "all". If the
+      command below fails to complete due to lack of device memory, try
+      running with argument <varname>-f 8</varname> (or an even smaller
+      number) instead.</para>
+
+      <para>The following output was obtained on a Geforce GTX 480 GPU.</para>
+
+      <screen>user at host$ radial_sense_cg -d fb_data.cplx -m 192 -o 256 -p 32 -K 0.01
+
+  Running reconstruction with the following parameters: 
+---------------------------------------------------- 
+  Sample data file name                             : fb_data.cplx 
+  Result file name                                  : result.cplx 
+  Matrix size                                       : 192 
+  Oversampled matrix size                           : 256 
+  Profiles per frame                                : 32 
+  Frames per reconstruction (negative meaning all)  : -1 
+  Number of iterations                              : 10 
+  Kernel width                                      : 5.5 
+  Kappa                                             : 0.01 
+---------------------------------------------------- 
+
+Loading data: 18.339 ms
+
+#samples/profile: 256
+#profiles/frame: 32
+#profiles: 2560
+#coils: 4
+#frames/reconstruction: 80
+#profiles/reconstruction: 2560
+#samples/reconstruction: 655360
+
+Filling rhs buffer: 283.675 ms
+Estimating csm: 3.435 ms
+Computing regularization: 0.319 ms
+Computing preconditioning weights: 0.081 ms
+Iterating...
+Iteration 0. rq/rq_0 = 0.453177
+Iteration 1. rq/rq_0 = 0.132643
+Iteration 2. rq/rq_0 = 0.0413432
+Iteration 3. rq/rq_0 = 0.0144378
+Iteration 4. rq/rq_0 = 0.00681063
+Iteration 5. rq/rq_0 = 0.00450857
+Iteration 6. rq/rq_0 = 0.00342872
+Iteration 7. rq/rq_0 = 0.00240418
+Iteration 8. rq/rq_0 = 0.00146108
+Iteration 9. rq/rq_0 = 0.000903398
+GPU Conjugate Gradient solve: 2115.7 ms
+Full SENSE reconstruction.: 2188.68 ms
+Writing out result: 50.111 ms
+
+user at host$</screen>
+
+      <screen>user at host$ radial_sense_sbc -d fb_data.cplx -m 192 -o 256 -p 32 
+
+  Running reconstruction with the following parameters: 
+---------------------------------------------------- 
+  Sample data file name                             : fb_data.cplx 
+  Result file name                                  : result.cplx 
+  Matrix size                                       : 192 
+  Oversampled matrix size                           : 256 
+  Profiles per frame                                : 32 
+  Frames per reconstruction (negative meaning all)  : -1 
+  Number of cg iterations                           : 20 
+  Number of sb inner iterations                     : 1 
+  Number of sb outer iterations                     : 20 
+  Kernel width                                      : 5.5 
+  Mu                                                : 1.0 
+  Lambda                                            : 2.0 
+---------------------------------------------------- 
+
+Loading data: 16.082 ms
+
+#samples/profile: 256
+#profiles/frame: 32
+#profiles: 2560
+#coils: 4
+#frames/reconstruction 80
+#profiles/reconstruction 2560
+#samples/reconstruction 655360
+
+CSM and regularization estimation: 288.983 ms
+
+...
+
+GPU constrained Split Bregman solve: 57257.4 ms
+Full SENSE reconstruction with TV regularization.: 57330.3 ms
+Writing out result: 50.421 ms
+
+user at host$</screen>
+
+      <para>As all 80 frames are reconstructed in parallel it is
+      straightforward to add temporal regularization to the reconstructions.
+      We leave this as a suggested exercise for the reader.</para>
+
+      <para>For the interested reader, an implementation of
+      <emphasis>kt</emphasis>-Sense can be found in directory</para>
+
+      <para><envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/MRI/sense/noncartesian/radial/2d_golden_ratio_kt</filename>.</para>
+
+      <para>Additionally, the source code for the user interface demonstrated
+      in the <uri
+      xlink:href="http://www.cs.au.dk/~sangild/non-Cartesian_interactive_reconstruction.wmv">movie</uri>
+      accompanying <xref linkend="sorensen09"/> can be found (if you
+      configured <command>cmake</command> to include Qt support) in
+      directory</para>
+
+      <para><envar>$(GADGETRON_SOURCE)</envar><filename>/apps/standalone/gpu/MRI/sense/noncartesian/radial/2d_golden_ratio_gui</filename>.</para>
+    </sect1>
+  </chapter>
+
+  <chapter>
+    <title>Frequently Asked Questions (FAQ)</title>
+
+    <itemizedlist>
+      <listitem>
+        <para><emphasis>Can I make a branching Gadget chain?</emphasis></para>
+
+        <para>The short answer is no. We plan on supporting this in a future
+        release, but it is not quite ready yet.</para>
+      </listitem>
+
+      <listitem>
+        <para><emphasis>How can I help?</emphasis></para>
+
+        <para>We are always looking for people who are interested in helping
+        with the continuing development of the Gadgetron. There are many
+        things you can do:</para>
+
+        <itemizedlist>
+          <listitem>
+            <para>Use it.</para>
+          </listitem>
+
+          <listitem>
+            <para>When you develop new Gadgets or Toolboxes, please consider
+            submitting them to us so that we can include them in the
+            archive.</para>
+          </listitem>
+
+          <listitem>
+            <para>Help us implement some of the future features in <xref
+            linkend="futurefeatures"/>. It is probably a good idea to get in
+            touch with us before you start coding, just in case somebody is
+            already working on it.</para>
+          </listitem>
+        </itemizedlist>
+      </listitem>
+    </itemizedlist>
+  </chapter>
+
+  <appendix xml:id="simplearrayfiles">
+    <title>Simple Array File Format</title>
+
+    <para>When working with the Gadgetron it is often necessary to write files
+    with reconstructed images to disk, either as part of debugging or as the
+    final reconstruction result. We have adopted a very simple
+    multidimensional array file format for this purpose. The main advantage of
+    this file format is its simplicity but there are a number of disadvantages
+    and caveats as well, as described in this section.</para>
+
+    <para>The simple array files are made up of a) a header followed by b) the
+    data itself. This layout of data and header is illustrated in <xref
+    linkend="fig.gadgetron.fileformat"/>. The header has a single 32-bit
+    integer to indicate the number of dimensions of the dataset followed by
+    one integer for each dimension to indicate the length of that dimension.
+    The data follows immediately after the header. The data is stored such
+    that the first dimension is the fastest moving dimension, second dimension
+    is second fastest, etc. The header contains no information about the size
+    of each individual data element and consequently the user needs to know
+    what type of data is contained in the array. In general, the Gadgetron
+    uses 3 different types of data and the convention is to use the file
+    extension to indicate the data type in the file:</para>
+
+    <itemizedlist>
+      <listitem>
+        <para>16-bit unsigned short. File extension:
+        <filename>*.short</filename></para>
+      </listitem>
+
+      <listitem>
+        <para>32-bit float. File extension: <filename>*.real</filename></para>
+      </listitem>
+
+      <listitem>
+        <para>32-bit complex float. Two 32-bit floating point values per data
+        element. File extension: <filename>*.cplx</filename></para>
+      </listitem>
+    </itemizedlist>
+
+    <figure xml:id="fig.gadgetron.fileformat">
+      <title>Simple Array File Format</title>
+
+      <mediaobject>
+        <imageobject condition="print">
+          <imagedata align="left" fileref="figs/arrayfileformat.png"
+                     format="PNG" width="2in"/>
+        </imageobject>
+
+        <textobject>
+          <phrase>Simple Array</phrase>
+        </textobject>
+      </mediaobject>
+
+      <caption>
+        <para>The simple array file format has a header followed by the data.
+        The header consists of one 32-bit integer defining the number of
+        dimensions (N-dimensions) followed by N-dimensions 32-bit unsigned
+        integers, each defining the length of one dimension. In the example,
+        the dataset has 4 dimensions and the size of those dimensions is
+        128x128x1x1, i.e. 16384 elements.</para>
+      </caption>
+    </figure>
+
+    <para>The Gadgetron framework provides functions for reading these files in
+    C++. The functions are located in
+    <filename>toolboxes/ndarray/hoNDArray_fileio.h</filename> in the Gadgetron
+    source code distribution.</para>
+
+    <para>It is also trivial to read the files into Matlab. Below is a
+    function which detects the data type based on the file extension and reads
+    the file into Matlab.</para>
+
+    <programlisting>
+
+function data = read_gadgetron_array(filename)
+%  data = read_gadgetron_array(filename)
+%  
+%  Reads simplified array format output from the Gadgetron
+%
+%  The datatype is determined by the file extension.
+%     - *.short : 16-bit unsigned integer
+%     - *.real  : 32-bit float
+%     - *.cplx  : 32-bit complex (two 32-bit values per data element)
+%
+%  The data is returned in double precision, shaped as given by the file header.
+if (~exist(filename,'file')),
+    error('File not found.');
+end
+
+[~,~,ext] = fileparts(filename);
+ext = lower(ext);
+
+if (~any(strcmp(ext,{'.short','.real','.cplx'}))),
+   error('Unknown file extension');
+end
+
+f = fopen(filename,'r');
+if (f == -1),
+    error('Unable to open file.');
+end
+closer = onCleanup(@() fclose(f));   % closes the file even if one of the reads below fails
+
+% Header: the number of dimensions followed by the length of each dimension
+num_dims = fread(f,1,'int32');
+dims = fread(f,num_dims,'int32');
+
+switch ext
+    case '.short'
+        data = fread(f,prod(dims),'uint16');
+    case '.real'
+        data = fread(f,prod(dims),'float32');
+    case '.cplx'
+        data = fread(f,2*prod(dims),'float32');
+        data = complex(data(1:2:end),data(2:2:end));   % values are stored as (real, imaginary) pairs
+end
+data = reshape(data,[dims' 1]);   % the trailing 1 makes 1-D arrays valid and is dropped otherwise
+end
+
+  </programlisting>
+  </appendix>
+
+  <appendix xml:id="section.hdf5">
+    <title>HDF5 Files</title>
+
+    <para>The Gadgetron framework is used to process many different types of
+    data and it is cumbersome to add specific read and write routines for all
+    these different kinds of data. Consequently we have chosen to use the
+    generic HDF5 file format. A detailed description of this format can be
+    found at <uri>http://www.hdfgroup.org/HDF5/</uri>.</para>
+
+    <para>The HDF5 file format is much like a file system. Data can be
+    organized hierarchically into groups (like folders in a filesystem) and
+    each file can contain multiple groups and datasets. Each dataset can be an
+    array of any type, e.g. an array of images. There is a generic tool
+    <command>hdfview</command> which can be used to view the files. It is
+    available on all the platforms supported by the Gadgetron framework. HDF5
+    files can also be read easily in newer versions of Matlab.</para>
+
+    <para>An example of an HDF5 file with MRI raw data can be found at <uri
+    type="website"
+    xlink:href="https://sourceforge.net/projects/gadgetron/files/testdata/">https://sourceforge.net/projects/gadgetron/files/testdata/</uri>.
+    Download the file <filename>gadgetron_testdata.h5</filename>. When opened
+    with <command>hdfview</command>, it should look like <xref
+    linkend="fig.hdfview_testdata"/>. As seen, the file contains 4 groups of
+    data. Each group consists of some data and an XML configuration for the
+    Gadgetron.</para>
+
+    <figure xml:id="fig.hdfview_testdata">
+      <title>Examining Data with HDFView</title>
+
+      <screenshot>
+        <mediaobject>
+          <imageobject>
+            <imagedata fileref="figs/hdfview_mri_testdata.png" width="6in"/>
+          </imageobject>
+        </mediaobject>
+      </screenshot>
+    </figure>
+
+    <para>HDF5 Files can also be used to store images. Several of the
+    Gadgetron clients included with the framework save images in HDF5 files.
+    An example of viewing the output of a reconstruction can be seen in <xref
+    linkend="fig.hdfview_image"/>.</para>
+
+    <figure xml:id="fig.hdfview_image">
+      <title>Viewing Images in HDF5 Files</title>
+
+      <mediaobject>
+        <imageobject>
+          <imagedata fileref="figs/hdfview_image_view.png" width="5.5in"/>
+        </imageobject>
+      </mediaobject>
+    </figure>
+
+    <para>Images saved by Gadgetron clients are saved as arrays in the HDF5
+    files. Due to the array storage conventions in the Gadgetron environment,
+    the first dimension is the slowest varying dimension in the arrays and the
+    last dimension is the fastest varying dimension. That means that an array
+    with 10 images with dimensions 128x128 would be stored in a variable in
+    the HDF5 file with dimensions 10x1x128x128 as seen in <xref
+    linkend="fig.hdfview_image"/>. To display the images, right click on the
+    data and choose settings as illustrated in <xref
+    linkend="fig.settings_hdfview"/>.</para>
+
+    <figure xml:id="fig.settings_hdfview">
+      <title>Setting for viewing HDF5 output images.</title>
+
+      <mediaobject>
+        <imageobject>
+          <imagedata fileref="figs/hdfview_image_view_setting.png" width="3in"/>
+        </imageobject>
+      </mediaobject>
+    </figure>
+
+    <para>The HDF5 files can also be read with Matlab. The images in the file
+    above could be read with:<programlisting>>> images = h5read('out.h5','/2012-05-11 10:57:48/data_0');
+>> size(images)
+
+ans =
+
+   128   128     1    10
+
+>> imagesc(images(:,:,1,1));colormap(gray) 
+</programlisting></para>
+  </appendix>
+
+  <appendix xml:id="futurefeatures">
+    <title>Future Features</title>
+
+    <para>The Gadgetron is evolving continuously and there are many things
+    still that we would like to include but have not yet had the time to do.
+    This appendix serves as a to-do list of features to be implemented as we go
+    along.</para>
+
+    <itemizedlist>
+      <listitem>
+        <para>Branching Gadget chains. There is currently no ability to branch
+        and collect in the Gadgetron.</para>
+      </listitem>
+
+      <listitem>
+        <para>Persistent memory storage across Gadget chains.</para>
+      </listitem>
+
+      <listitem>
+        <para>Matlab Gadgets. It would be great to have a way to encapsulate
+        Matlab code in a Gadget similar to the way that the Python Gadgets
+        work.</para>
+      </listitem>
+    </itemizedlist>
+  </appendix>
+
+  <bibliography>
+    <biblioentry role="article" xml:id="hansen12">
+      <abbrev>HANSEN12</abbrev>
+
+      <biblioset role="article">
+        <authorgroup>
+          <author>
+            <personname><firstname>M. S.</firstname>
+            <surname>Hansen</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>T. S.</firstname>
+            <surname>Sørensen</surname></personname>
+          </author>
+        </authorgroup>
+
+        <title>Gadgetron: An Open Source Framework for Medical Image
+        Reconstruction</title>
+      </biblioset>
+
+      <biblioset role="journal">
+        <title>Magnetic Resonance in Medicine</title>
+
+        <volumenum>Submitted</volumenum>
+
+        <pubdate>2012</pubdate>
+      </biblioset>
+    </biblioentry>
+
+    <biblioentry role="article" xml:id="hansen08">
+      <abbrev>HANSEN08</abbrev>
+
+      <biblioset role="article">
+        <authorgroup>
+          <author>
+            <personname><firstname>M. S.</firstname>
+            <surname>Hansen</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>D.</firstname>
+            <surname>Atkinson</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>T.
+            S.</firstname><surname>Sørensen</surname></personname>
+          </author>
+        </authorgroup>
+
+        <title>Cartesian SENSE and k-t SENSE reconstruction using commodity
+        graphics hardware</title>
+      </biblioset>
+
+      <biblioset role="journal">
+        <title>Magnetic Resonance in Medicine</title>
+
+        <volumenum>59</volumenum>
+
+        <issuenum>3</issuenum>
+
+        <pagenums>463-468</pagenums>
+
+        <pubdate>2008</pubdate>
+      </biblioset>
+    </biblioentry>
+
+    <biblioentry role="article" xml:id="sorensen08">
+      <abbrev>SANGILD08</abbrev>
+
+      <biblioset role="article">
+        <authorgroup>
+          <author>
+            <personname><firstname>T. S.</firstname>
+            <surname>Sørensen</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>T.</firstname>
+            <surname>Schaeffter</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>K. O.</firstname>
+            <surname>Noe</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>M. S.</firstname>
+            <surname>Hansen</surname></personname>
+          </author>
+        </authorgroup>
+
+        <title>Accelerating the nonequispaced fast fourier transform on
+        commodity graphics hardware</title>
+      </biblioset>
+
+      <biblioset role="journal">
+        <title>IEEE Trans Med Imaging</title>
+
+        <volumenum>27</volumenum>
+
+        <issuenum>4</issuenum>
+
+        <pagenums>538-47</pagenums>
+
+        <pubdate>2008</pubdate>
+      </biblioset>
+    </biblioentry>
+
+    <biblioentry role="article" xml:id="sorensen09">
+      <abbrev>SANGILD09</abbrev>
+
+      <biblioset role="article">
+        <authorgroup>
+          <author>
+            <personname><firstname>T. S.</firstname>
+            <surname>Sørensen</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>D.</firstname>
+            <surname>Atkinson</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>T.</firstname>
+            <surname>Schaeffter</surname></personname>
+          </author>
+
+          <author>
+            <personname><firstname>M. S.</firstname>
+            <surname>Hansen</surname></personname>
+          </author>
+        </authorgroup>
+
+        <title>Real-time reconstruction of sensitivity encoded radial magnetic
+        resonance imaging using a graphics processing unit</title>
+      </biblioset>
+
+      <biblioset role="journal">
+        <title>IEEE Trans Med Imaging</title>
+
+        <volumenum>28</volumenum>
+
+        <issuenum>12</issuenum>
+
+        <pagenums>1974-85</pagenums>
+
+        <pubdate>2009</pubdate>
+      </biblioset>
+    </biblioentry>
+  </bibliography>
+</book>
diff --git a/doc/website/Gadgetron.png b/doc/website/Gadgetron.png
new file mode 100644
index 0000000..f39b052
Binary files /dev/null and b/doc/website/Gadgetron.png differ
diff --git a/doc/website/index.html b/doc/website/index.html
new file mode 100644
index 0000000..69ff5df
--- /dev/null
+++ b/doc/website/index.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
+<html>
+<head>
+<style type="text/css">
+div.main
+{
+width: 98%;  /* together with the 1% margin on each side this spans the full parent width */
+margin: 1%;
+text-align: center;
+}
+
+table.maintable
+{
+margin-left:auto; 
+margin-right:auto;
+width: 600px;
+}
+
+td.logo
+{
+text-align: center;
+width: 100%;
+}
+
+body
+{
+font-family: Sans-serif;
+}
+
+div.maintext
+{
+  margin-top: 20px;
+  margin-bottom: 20px;
+}
+
+table.resources
+{
+margin-left:auto; 
+margin-right:auto;
+width: 400px;
+text-align: center;
+border: 1px solid black;
+}
+
+div.resources
+{
+  margin-top: 20px;
+  margin-bottom: 20px;
+  width: 100%;
+  text-align: center;
+}
+</style>
+
+<title>Gadgetron</title>
+</head>
+
+<body>
+<div class="main">
+<table class="maintable">
+<tr>
+<td class="logo"><img class="logoimg" src="Gadgetron.png"></td>
+</tr>
+
+<tr><td><hr></td></tr>
+<tr>
+  <td>
+    <div class="maintext">
+    The Gadgetron is an Open Source framework for medical image
+      reconstruction. It has been developed at the National Heart,
+      Lung, and Blood Institute, NIH, Bethesda, MD, USA
+      and at the Department of Computer Science
+      and Department of Clinical Medicine, Aarhus University, Denmark.
+      It is made freely available to the medical image
+      reconstruction community.
+    </div>
+
+    <div class="maintext">
+      <p>
+      The Magnetic Resonance in Medicine (MRM) <a href="http://onlinelibrary.wiley.com/doi/10.1002/mrm.24389/abstract">paper</a> on the Gadgetron
+      is now published. If you use the Gadgetron in a scientific publication, please cite:</p>
+      <p style="font-style: italic;">Hansen MS, Sørensen TS. Gadgetron: An Open Source Framework for Medical Image Reconstruction. Magn Reson Med. 2012.</p>
+    </div>
+
+    <div class="maintext">
+      Example demonstrations of the framework can be found on our
+      <a href="demo/index.html">demo page</a>. 
+    </div>
+      
+    <div class="maintext">
+      The main portal for access to source code,
+      documentation, discussion groups, etc. is the
+      Sourceforge.net website: <a href="http://sourceforge.net/p/gadgetron/">http://sourceforge.net/p/gadgetron/</a>.
+    </div>
+
+    <div class="maintext">
+      Source code can be found at: <a
+      href="http://sourceforge.net/projects/gadgetron/files/">http://sourceforge.net/projects/gadgetron/files/</a>.
+    </div>
+
+    <div class="maintext">
+    You can also check out the code from the git archive with:
+    <p style="font-family: monospace;">git clone git://git.code.sf.net/p/gadgetron/gadgetron</p>
+    </div>
+
+    <div class="resources">
+      <table class="resources">
+	<tr><td>Manual</td><td>API Documentation</td></tr>
+	<tr><td><a
+	  href="http://gadgetron.sourceforge.net/1.1alpha1/manual/gadgetron_manual.html">[v1.1alpha1]</a></td>
+	  <td><a
+	  href="http://gadgetron.sourceforge.net/1.1alpha1/api">[v1.1alpha1]</a></td></tr>
+	<tr><td><a
+	  href="http://gadgetron.sourceforge.net/1.0/manual/gadgetron_manual.html">[v1.0]</a></td>
+	  <td><a
+	  href="http://gadgetron.sourceforge.net/1.0/api">[v1.0]</a></td></tr>
+	<tr><td><a
+	  href="http://gadgetron.sourceforge.net/1.0alpha/manual/gadgetron_manual.html">[v1.0alpha]</a></td>
+	  <td><a
+	  href="http://gadgetron.sourceforge.net/1.0alpha/api">[v1.0alpha]</a></td></tr>	
+     </table>
+   </div>
+
+    <div class="maintext">
+      Questions and comments, please contact the authors:
+    </div>
+    <div class="maintext">  
+      Michael Schacht Hansen <a
+      href="mailto:michael.hansen at nih.gov">michael.hansen at nih.gov</a>
+      <br>
+      Thomas Sangild Sørensen
+      <a
+      href="mailto:sangild at cs.au.dk">sangild at cs.au.dk</a>
+    </div>
+    <div class="maintext">
+      Follow us on twitter <a href="http://www.twitter.com/ReconstructThis">@ReconstructThis</a>
+    </div>
+ </td>
+</tr>
+  
+<tr><td><hr></td></tr>
+
+</table>
+</div>
+
+</body>
+</html>
diff --git a/doc/windows_installation/GadgetronWindowsInstallation.ps1 b/doc/windows_installation/GadgetronWindowsInstallation.ps1
new file mode 100644
index 0000000..db1f308
Binary files /dev/null and b/doc/windows_installation/GadgetronWindowsInstallation.ps1 differ
diff --git a/gadgets/.gitignore b/gadgets/.gitignore
new file mode 100644
index 0000000..afaf431
--- /dev/null
+++ b/gadgets/.gitignore
@@ -0,0 +1 @@
+gputest/test
\ No newline at end of file
diff --git a/gadgets/CMakeLists.txt b/gadgets/CMakeLists.txt
new file mode 100644
index 0000000..3e2154d
--- /dev/null
+++ b/gadgets/CMakeLists.txt
@@ -0,0 +1,89 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETS__)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif (WIN32)
+
+include_directories(   
+  ${ACE_INCLUDE_DIR} 
+  ${Boost_INCLUDE_DIR}
+  ${FFTW3_INCLUDE_DIR}
+  ${ISMRMRD_INCLUDE_DIR}
+  ${ISMRMRD_SCHEMA_DIR}
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+  ${XSD_INCLUDE_DIR}
+  ${XERCESC_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/dependencies/tinyxml
+  ${CMAKE_SOURCE_DIR}/apps/gadgetron 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  )
+
+if (MKL_FOUND)
+    MESSAGE("MKL Found for gtPlus ... ")
+    list(APPEND EXTRA_MKL_LIBRARIES mkl_core)
+    if ( USE_OPENMP )
+        list(APPEND EXTRA_MKL_LIBRARIES mkl_intel_thread)
+    endif ( USE_OPENMP )
+
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+
+add_subdirectory(mri_core)
+
+if (MKL_FOUND)
+    add_subdirectory(gtPlus)
+endif (MKL_FOUND)
+
+if (CUDA_FOUND)
+  message("Cuda found, compiling gpu accelerated gadgets")
+  add_subdirectory(radial)
+  add_subdirectory(spiral)
+  add_subdirectory(sense)
+  add_subdirectory(grappa)
+else (CUDA_FOUND)
+  message("Cuda NOT found, NOT compiling gpu accelerated gadgets")
+endif(CUDA_FOUND)
+
+#find_package(Octave)
+#if (OCTAVE_FOUND)
+#  add_subdirectory(octave)
+#endif(OCTAVE_FOUND)
+
+if (MATLAB_FOUND)  # the MATLAB gadgets are optional: built only when MATLAB_FOUND is set
+  message("Compiling MATLAB gadgets")
+  add_subdirectory(matlab)
+endif(MATLAB_FOUND)
+
+# nest the find_package checks to provide more useful error messages
+find_package(Boost COMPONENTS python)
+if (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+    MESSAGE("PYTHON_INCLUDE_DIRS: ${PYTHON_INCLUDE_DIRS}")
+    MESSAGE("PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}")
+    MESSAGE("NUMPY_INCLUDE_DIRS: ${NUMPY_INCLUDE_DIRS}")
+    MESSAGE("Compiling Python Gadgets")
+    add_subdirectory(python)
+else (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+    if(NOT PYTHONLIBS_FOUND)
+        MESSAGE("Python Libraries/Headers NOT found, NOT compiling Python Gadgets")
+    endif(NOT PYTHONLIBS_FOUND)
+    if(NOT NUMPY_FOUND)
+        MESSAGE("NumPy NOT found, NOT compiling Python Gadgets")
+    endif(NOT NUMPY_FOUND)
+    if(NOT Boost_PYTHON_FOUND)
+        MESSAGE("Boost Python NOT found, NOT compiling Python Gadgets")
+    endif(NOT Boost_PYTHON_FOUND)
+endif (Boost_PYTHON_FOUND AND PYTHONLIBS_FOUND AND NUMPY_FOUND)
+
+find_package(DCMTK)
+if(DCMTK_FOUND)
+  message("Compiling DICOM gadget")
+  add_subdirectory(dicom)
+else(DCMTK_FOUND)
+  message("DCMTK NOT found, not compiling DICOM gadget")
+endif(DCMTK_FOUND)
+
+add_subdirectory(cartesian)
+add_subdirectory(moco)
diff --git a/gadgets/cartesian/CMakeLists.txt b/gadgets/cartesian/CMakeLists.txt
new file mode 100644
index 0000000..1960193
--- /dev/null
+++ b/gadgets/cartesian/CMakeLists.txt
@@ -0,0 +1,23 @@
+IF (WIN32)  # on Windows this define makes EXPORTGADGETS_CARTESIAN expand to __declspec(dllexport)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_CARTESIAN__)
+ENDIF (WIN32)
+# Packages this directory cannot be built without (REQUIRED stops the configuration if one is missing).
+find_package(Ismrmrd REQUIRED)
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+# Header search paths added for this directory.
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+  )
+# The gadget is built as a shared library; the ISMRMRD XSD sources are compiled into it.
+add_library(gadgetron_cartesian SHARED 
+  CartesianToGenericGadget.cpp
+  ${ISMRMRD_XSD_SOURCE})
+# ACE is linked per build type: ACE_LIBRARIES for optimized builds, ACE_DEBUG_LIBRARY for debug builds.
+target_link_libraries(gadgetron_cartesian cpucore 
+  ${ISMRMRD_LIBRARIES} ${XERCESC_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+# The library is installed into the lib directory of the install prefix.
+install (TARGETS gadgetron_cartesian DESTINATION lib)
diff --git a/gadgets/cartesian/CartesianToGenericGadget.cpp b/gadgets/cartesian/CartesianToGenericGadget.cpp
new file mode 100644
index 0000000..f926922
--- /dev/null
+++ b/gadgets/cartesian/CartesianToGenericGadget.cpp
@@ -0,0 +1,97 @@
+#include "CartesianToGenericGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron{
+
+  // Registers "1" as the default matrix size multiple (rounding up to a multiple of 1 changes nothing).
+  CartesianToGenericGadget::CartesianToGenericGadget() {
+    set_parameter("matrix_size_as_a_multipluple_of", "1");
+  }
+  // No explicit cleanup is performed here.
+  CartesianToGenericGadget::~CartesianToGenericGadget() {}
+  
+  // Reads the ISMRMRD XML header in mb: stores the encoded matrix size (x,y), rounded up to a multiple of
+  // "matrix_size_as_a_multipluple_of", and the center of the kspace_encoding_step_1 limits. GADGET_FAIL on bad input.
+  int CartesianToGenericGadget::process_config(ACE_Message_Block* mb)
+  {
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+    if( cfg.get() == 0x0 ){
+      GADGET_DEBUG1("Unable to parse Ismrmrd header\n");
+      return GADGET_FAIL;
+    }
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", int(e_seq.size())); // %d expects an int
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+    // The matrix size is rounded up to a multiple of the "warp size"; a value below 1 would divide by zero.
+    const int multiple = get_int_value("matrix_size_as_a_multipluple_of");
+    if (multiple < 1 || !e_limits.kspace_encoding_step_1().present()) {
+      GADGET_DEBUG2("Bad config: size multiple %d (must be >= 1) or no kspace_encoding_step_1 limits\n", multiple);
+      return GADGET_FAIL;
+    }
+    warp_size_ = static_cast<unsigned int>(multiple);
+    matrix_size_.clear(); // keep exactly two entries even if this is called more than once
+    matrix_size_.push_back( (e_space.matrixSize().x()+warp_size_-1)/warp_size_*warp_size_);
+    matrix_size_.push_back( (e_space.matrixSize().y()+warp_size_-1)/warp_size_*warp_size_);
+    center_phase_ = e_limits.kspace_encoding_step_1().get().center();
+    return GADGET_OK;
+  }
+
+  // Turns one Cartesian readout into "generic" (trajectory-based) form: a float array holding
+  // 3 values per sample (x, y, dcw) is attached as continuation of m2 and m1 is put on the queue
+  // of the next gadget. Noise measurements are released instead. GADGET_FAIL if queuing fails.
+  int CartesianToGenericGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust, but just in case...
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    size_t samples_per_readout = m1->getObjectPtr()->number_of_samples;
+
+    // Offsets that move the acquired center to the middle of the matrix (in case of partial Fourier).
+    // Signed on purpose: a center beyond half the matrix size must give a negative offset
+    // instead of wrapping around to a huge unsigned value.
+    const long offset_readout = long(matrix_size_[0]>>1)-long(m1->getObjectPtr()->center_sample);
+    const long offset_phase = long(matrix_size_[1]>>1)-long(center_phase_);
+    const long phase_encode_step = m1->getObjectPtr()->idx.kspace_encode_step_1;
+
+    // Make a new array (3 x samples_per_readout) as continuation of m2
+    std::vector<size_t> trajectory_dimensions;
+    trajectory_dimensions.push_back(3);
+    trajectory_dimensions.push_back(samples_per_readout);
+
+    GadgetContainerMessage< hoNDArray<float> > *cont = new GadgetContainerMessage< hoNDArray<float> >();
+    cont->getObjectPtr()->create(&trajectory_dimensions);
+    m2->cont(cont);
+
+    float *traj_ptr = cont->getObjectPtr()->get_data_ptr();
+
+    // trajectory y (normalized to [-0.5;0.5]) is the same for every sample of this readout
+    const float traj_y = float(phase_encode_step+offset_phase)/float(matrix_size_[1])-0.5f;
+
+    for( size_t sample=0; sample<samples_per_readout; sample++ ){
+      // trajectory x (normalized to [-0.5;0.5])
+      traj_ptr[sample*3+0] = float(long(sample)+offset_readout)/float(matrix_size_[0])-0.5f;
+      traj_ptr[sample*3+1] = traj_y;
+      traj_ptr[sample*3+2] = 1.0f; // dcw
+    }
+
+    if (this->next()->putq(m1) < 0) {
+      GADGET_DEBUG1("Failed to put job on queue.\n");
+      return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(CartesianToGenericGadget)
+}
diff --git a/gadgets/cartesian/CartesianToGenericGadget.h b/gadgets/cartesian/CartesianToGenericGadget.h
new file mode 100644
index 0000000..e6f92e5
--- /dev/null
+++ b/gadgets/cartesian/CartesianToGenericGadget.h
@@ -0,0 +1,43 @@
+#ifndef CartesianToGenericGadget_H
+#define CartesianToGenericGadget_H
+#pragma once
+
+#include "gadgetron_cartesian_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+
+#include <ismrmrd.h>
+#include <vector>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_CARTESIAN CartesianToGenericGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+    // Gadget for Cartesian readouts: process() attaches a per-sample trajectory array.
+  public:
+    GADGET_DECLARE(CartesianToGenericGadget);
+
+    CartesianToGenericGadget();
+    virtual ~CartesianToGenericGadget();
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    // Drops noise readouts; otherwise appends (x, y, density weight) per sample and forwards m1.
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+    
+  private:
+    std::vector<unsigned int> matrix_size_; // [0] and [1] scale the x/y trajectory to [-0.5;0.5]
+    unsigned short center_phase_;           // subtracted from matrix_size_[1]/2 (partial Fourier offset)
+
+    // We can enforce the encoding space dimension 
+    // to be a multiple of the "warp size" (required for the gpu nfft)
+    unsigned int warp_size_; 
+  };
+}
+#endif //CartesianToGenericGadget_H
diff --git a/gadgets/cartesian/gadgetron_cartesian_export.h b/gadgets/cartesian/gadgetron_cartesian_export.h
new file mode 100644
index 0000000..9e4dce8
--- /dev/null
+++ b/gadgets/cartesian/gadgetron_cartesian_export.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#if defined (WIN32)
+#ifdef __BUILD_GADGETRON_CARTESIAN__
+#define EXPORTGADGETS_CARTESIAN __declspec(dllexport)
+#else  // __BUILD_GADGETRON_CARTESIAN__ not defined: symbols are imported
+#define EXPORTGADGETS_CARTESIAN __declspec(dllimport)
+#endif // __BUILD_GADGETRON_CARTESIAN__
+#else  // not WIN32: the macro expands to nothing
+#define EXPORTGADGETS_CARTESIAN
+#endif // WIN32
diff --git a/gadgets/dicom/CMakeLists.txt b/gadgets/dicom/CMakeLists.txt
new file mode 100644
index 0000000..0639800
--- /dev/null
+++ b/gadgets/dicom/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Preprocessor definitions needed when compiling against the DCMTK headers
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_CONFIG_H -D_REENTRANT -D_OSF_SOURCE")
+
+set(GT_DICOM_LIBRARIES
+    z
+    ${DCMTK_dcmdata_LIBRARY}
+    ${DCMTK_oflog_LIBRARY}
+    ${DCMTK_ofstd_LIBRARY}
+    m
+    #rt
+    #nsl
+    pthread)
+
+# sanity check (uncomment to print the DCMTK settings at configure time):
+#message("DCMTK ${DCMTK_HOME}")
+#message("Include: ${DCMTK_INCLUDE_DIRS}")
+#message("Libraries: ${GT_DICOM_LIBRARIES}")
+
+include_directories(
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core    # for GadgetIsmrmrdReadWrite.h
+    ${DCMTK_INCLUDE_DIRS})
+
+set(GT_DICOM_LIB gadgetron_dicom)
+add_library(${GT_DICOM_LIB} SHARED
+    DicomFinishGadget.cpp
+    DicomImageWriter.cpp
+    ${ISMRMRD_XSD_SOURCE})
+
+target_link_libraries(
+    ${GT_DICOM_LIB}
+    ${ISMRMRD_LIBRARIES}
+    optimized ${ACE_LIBRARIES}
+    debug ${ACE_DEBUG_LIBRARY}
+    ${XERCESC_LIBRARIES}
+    ${GT_DICOM_LIBRARIES})
+
+install(
+    FILES DicomFinishGadget.h DicomImageWriter.h gadgetron_dicom_export.h
+    DESTINATION include)
+
+install(TARGETS ${GT_DICOM_LIB} DESTINATION lib)
+
+install(FILES dicom.xml DESTINATION config)
diff --git a/gadgets/dicom/DicomFinishGadget.cpp b/gadgets/dicom/DicomFinishGadget.cpp
new file mode 100644
index 0000000..25154b5
--- /dev/null
+++ b/gadgets/dicom/DicomFinishGadget.cpp
@@ -0,0 +1,824 @@
+// DICOM includes
+#include "dcmtk/config/osconfig.h"
+#include "dcmtk/ofstd/ofstdinc.h"
+#define INCLUDE_CSTDLIB
+#define INCLUDE_CSTDIO
+#define INCLUDE_CSTRING
+#include "dcmtk/dcmdata/dctk.h"
+#include "dcmtk/dcmdata/dcostrmb.h"
+
+#include <vector>
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "DicomFinishGadget.h"
+
+using namespace std;
+
+// Used for windowing using short ints
+#define PIX_RANGE_MAX    (+32767)
+#define PIX_RANGE_MIN    (-32768)
+
+
+// Writes string s to DICOM tag k of the caller's local `dataset`, keeping the result
+// in the caller's local `status`; on failure it logs and returns GADGET_FAIL from the caller
+#define WRITE_DCM_STRING(k, s)    \
+    do {                                                                    \
+        status = dataset->putAndInsertString(k, s);            \
+        if (!status.good()) {                                               \
+            GADGET_DEBUG2("Failed to insert DICOM field (0x%04X,0x%04X) at "\
+                "line %u\n", k.getGroup(), k.getElement(), __LINE__);       \
+            return GADGET_FAIL;                                             \
+        }                                                                   \
+    } while (0)
+
+namespace Gadgetron {
+
+/**
+ * Fills the DICOM meta info and dataset of dcmFile with the fields that are
+ * derived from the ISMRMRD XML header.
+ *
+ * @param mb message block whose read pointer addresses the XML header text
+ * @return GADGET_OK on success, or (without writing anything) when the header
+ *         has no DICOM parameters; GADGET_FAIL if the DICOM dictionary is not
+ *         loaded, a required header section is missing or an insert fails
+ */
+template <typename T>
+int DicomFinishGadget<T>::process_config(ACE_Message_Block* mb)
+{
+    OFCondition status;
+    DcmTagKey key;
+    const long BUFSIZE = 1024;
+    char buf[BUFSIZE];  // stack buffer for writing numbers as strings in DCMTK; early returns cannot leak it
+
+    // Parse ISMRMRD XML header
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(string(mb->rd_ptr()));
+
+    //GADGET_DEBUG1("Processing XML config in DicomFinishGadget\n");
+
+    // Ensure DICOM dictionary is loaded
+    if (!dcmDataDict.isDictionaryLoaded()) {
+        GADGET_DEBUG1("Dictionary not loaded!  Set DCMDICTPATH\n");
+        return GADGET_FAIL;
+    }
+
+    ISMRMRD::experimentalConditionsType exp_cond = cfg->experimentalConditions();
+
+    if (!cfg->subjectInformation().present()) {
+        GADGET_DEBUG1("Header missing SubjectInformation parameters\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::subjectInformationType patient_info = cfg->subjectInformation().get();
+
+    if (!cfg->studyInformation().present()) {
+        GADGET_DEBUG1("Header missing StudyInformation parameters\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::studyInformationType study_info = cfg->studyInformation().get();
+
+    if (!cfg->measurementInformation().present()) {
+        GADGET_DEBUG1("Header missing MeasurementInformation parameters\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::measurementInformationType meas_info = cfg->measurementInformation().get();
+
+    if (!cfg->acquisitionSystemInformation().present()) {
+        GADGET_DEBUG1("Header missing AcquisitionSystemInformation parameters\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::acquisitionSystemInformationType sys_info = cfg->acquisitionSystemInformation().get();
+
+    if (!cfg->sequenceParameters().present()) {
+        GADGET_DEBUG1("Header missing SequenceTiming parameters\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::sequenceParametersType seq_info = cfg->sequenceParameters().get();
+
+    // Ensure that the XML header contains the DICOM parameters
+    if (!cfg->dicomParameters().present()) {
+        GADGET_DEBUG1("Header missing DICOM parameters\n");
+        return GADGET_OK;
+    }
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+
+    ISMRMRD::dicomParametersType dcm_params = cfg->dicomParameters().get();
+    ISMRMRD::MRImageModule mr_image(dcm_params.MRImageModule().get());
+
+    DcmDataset *dataset = dcmFile.getDataset();
+    DcmMetaInfo *metainfo = dcmFile.getMetaInfo();
+
+
+    // Store initial Series Number for later
+    if (meas_info.initialSeriesNumber().present()) {
+        this->initialSeriesNumber = meas_info.initialSeriesNumber().get();
+    } else {
+        this->initialSeriesNumber = 0;
+    }
+
+
+    // Set the Application Entity Title in the DICOM Meta Info section
+    // The rest of the Meta Info will be automatically populated by DCMTK
+    string aet("none");  // fallback when the header names no station
+    if (sys_info.stationName().present()) {
+        aet = sys_info.stationName().get();
+    }
+    status = metainfo->putAndInsertString(DcmTagKey(0x0002,0x0016), aet.c_str());
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to set AET in MetaInfo\n");
+        return GADGET_FAIL;
+    }
+
+    // Group Length
+    key.set(0x0008, 0x0000);
+    status = dataset->insertEmptyElement(key);
+    if (status.bad()) {
+        GADGET_DEBUG1("Failed to write 0x0008 Group Length\n");
+        return GADGET_FAIL;
+    }
+
+    // Specific Character Set
+    key.set(0x0008, 0x0005);
+    WRITE_DCM_STRING(key, "ISO_IR 100");
+
+    // Image Type
+    key.set(0x0008, 0x0008);
+    if (mr_image.imageType().present()) {
+        WRITE_DCM_STRING(key, mr_image.imageType().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "ORIGINAL\\PRIMARY\\OTHER");
+    }
+
+    // SOPClassUID
+    key.set(0x0008, 0x0016);
+    WRITE_DCM_STRING(key, UID_MRImageStorage);
+
+    // Study Date
+    key.set(0x0008, 0x0020);
+    snprintf(buf, BUFSIZE, "%04d%02d%02d", study_info.studyDate().year(),
+            study_info.studyDate().month(), study_info.studyDate().day());
+    WRITE_DCM_STRING(key, buf);
+
+    // Series Date
+    key.set(0x0008, 0x0021);
+    snprintf(buf, BUFSIZE, "%04d%02d%02d", meas_info.seriesDate().year(),
+            meas_info.seriesDate().month(), meas_info.seriesDate().day());
+    WRITE_DCM_STRING(key, buf);
+    // Acquisition Date
+    key.set(0x0008, 0x0022);
+    WRITE_DCM_STRING(key, buf);
+    // Content Date
+    key.set(0x0008, 0x0023);
+    WRITE_DCM_STRING(key, buf);
+
+    // Study Time
+    key.set(0x0008, 0x0030);
+    snprintf(buf, BUFSIZE, "%02d%02d%02d", study_info.studyTime().hours(),
+            study_info.studyTime().minutes(), (int)study_info.studyTime().seconds());
+    WRITE_DCM_STRING(key, buf);
+
+    // Series Time
+    key.set(0x0008, 0x0031);
+    snprintf(buf, BUFSIZE, "%02d%02d%02d", meas_info.seriesTime().hours(),
+            meas_info.seriesTime().minutes(), (int)meas_info.seriesTime().seconds());
+    WRITE_DCM_STRING(key, buf);
+
+    // Acquisition Time
+    key.set(0x0008, 0x0032);
+    WRITE_DCM_STRING(key, buf);
+
+    // Content Time
+    key.set(0x0008, 0x0033);
+    WRITE_DCM_STRING(key, buf);
+
+    // Accession Number
+    key.set(0x0008, 0x0050);
+    if (study_info.accessionNumber().present()) {
+        snprintf(buf, BUFSIZE, "%d", (int)study_info.accessionNumber().get());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, 0);
+    }
+
+    // Modality
+    // TODO: this is hardcoded!!
+    key.set(0x0008, 0x0060);
+    WRITE_DCM_STRING(key, "MR");
+
+    // Manufacturer
+    key.set(0x0008, 0x0070);
+    if (sys_info.systemVendor().present()) {
+        WRITE_DCM_STRING(key, sys_info.systemVendor().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "UNKNOWN");
+    }
+
+    // Institution Name
+    key.set(0x0008, 0x0080);
+    if (sys_info.institutionName().present()) {
+        WRITE_DCM_STRING(key, sys_info.institutionName().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "UNKNOWN");
+    }
+
+    // Referring Physician's Name
+    key.set(0x0008, 0x0090);
+    if (study_info.referringPhysicianName().present()) {
+        WRITE_DCM_STRING(key, study_info.referringPhysicianName().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Station Name
+    key.set(0x0008, 0x1010);
+    if (sys_info.stationName().present()) {
+        WRITE_DCM_STRING(key, sys_info.stationName().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Study Description
+    key.set(0x0008, 0x1030);
+    if (study_info.studyDescription().present()) {
+        WRITE_DCM_STRING(key, study_info.studyDescription().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Series Description
+    key.set(0x0008, 0x103E);
+    if (meas_info.seriesDescription().present()) {
+        WRITE_DCM_STRING(key, meas_info.seriesDescription().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Manufacturer's Model Name
+    key.set(0x0008, 0x1090);
+    if (sys_info.systemModel().present()) {
+        WRITE_DCM_STRING(key, sys_info.systemModel().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Referenced SOP Instance UIDs
+    if (dcm_params.referencedImageSequence().present()) {
+        ISMRMRD::referencedImageSequence refs = dcm_params.referencedImageSequence().get();
+        DcmItem *ref_sequence;
+        string ref_uid;
+        for (unsigned int i = 0; i < refs.referencedSOPInstanceUID().size(); i++) {
+            ref_uid = refs.referencedSOPInstanceUID()[i];
+            key.set(0x0008, 0x1140);  // Referenced Image Sequence; set on every pass because key is reused below
+            if (ref_uid.length() > 0) {   // Only write non-empty strings
+                if (dataset->findOrCreateSequenceItem(key, ref_sequence, -2 /* append */).good()) {
+                    // Write the Referenced SOPClassUID (MRImageStorage)
+                    key.set(0x0008, 0x1150);
+                    ((DcmDataset *)ref_sequence)->putAndInsertString(key, UID_MRImageStorage);
+                    // Write the Referenced SOPInstanceUID
+                    key.set(0x0008, 0x1155);
+                    ((DcmDataset *)ref_sequence)->putAndInsertString(key, ref_uid.c_str());
+                }
+            }
+        }
+    }
+
+    // Group Length
+    key.set(0x0010, 0x0000);
+    status = dataset->insertEmptyElement(key);
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to write 0x0010 Group Length\n");
+        return GADGET_FAIL;
+    }
+
+    // Patient Name
+    key.set(0x0010, 0x0010);
+    if (patient_info.patientName().present()) {
+        WRITE_DCM_STRING(key, patient_info.patientName().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "None");
+    }
+
+    // Patient ID
+    key.set(0x0010, 0x0020);
+    if (patient_info.patientID().present()) {
+        WRITE_DCM_STRING(key, patient_info.patientID().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "0");
+    }
+
+    // Patient Birthdate
+    key.set(0x0010, 0x0030);
+    if (patient_info.patientBirthdate().present()) {
+        snprintf(buf, BUFSIZE, "%04d%02d%02d", patient_info.patientBirthdate().get().year(),
+                patient_info.patientBirthdate().get().month(), patient_info.patientBirthdate().get().day());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        status = dataset->insertEmptyElement(key);
+    }
+
+    // Patient Sex
+    key.set(0x0010, 0x0040);
+    if (patient_info.patientGender().present()) {
+        if (patient_info.patientGender().get() == "O") {
+            status = dataset->insertEmptyElement(key);
+        }
+        else {
+            WRITE_DCM_STRING(key, patient_info.patientGender().get().c_str());
+        }
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Patient Age
+    key.set(0x0010, 0x1010);
+    if (patient_info.patientBirthdate().present()) {
+        snprintf(buf, BUFSIZE, "%03uY", meas_info.seriesDate().year() -
+                patient_info.patientBirthdate().get().year());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, "000Y");
+    }
+
+    // Patient Weight
+    key.set(0x0010, 0x1030);
+    if (patient_info.patientWeight_kg().present()) {
+        snprintf(buf, BUFSIZE, "%f", patient_info.patientWeight_kg().get());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, "0.0");
+    }
+
+    // Group Length
+    key.set(0x0018, 0x0000);
+    status = dataset->insertEmptyElement(key);
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to write 0x0018 Group Length\n");
+        return GADGET_FAIL;
+    }
+
+    // Scanning Sequence
+    key.set(0x0018, 0x0020);
+    if (mr_image.scanningSequence().present()) {
+        WRITE_DCM_STRING(key, mr_image.scanningSequence().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "RM");
+    }
+
+    // Sequence Variant
+    key.set(0x0018, 0x0021);
+    if (mr_image.sequenceVariant().present()) {
+        WRITE_DCM_STRING(key, mr_image.sequenceVariant().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "NONE");
+    }
+
+    // Scan Options
+    key.set(0x0018, 0x0022);
+    if (mr_image.scanOptions().present()) {
+        WRITE_DCM_STRING(key, mr_image.scanOptions().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "NONE");
+    }
+
+    // Acquisition Type
+    key.set(0x0018, 0x0023);
+    if (mr_image.mrAcquisitionType().present()) {
+        WRITE_DCM_STRING(key, mr_image.mrAcquisitionType().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "2D");
+    }
+
+    // Angio Flag
+    // TODO: hardcoded
+    key.set(0x0018, 0x0025);
+    WRITE_DCM_STRING(key, "N");
+
+    // Slice Thickness
+    // This will need updated if the "reconSpace.fieldOfView_mm.z" field
+    // is changed in the ISMRMRD populating code (client)
+    key.set(0x0018, 0x0050);
+    snprintf(buf, BUFSIZE, "%f", cfg->encoding().front().reconSpace().fieldOfView_mm().z());
+    WRITE_DCM_STRING(key, buf);
+
+    // Repetition Time
+    key.set(0x0018, 0x0080);
+    snprintf(buf, BUFSIZE, "%f", seq_info.TR().front());
+    WRITE_DCM_STRING(key, buf);
+
+    // Echo Time
+    key.set(0x0018, 0x0081);
+    snprintf(buf, BUFSIZE, "%f", seq_info.TE().front());
+    WRITE_DCM_STRING(key, buf);
+
+    // Inversion Time
+    key.set(0x0018, 0x0082);
+    snprintf(buf, BUFSIZE, "%f", seq_info.TI().front());
+    WRITE_DCM_STRING(key, buf);
+
+    // Imaging Frequency in tenths of MHz ???
+    key.set(0x0018, 0x0084);
+    snprintf(buf, BUFSIZE, "%f", (float)exp_cond.H1resonanceFrequency_Hz() / 10000000.);
+    WRITE_DCM_STRING(key, buf);
+
+    // Magnetic Field Strength (T)
+    key.set(0x0018, 0x0087);
+    if (sys_info.systemFieldStrength_T().present()) {
+        snprintf(buf, BUFSIZE, "%f", sys_info.systemFieldStrength_T().get());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, "3.0");
+    }
+
+    // Spacing Between Slices
+    key.set(0x0018, 0x0088);
+    snprintf(buf, BUFSIZE, "%f", cfg->encoding().front().reconSpace().fieldOfView_mm().z());
+    WRITE_DCM_STRING(key, buf);
+
+    // Echo Train Length
+    key.set(0x0018, 0x0091);
+    if (mr_image.echoTrainLength().present()) {
+        snprintf(buf, BUFSIZE, "%ld", (long)mr_image.echoTrainLength().get());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, "1");
+    }
+
+    // Percent Sampling
+    // TODO: hardcoded
+    key.set(0x0018, 0x0093);
+    WRITE_DCM_STRING(key, "100");
+
+    // Percent Phase FOV
+    // TODO: hardcoded
+    key.set(0x0018, 0x0094);
+    WRITE_DCM_STRING(key, "100");
+
+    // Protocol Name
+    key.set(0x0018, 0x1030);
+    if (meas_info.protocolName().present()) {
+        WRITE_DCM_STRING(key, meas_info.protocolName().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "");
+    }
+
+    // Trigger Time
+    key.set(0x0018, 0x1060);
+    if (mr_image.triggerTime().present()) {
+        snprintf(buf, BUFSIZE, "%f", mr_image.triggerTime().get());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, "0.0");
+    }
+
+    // Reconstruction Diameter (FOV)
+    // TODO: hmm (no value is written for this tag yet)
+    key.set(0x0018, 0x1100);
+
+    // Frequency Encoding Direction
+    key.set(0x0018, 0x1312);
+    if (mr_image.freqEncodingDirection().present()) {
+        WRITE_DCM_STRING(key, mr_image.freqEncodingDirection().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "ROW");
+    }
+
+    // Flip Angle
+    key.set(0x0018, 0x1314);
+    if (mr_image.flipAngle_deg().present()) {
+        snprintf(buf, BUFSIZE, "%d", (int)mr_image.flipAngle_deg().get());
+        WRITE_DCM_STRING(key, buf);
+    } else {
+        WRITE_DCM_STRING(key, "0");
+    }
+
+    // Patient Position
+    key.set(0x0018, 0x5100);
+    WRITE_DCM_STRING(key, meas_info.patientPosition().c_str());
+
+    // Group Length
+    key.set(0x0020, 0x0000);
+    status = dataset->insertEmptyElement(key);
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to write 0x0020 Group Length\n");
+        return GADGET_FAIL;
+    }
+
+    // Study Instance UID
+    key.set(0x0020, 0x000D);
+    WRITE_DCM_STRING(key, dcm_params.studyInstanceUID().c_str());
+
+    // Study ID
+    key.set(0x0020, 0x0010);
+    if (study_info.studyID().present()) {
+        WRITE_DCM_STRING(key, study_info.studyID().get().c_str());
+    } else {
+        WRITE_DCM_STRING(key, "0");
+    }
+
+    // Store Series Instance UID for later
+    if (dcm_params.seriesInstanceUIDRoot().present()) {
+        seriesIUIDRoot = dcm_params.seriesInstanceUIDRoot().get();
+    }
+
+    // Frame of Reference UID
+    if (dcm_params.frameOfReferenceUID().present()) {
+        key.set(0x0020, 0x0052);
+        WRITE_DCM_STRING(key, dcm_params.frameOfReferenceUID().get().c_str());
+    }
+
+    /****************************************/
+    // Group Length
+    key.set(0x0028, 0x0000);
+    status = dataset->insertEmptyElement(key);
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to write 0x0028 Group Length\n");
+        return GADGET_FAIL;
+    }
+
+    // Samples Per Pixel
+    key.set(0x0028, 0x0002);
+    // TODO: hardcoded
+    WRITE_DCM_STRING(key, "1");
+
+    // Photometric Interpretation
+    key.set(0x0028, 0x0004);
+    // TODO: hardcoded
+    WRITE_DCM_STRING(key, "MONOCHROME2");
+
+    // Pixel Spacing (Array of len 2)
+    key.set(0x0028, 0x0030);
+    float pixel_spacing_X = r_space.fieldOfView_mm().x() / r_space.matrixSize().x();
+    float pixel_spacing_Y = r_space.fieldOfView_mm().y() / r_space.matrixSize().y();
+    snprintf(buf, BUFSIZE, "%.3f\\%.3f", pixel_spacing_X, pixel_spacing_Y);
+    WRITE_DCM_STRING(key, buf);
+
+    // Bits Allocated
+    key.set(0x0028, 0x0100);
+    WRITE_DCM_STRING(key, "16");
+    // Bits Stored
+    key.set(0x0028, 0x0101);
+    WRITE_DCM_STRING(key, "16");
+    // High Bit
+    key.set(0x0028, 0x0102);
+    WRITE_DCM_STRING(key, "15");
+    // Pixel Representation
+    key.set(0x0028, 0x0103);
+    WRITE_DCM_STRING(key, "1");
+
+    //GADGET_DEBUG1("Finished populating DICOM fields\n");
+
+    // buf is a stack array, so there is nothing to clean up here
+    return GADGET_OK;
+}
+
+/**
+ * Converts one image to 16-bit pixels, fills in the per-image DICOM fields
+ * and hands a copy of the resulting DICOM file to the controller.
+ *
+ * @param m1 image header; its continuation is replaced by the 16-bit pixels
+ * @param m2 pixel data of type T; released after conversion
+ * @return GADGET_OK on success; GADGET_FAIL (or -1 without a controller) on error
+ */
+template <typename T>
+int DicomFinishGadget<T>::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+        GadgetContainerMessage< hoNDArray< T > >* m2)
+{
+    if (!this->controller_) {
+        ACE_DEBUG( (LM_DEBUG,
+                    ACE_TEXT("Cannot return result to controller, no controller set")) );
+        return -1;
+    }
+
+    GadgetContainerMessage<hoNDArray< ACE_INT16 > > *pixels =
+            new GadgetContainerMessage<hoNDArray< ACE_INT16 > >();
+    boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+
+    try {
+        pixels->getObjectPtr()->create(dims.get());
+    } catch (bad_alloc& err) {
+        GADGET_DEBUG1("Unable to create short storage in DicomFinishGadget");
+        pixels->release();  // not attached to m1 yet, so it must be freed here
+        return GADGET_FAIL;
+    }
+
+    /* create ImageHeader and hoNDArray pointers for better readability */
+    ISMRMRD::ImageHeader *img = m1->getObjectPtr();
+    hoNDArray<ACE_INT16>* data = pixels->getObjectPtr();
+
+    /* grab pointers to both the original and new data arrays
+     * The original is of type T
+     * The new is of type ACE_INT16 */
+    T *src = m2->getObjectPtr()->get_data_ptr();
+    ACE_INT16 *dst = data->get_data_ptr();
+
+    /* Convert/cast each element in the data array
+     * and simultaneously find the min/max pixel value, which
+     * will be used later for some crude windowing */
+    const size_t num_pixels = data->get_number_of_elements();
+    T min_pix_val = (num_pixels > 0) ? src[0] : T(0);  // defined even for an empty image
+    T max_pix_val = min_pix_val;
+    // accumulate in double: a sum held in T overflows for integer pixel types
+    double sum_pix_val = 0.0;
+    for (size_t i = 0; i < num_pixels; i++) {
+        T pix_val = src[i];
+        // search for minimum and maximum pixel values
+        if (pix_val < min_pix_val) min_pix_val = pix_val;
+        if (pix_val > max_pix_val) max_pix_val = pix_val;
+        sum_pix_val += static_cast<double>(pix_val);
+
+        // copy/cast the pixel value to a short int
+        dst[i] = static_cast<ACE_INT16>(pix_val);
+    }
+    const double mean_pix_val = (num_pixels > 0) ? (sum_pix_val / num_pixels) : 0.0;
+
+    /* replace the old 'message2' with the new data */
+    m1->cont(pixels);
+    /* release the old data array */
+    m2->release();
+    /* update the image_data_type.
+     * There is currently no SIGNED SHORT type so this will have to suffice */
+    m1->getObjectPtr()->image_data_type = ISMRMRD::DATA_UNSIGNED_SHORT;
+
+    const unsigned int BUFSIZE = 1024;
+    char buf[BUFSIZE];  // stack buffer for snprintf; early returns cannot leak it
+    OFCondition status;
+    DcmTagKey key;
+    DcmDataset *dataset = dcmFile.getDataset();
+
+    // Echo Number
+    // TODO: it is often the case the img->contrast is not properly set
+    // likely due to the allocated ISMRMRD::ImageHeader being uninitialized
+    key.set(0x0018, 0x0086);
+    snprintf(buf, BUFSIZE, "%d", img->contrast);
+    WRITE_DCM_STRING(key, buf);
+
+    // Acquisition Matrix ... Image Dimensions
+    // Defined as: [frequency rows, frequency columns, phase rows, phase columns]
+    // But at this point in the gadget I don't know the frequency encode direction
+    key.set(0x0018, 0x1310);
+    ACE_UINT16 im_dim[4] = {0,0,0,0};
+    // COLUMN frequency encoding is assumed here; for ROW the sizes would go
+    // into im_dim[1] (matrix_size[0]) and im_dim[2] (matrix_size[1]) instead
+    im_dim[0] = img->matrix_size[0];
+    im_dim[3] = img->matrix_size[1];
+    status = dataset->putAndInsertUint16Array(key, im_dim, 4);
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to stuff image dimensions\n");
+        return GADGET_FAIL;
+    }
+
+    // Series Number
+    // Always written: initial series number * 100 plus this image's series index
+    key.set(0x0020, 0x0011);
+    snprintf(buf, BUFSIZE, "%ld", this->initialSeriesNumber * 100 + img->image_series_index);
+    WRITE_DCM_STRING(key, buf);
+
+    // Image Number
+    key.set(0x0020, 0x0013);
+    snprintf(buf, BUFSIZE, "%d", img->image_index + 1);
+    WRITE_DCM_STRING(key, buf);
+
+    // Image Position (Patient)
+    float corner[3];
+
+    corner[0] = img->position[0] -
+            (img->field_of_view[0] / 2.0) * img->read_dir[0] -
+            (img->field_of_view[1] / 2.0) * img->phase_dir[0];
+    corner[1] = img->position[1] -
+            (img->field_of_view[0] / 2.0) * img->read_dir[1] -
+            (img->field_of_view[1] / 2.0) * img->phase_dir[1];
+    corner[2] = img->position[2] -
+            (img->field_of_view[0] / 2.0) * img->read_dir[2] -
+            (img->field_of_view[1] / 2.0) * img->phase_dir[2];
+
+    key.set(0x0020, 0x0032);
+    snprintf(buf, BUFSIZE, "%.4f\\%.4f\\%.4f", corner[0], corner[1], corner[2]);
+    WRITE_DCM_STRING(key, buf);
+
+    // Image Orientation
+    // read_dir, phase_dir, and slice_dir were calculated in
+    // a DICOM/patient coordinate system, so just plug them in
+    key.set(0x0020, 0x0037);
+    snprintf(buf, BUFSIZE, "%.4f\\%.4f\\%.4f\\%.4f\\%.4f\\%.4f",
+            img->read_dir[0], img->read_dir[1], img->read_dir[2],
+            img->phase_dir[0], img->phase_dir[1], img->phase_dir[2]);
+    WRITE_DCM_STRING(key, buf);
+
+    // Slice Location
+    key.set(0x0020, 0x1041);
+    snprintf(buf, BUFSIZE, "%f", img->position[2]);
+    WRITE_DCM_STRING(key, buf);
+
+    // Columns
+    key.set(0x0028, 0x0010);
+    snprintf(buf, BUFSIZE, "%d", img->matrix_size[0]);
+    WRITE_DCM_STRING(key, buf);
+
+    // Rows
+    key.set(0x0028, 0x0011);
+    snprintf(buf, BUFSIZE, "%d", img->matrix_size[1]);
+    WRITE_DCM_STRING(key, buf);
+
+    // Simple windowing using pixel values calculated earlier...
+    int mid_pix_val = (max_pix_val + min_pix_val) / 2;
+    int window_center = static_cast<int>((mid_pix_val + mean_pix_val) / 2);
+    int window_width_left = window_center - min_pix_val;
+    int window_width_right = max_pix_val - window_center;
+    int window_width = (window_width_right > window_width_left) ?
+            window_width_right : window_width_left;
+
+    // Window Center
+    key.set(0x0028, 0x1050);
+    snprintf(buf, BUFSIZE, "%d", window_center);
+    WRITE_DCM_STRING(key, buf);
+
+    // Window Width
+    key.set(0x0028, 0x1051);
+    snprintf(buf, BUFSIZE, "%d", window_width);
+    WRITE_DCM_STRING(key, buf);
+
+    // ACR_NEMA_2C_VariablePixelDataGroupLength
+    key.set(0x7fe0, 0x0000);
+    status = dataset->insertEmptyElement(key);
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to write 0x7fe0 Group Length\n");
+        return GADGET_FAIL;
+    }
+
+    // Pixel Data
+    if ((unsigned long)img->matrix_size[0] * (unsigned long)img->matrix_size[1] !=
+                data->get_number_of_elements()) {
+        GADGET_DEBUG1("Mismatch in image dimensions and available data\n");
+        return GADGET_FAIL;
+    }
+    key.set(0x7fe0, 0x0010);
+    status = dataset->putAndInsertUint16Array(key, (unsigned short *)data->get_data_ptr(),
+            data->get_number_of_elements());
+    if (!status.good()) {
+        GADGET_DEBUG1("Failed to stuff Pixel Data\n");
+        return GADGET_FAIL;
+    }
+
+    // Series Instance UID = generated here
+    key.set(0x0020, 0x000E);
+    unsigned short series_number = img->image_series_index + 1;
+
+    // Try to find an already-generated Series Instance UID in our map
+    std::map<unsigned int, string>::iterator it = seriesIUIDs.find(series_number);
+
+    if (it == seriesIUIDs.end()) {
+        // Didn't find a Series Instance UID for this series number
+        char prefix[32];
+        char newuid[96];
+        if (seriesIUIDRoot.length() > 20) {
+            memcpy(prefix, seriesIUIDRoot.c_str(), 20);
+            prefix[20] = '\0';
+            dcmGenerateUniqueIdentifier(newuid, prefix);
+        } else {
+            dcmGenerateUniqueIdentifier(newuid);
+        }
+        seriesIUIDs[series_number] = string(newuid);
+    }
+    WRITE_DCM_STRING(key, seriesIUIDs[series_number].c_str());
+
+    // At a minimum, to put the DICOM image back into the database,
+    // you must change the SOPInstanceUID.
+    key.set(0x0008, 0x0018);        // SOPInstanceUID
+    string root;  // owns the prefix: c_str() of a temporary string would dangle
+    if (seriesIUIDRoot.length() > 0) {
+        root = string(seriesIUIDRoot, 0, 20);
+    } else {
+        root = "1.2.840.113619.2.156";
+    }
+    char newuid[65];
+    dcmGenerateUniqueIdentifier(newuid, root.c_str());
+    WRITE_DCM_STRING(key, newuid);
+
+    GadgetContainerMessage<DcmFileFormat>* mdcm = new GadgetContainerMessage<DcmFileFormat>();
+    *mdcm->getObjectPtr() = dcmFile;
+
+    GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+        new GadgetContainerMessage<GadgetMessageIdentifier>();
+    mb->getObjectPtr()->id = GADGET_MESSAGE_DICOM;
+
+    mb->cont(mdcm);
+
+    int ret =  this->controller_->output_ready(mb);
+
+    //GADGET_DEBUG1("Finished Finishing DICOM\n");
+
+    if ( (ret < 0) ) {
+        GADGET_DEBUG1("Failed to return message to controller\n");
+        return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+}
+
+
+//Declare factories for the various template instances
+GADGET_FACTORY_DECLARE(DicomFinishGadgetFLOAT)
+GADGET_FACTORY_DECLARE(DicomFinishGadgetUSHORT)
+//GADGET_FACTORY_DECLARE(DicomFinishGadgetCPLX)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/dicom/DicomFinishGadget.h b/gadgets/dicom/DicomFinishGadget.h
new file mode 100644
index 0000000..f03540c
--- /dev/null
+++ b/gadgets/dicom/DicomFinishGadget.h
@@ -0,0 +1,66 @@
+#ifndef DICOMFINISHGADGET_H
+#define DICOMFINISHGADGET_H
+
+#include "gadgetron_dicom_export.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd.h"
+#include "GadgetStreamController.h"
+
+#include <string>
+#include <map>
+#include <complex>
+
+
+namespace Gadgetron {
+
+/// Gadget2 stage receiving an ISMRMRD::ImageHeader with hoNDArray<T> pixel data (DICOM output).
+template <typename T>
+class EXPORTGADGETSDICOM DicomFinishGadget :
+    public Gadget2<ISMRMRD::ImageHeader, hoNDArray< T > >
+{
+    public:
+        DicomFinishGadget<T>()
+            : Gadget2<ISMRMRD::ImageHeader, hoNDArray<T> >()
+            , dcmFile()
+            , seriesIUIDRoot()
+            , initialSeriesNumber(0)   // was never initialized by the constructor
+        { }
+    protected:
+        virtual int process_config(ACE_Message_Block * mb);
+        virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+                GadgetContainerMessage< hoNDArray< T > >* m2);
+    private:
+        DcmFileFormat dcmFile;                              // copied into the outgoing message by process()
+        std::string seriesIUIDRoot;                         // optional prefix for generated UIDs
+        long initialSeriesNumber;
+        std::map <unsigned int, std::string> seriesIUIDs;   // series number -> generated UID
+};
+
+class EXPORTGADGETSDICOM DicomFinishGadgetUSHORT :
+    public DicomFinishGadget<ACE_UINT16>   // concrete gadget for ACE_UINT16 pixel data
+{
+    public:
+        GADGET_DECLARE(DicomFinishGadgetUSHORT);
+};
+
+class EXPORTGADGETSDICOM DicomFinishGadgetFLOAT :
+    public DicomFinishGadget<float>   // concrete gadget for float pixel data
+{
+    public:
+        GADGET_DECLARE(DicomFinishGadgetFLOAT);
+};
+
+/*
+class EXPORTGADGETSDICOM DicomFinishGadgetCPLX :
+    public DicomFinishGadget< std::complex<float> >
+{
+    public:
+        GADGET_DECLARE(DicomFinishGadgetCPLX);
+};
+*/
+
+} /* namespace Gadgetron */
+
+#endif //DICOMFINISHGADGET_H
diff --git a/gadgets/dicom/DicomImageWriter.cpp b/gadgets/dicom/DicomImageWriter.cpp
new file mode 100644
index 0000000..0009082
--- /dev/null
+++ b/gadgets/dicom/DicomImageWriter.cpp
@@ -0,0 +1,105 @@
+#include <complex>
+#include <fstream>
+#include <vector>
+#include <time.h>
+// Gadgetron includes
+#include "GadgetIsmrmrdReadWrite.h"
+#include "DicomImageWriter.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+
+// DCMTK includes
+#include "dcmtk/config/osconfig.h"
+#include "dcmtk/ofstd/ofstdinc.h"
+#define INCLUDE_CSTDLIB
+#define INCLUDE_CSTDIO
+#define INCLUDE_CSTRING
+#include "dcmtk/dcmdata/dctk.h"
+#include "dcmtk/dcmdata/dcostrmb.h"
+
+
+namespace Gadgetron {
+
+/**
+ * Serializes the DcmFileFormat carried by mb and sends, in order, a GadgetMessageIdentifier
+ * (GADGET_MESSAGE_DICOM), a uint32_t byte count and the DICOM bytes over sock. 0 on success; -1 or GADGET_FAIL on failure.
+ */
+int DicomImageWriter::write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+{
+    GadgetContainerMessage<DcmFileFormat>* dcm_file_message = AsContainerMessage<DcmFileFormat>(mb);
+    if (!dcm_file_message) {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), DicomImageWriter::write, invalid image message objects, 1\n")) );
+        return -1;
+    }
+
+    DcmFileFormat *dcmFile = dcm_file_message->getObjectPtr();
+
+/* BEGIN DEBUG
+
+    OFString modality;
+    DcmTagKey key(0x0008, 0x0060);
+    OFCondition s = dcmFile->getDataset()->findAndGetOFString(key, modality);
+    if (s.bad()) {
+        GADGET_DEBUG1("Failed to set Modality\n");
+        return GADGET_FAIL;
+    }
+
+    GADGET_DEBUG2("Verifying that DcmDataset is valid... Modality: %s\n", modality.c_str());
+
+END DEBUG */
+
+    //GADGET_DEBUG1("Initializing transfer state for DICOM file\n");
+    // Initialize transfer state of DcmDataset
+    dcmFile->transferInit();
+
+    // Calculate size of DcmFileFormat and create a SUFFICIENTLY sized buffer. It is heap-allocated:
+    // a variable-length stack array is not standard C++ and can exhaust the stack for large images.
+    const size_t buffer_length = static_cast<size_t>(dcmFile->calcElementLength(EXS_LittleEndianExplicit, EET_ExplicitLength)) * 2;
+    std::vector<char> buffer(buffer_length > 0 ? buffer_length : 1);
+
+    DcmOutputBufferStream out_stream(&buffer[0], static_cast<offile_off_t>(buffer.size()));
+    OFCondition status = dcmFile->write(out_stream, EXS_LittleEndianExplicit, EET_ExplicitLength, NULL);
+    if (!status.good()) {
+        GADGET_DEBUG2("Failed to write DcmFileFormat to DcmOutputStream(%s)\n", status.text());
+        dcmFile->transferEnd(); // do not leave the dataset in a mid-transfer state
+        return GADGET_FAIL;
+    }
+
+    void *serialized = NULL;
+    offile_off_t serialized_length = 0;
+    out_stream.flushBuffer(serialized, serialized_length);
+
+    // finalize transfer state of DcmDataset
+    dcmFile->transferEnd();
+
+    ssize_t send_cnt = 0;
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_DICOM;
+    //GADGET_DEBUG2("Sending GadgetMessageIdentifier %d\n", id.id);
+    if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send DICOM message identifier\n")));
+        return -1;
+    }
+    //GADGET_DEBUG2("Sent GadgetMessageIdentifier %d\n", id.id);
+
+    uint32_t nbytes = (uint32_t)serialized_length;
+    //GADGET_DEBUG2("Sending bytes length %d\n", serialized_length);
+    if ((send_cnt = sock->send_n (&nbytes, sizeof(nbytes))) <= 0) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send DICOM bytes length\n")));
+        return -1;
+    }
+    //GADGET_DEBUG2("Sent bytes length %d\n", serialized_length);
+
+    //GADGET_DEBUG1("Begin sending DICOM image bytes\n");
+    if ((send_cnt = sock->send_n (serialized, serialized_length)) <= 0) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send DICOM bytes\n")));
+        return -1;
+    }
+    //GADGET_DEBUG1("Finished sending DICOM image bytes\n");
+
+    return 0;
+}
+
+GADGETRON_WRITER_FACTORY_DECLARE(DicomImageWriter)
+
+} /* namespace Gadgetron */
diff --git a/gadgets/dicom/DicomImageWriter.h b/gadgets/dicom/DicomImageWriter.h
new file mode 100644
index 0000000..3a5c275
--- /dev/null
+++ b/gadgets/dicom/DicomImageWriter.h
@@ -0,0 +1,22 @@
+#ifndef DICOMIMAGEWRITER_H
+#define DICOMIMAGEWRITER_H
+
+#include "gadgetron_dicom_export.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd.h"
+
+
+namespace Gadgetron {
+
+class EXPORTGADGETSDICOM DicomImageWriter : public GadgetMessageWriter
+{
+ public:
+  virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb); // sends the DcmFileFormat held by mb over sock
+
+  GADGETRON_WRITER_DECLARE(DicomImageWriter);
+};
+
+} /* namespace Gadgetron */
+
+#endif
diff --git a/gadgets/dicom/dicom.xml b/gadgets/dicom/dicom.xml
new file mode 100644
index 0000000..8d7f4d8
--- /dev/null
+++ b/gadgets/dicom/dicom.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+    xmlns="http://gadgetron.sf.net/gadgetron"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetroncore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+        <slot>1012</slot>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomImageWriter</classname>
+    </writer>
+
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetroncore</dll>
+        <classname>AccumulatorGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>FFT</name>
+        <dll>gadgetroncore</dll>
+        <classname>FFTGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>CoilCombinePython</name>
+        <dll>gadgetronpython</dll>
+        <classname>ImagePythonGadget</classname>
+        <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+        <property><name>python_module</name>                <value>rms_coil_combine</value></property>
+        <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+        <property><name>input_function</name>               <value>recon_function</value></property>
+        <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <gadget>
+        <name>Extract</name>
+        <dll>gadgetroncore</dll>
+        <classname>ExtractGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>AutoScale</name>
+        <dll>gadgetroncore</dll>
+        <classname>AutoScaleGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>DicomFinishFLOAT</name>
+        <dll>gadgetron_dicom</dll>
+        <classname>DicomFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/dicom/gadgetron_dicom_export.h b/gadgets/dicom/gadgetron_dicom_export.h
new file mode 100644
index 0000000..4f4a544
--- /dev/null
+++ b/gadgets/dicom/gadgetron_dicom_export.h
@@ -0,0 +1,15 @@
+#ifndef DICOM_EXPORT_H_
+#define DICOM_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (gadgetron_dicom_EXPORTS)
+#define EXPORTGADGETSDICOM __declspec(dllexport) // gadgetron_dicom_EXPORTS set: presumably while building the library itself
+#else
+#define EXPORTGADGETSDICOM __declspec(dllimport) // otherwise the symbols are imported
+#endif
+#else
+#define EXPORTGADGETSDICOM // expands to nothing when WIN32 is not defined
+#endif
+
+#endif /* DICOM_EXPORT_H_ */
diff --git a/gadgets/grappa/CMakeLists.txt b/gadgets/grappa/CMakeLists.txt
new file mode 100644
index 0000000..d242c5f
--- /dev/null
+++ b/gadgets/grappa/CMakeLists.txt
@@ -0,0 +1,35 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GRAPPA__)
+endif (WIN32)
+# Required third-party packages
+find_package(Ismrmrd REQUIRED)
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+# Header search paths inside the source tree
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+)
+# Shared library built from the GRAPPA gadget sources
+add_library(gadgetron_grappa SHARED 
+	GrappaGadget.cpp
+	GrappaCalibrationBuffer.cpp
+	GrappaWeights.cpp
+	GrappaWeightsCalculator.cpp
+	GrappaUnmixingGadget.cpp
+	${ISMRMRD_XSD_SOURCE}
+    )
+
+target_link_libraries(gadgetron_grappa 
+  gpuparallelmri cpucore
+  ${Boost_LIBRARIES}
+  ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+  ${XERCESC_LIBRARIES} 
+  )
+
+install (TARGETS gadgetron_grappa DESTINATION lib)
+
+add_subdirectory(config)
diff --git a/gadgets/grappa/GrappaCalibrationBuffer.cpp b/gadgets/grappa/GrappaCalibrationBuffer.cpp
new file mode 100644
index 0000000..d102cf6
--- /dev/null
+++ b/gadgets/grappa/GrappaCalibrationBuffer.cpp
@@ -0,0 +1,140 @@
+#include "GrappaCalibrationBuffer.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "Gadgetron.h"
+
+namespace Gadgetron{
+
+  // Keeps w and weights_calculator, sizes the line counter from dimensions[1], allocates buffer_.
+  GrappaCalibrationBuffer::GrappaCalibrationBuffer(std::vector<size_t> dimensions,
+                                                   boost::shared_ptr<GrappaWeights<float> > w,
+                                                   GrappaWeightsCalculator<float>* weights_calculator)
+    : dimensions_(dimensions)
+    , weights_(w)
+    , weights_calculator_(weights_calculator)
+    , buffer_counter_(dimensions[1])
+    , biggest_gap_current_(0)
+    , acceleration_factor_(0)
+    , last_line_(0)
+    , weights_invalid_(true)
+  {
+    try {
+      buffer_.create(&dimensions_);
+    } catch (std::runtime_error & err) {
+      GADGET_DEBUG_EXCEPTION(err,"Unable to allocate memory for GRAPPA buffer");
+    }
+  }
+
+  // Copies one readout (all active channels) into the calibration buffer, records the sampled line
+  // and, on the last readout of a sequentially acquired slice, may queue a weights calculation job.
+  // Returns GADGET_OK, or GADGET_FAIL for unusable input or when the bookkeeping fails.
+  int GrappaCalibrationBuffer::add_data(ISMRMRD::AcquisitionHeader* m1, hoNDArray< std::complex<float> >* m2)
+  {
+    if (!buffer_.get_data_ptr()) {
+      GADGET_DEBUG1("Buffer not allocated, cannot add data");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples =  m1->number_of_samples;
+    unsigned int line = m1->idx.kspace_encode_step_1;
+    unsigned int partition = m1->idx.kspace_encode_step_2;
+
+    if (samples != dimensions_[0]) {
+      GADGET_DEBUG1("Wrong number of samples received\n");
+      return GADGET_FAIL;
+    }
+
+    // Validate the indices BEFORE copying: the memcpy below would otherwise write past buffer_.
+    if ((line >= dimensions_[1]) || (partition >= dimensions_[2]) ||
+        ((dimensions_.size() > 3) && (m1->active_channels > dimensions_[3]))) {
+      GADGET_DEBUG2("Line %d, partition %d or channel count outside calibration buffer\n", line, partition);
+      return GADGET_FAIL;
+    }
+
+    std::complex<float>* b = buffer_.get_data_ptr();
+    std::complex<float>* d = m2->get_data_ptr();
+
+    //Copy the data for all the channels
+    for (size_t c = 0; c < m1->active_channels; c++) {
+      size_t offset =
+        c*dimensions_[0]*dimensions_[1]*dimensions_[2] +
+        partition*dimensions_[0]*dimensions_[1] +
+        line*dimensions_[0];
+      memcpy(b+offset,d+c*samples,sizeof(std::complex<float>)*samples);
+    }
+
+    int buf_update  = buffer_counter_.update_line(line, m1->position,
+                                                  m1->read_dir, m1->phase_dir, m1->slice_dir);
+
+    if ( buf_update < 0) {
+      GADGET_DEBUG2("Unable to update buffer counter for line %d\n", line);
+      return GADGET_FAIL;
+    }
+
+    //Let's figure out if we should start a weight calculation job
+    //This means that the orientation changed
+    if (buf_update == 1) {
+      weights_invalid_ = true;
+    }
+
+    bool is_first_scan_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_FIRST_IN_SLICE).isSet(m1->flags);
+
+    //Depending on the sequence used, we could get into trouble if the sequence switches slice acquisition scheme before finishing a slice.
+    bool acquiring_sequentially = line > last_line_;
+
+    if (is_first_scan_in_slice) {
+      biggest_gap_current_ = 0;
+    } else if (acquiring_sequentially){
+      unsigned int gap = std::abs(static_cast<int>(last_line_) - static_cast<int>(line));
+      if (gap != biggest_gap_current_) biggest_gap_current_ = gap;
+    } else {
+      biggest_gap_current_ = 0;
+    }
+    last_line_ = line;
+
+    bool is_last_scan_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->flags);
+
+    if (is_last_scan_in_slice && acquiring_sequentially) {
+      unsigned int min_ky, max_ky;
+      if (biggest_gap_current_ != acceleration_factor_) {
+        acceleration_factor_ = biggest_gap_current_;
+        weights_invalid_ = true;
+      }
+
+      if (buffer_counter_.get_region_of_support(min_ky, max_ky) < 0) {
+        GADGET_DEBUG1("Unable to query min_ky, max_ky\n");
+        return GADGET_FAIL;
+      }
+
+      // Check the calculator before its queue is touched (it used to be checked only afterwards).
+      if (!weights_calculator_) {
+        GADGET_DEBUG1("Weights calculator not defined\n");
+        return GADGET_FAIL;
+      }
+
+      //If there is nothing on the queue, we might as well recalculate
+      if (weights_calculator_->msg_queue()->message_count() < 1) {
+        //GADGET_DEBUG1("Queue is empty, invalidating weights\n");
+        weights_invalid_ = true;
+      }
+
+      if (weights_invalid_ && ((max_ky-min_ky) > acceleration_factor_)) {
+        std::vector< std::pair<unsigned int, unsigned int> > sampled_region;
+        sampled_region.push_back(std::pair<unsigned int, unsigned int>(0, samples-1));
+        sampled_region.push_back(std::pair<unsigned int, unsigned int>(min_ky, max_ky));
+        std::vector<unsigned int> uncombined_channel_weights;
+
+        //GADGET_DEBUG2("sampled_region[0] = %d,%d\n", sampled_region[0].first, sampled_region[0].second);
+        //GADGET_DEBUG2("sampled_region[1] = %d,%d\n", sampled_region[1].first, sampled_region[1].second);
+
+        weights_calculator_->add_job( &buffer_,
+                                      sampled_region,
+                                      acceleration_factor_,
+                                      weights_,
+                                      uncombined_channel_weights,
+                                      true);
+        weights_invalid_ = false;
+      }
+    }
+
+    return GADGET_OK;
+  }
+}
diff --git a/gadgets/grappa/GrappaCalibrationBuffer.h b/gadgets/grappa/GrappaCalibrationBuffer.h
new file mode 100644
index 0000000..ea69b21
--- /dev/null
+++ b/gadgets/grappa/GrappaCalibrationBuffer.h
@@ -0,0 +1,149 @@
+#ifndef GRAPPACALIBRATIONBUFFER_H
+#define GRAPPACALIBRATIONBUFFER_H
+
+#include "gadgetron_grappa_export.h"
+#include "ismrmrd.h"
+#include "hoNDArray.h"
+#include "GrappaWeights.h"
+#include "GrappaWeightsCalculator.h"
+
+#include <vector>
+#include <string.h>
+#include <memory>
+#include <complex>
+
+namespace Gadgetron{
+
+// Tracks which ky lines were sampled under the current slice geometry (position and the
+// read/phase/slice direction vectors) and reports the longest contiguous sampled region.
+class EXPORTGADGETSGRAPPA CalibrationBufferCounter
+{
+ public:
+  // Starts with `lines` unsampled lines and an all-zero reference geometry.
+  CalibrationBufferCounter(unsigned int lines)
+    : lines_sampled_(lines, 0)
+  {
+    memset(position_,  0, sizeof(position_));
+    memset(read_dir_,  0, sizeof(read_dir_));
+    memset(phase_dir_, 0, sizeof(phase_dir_));
+    memset(slice_dir_, 0, sizeof(slice_dir_));
+  }
+
+  virtual ~CalibrationBufferCounter() {}
+
+  // Marks line ky_index as sampled.
+  // Returns 1 when the geometry differed from the stored one (all lines are first reset and the
+  // new geometry stored), 0 when it matched, and -1 when ky_index is out of range.
+  int update_line(unsigned int ky_index, float* position,
+        float* read_dir, float* phase_dir, float* slice_dir)
+  {
+    const bool same_geometry = read_dir_equal(read_dir) &&
+                               phase_dir_equal(phase_dir) &&
+                               slice_dir_equal(slice_dir) &&
+                               position_equal(position);
+    int status = 0;
+
+    if (!same_geometry) {
+      // New geometry: forget every sampled line and remember the new reference.
+      lines_sampled_.assign(lines_sampled_.size(), 0);
+      memcpy(position_,  position,  sizeof(position_));
+      memcpy(read_dir_,  read_dir,  sizeof(read_dir_));
+      memcpy(phase_dir_, phase_dir, sizeof(phase_dir_));
+      memcpy(slice_dir_, slice_dir, sizeof(slice_dir_));
+      status = 1;
+    }
+
+    if (ky_index >= lines_sampled_.size()) {
+      return -1;
+    }
+
+    lines_sampled_[ky_index] = 1;
+    return status;
+  }
+
+  // Reports the longest run of consecutively sampled lines as [min_ky_index, max_ky_index].
+  // Runs of a single line are ignored; both outputs stay 0 when no longer run exists.
+  // Always returns 0.
+  int get_region_of_support(unsigned int& min_ky_index, unsigned int& max_ky_index) {
+    const size_t line_count = lines_sampled_.size();
+    min_ky_index = 0;
+    max_ky_index = 0;
+
+    unsigned int cursor = 0;
+    while (cursor < line_count) {
+      // Skip lines that were not sampled.
+      if (lines_sampled_[cursor] == 0) {
+        ++cursor;
+        continue;
+      }
+
+      // cursor is on the first line of a sampled run; advance to one past its end.
+      const unsigned int run_first = cursor;
+      while ((cursor < line_count) && (lines_sampled_[cursor] > 0)) {
+        ++cursor;
+      }
+      const unsigned int run_last = cursor - 1;
+
+      const bool longer = (run_last - run_first) > (max_ky_index - min_ky_index);
+      if ((run_first < run_last) && longer) {
+        min_ky_index = run_first;
+        max_ky_index = run_last;
+      }
+    }
+    return 0;
+  }
+
+ protected:
+  float           position_[3];
+  float           read_dir_[3];
+  float           phase_dir_[3];
+  float           slice_dir_[3];
+
+  // Exact float comparisons (operator==, no tolerance) against the stored geometry.
+  bool position_equal(float* position) {
+    return (position_[0] == position[0]) && (position_[1] == position[1]) && (position_[2] == position[2]);
+  }
+
+  bool read_dir_equal(float* cosines) {
+    return (read_dir_[0] == cosines[0]) && (read_dir_[1] == cosines[1]) && (read_dir_[2] == cosines[2]);
+  }
+
+  bool phase_dir_equal(float* cosines) {
+    return (phase_dir_[0] == cosines[0]) && (phase_dir_[1] == cosines[1]) && (phase_dir_[2] == cosines[2]);
+  }
+
+  bool slice_dir_equal(float* cosines) {
+    return (slice_dir_[0] == cosines[0]) && (slice_dir_[1] == cosines[1]) && (slice_dir_[2] == cosines[2]);
+  }
+
+ private:
+  std::vector<unsigned int> lines_sampled_;
+};
+
+class EXPORTGADGETSGRAPPA GrappaCalibrationBuffer
+{
+// Buffer of acquired calibration readouts that decides when a weights calculation job is queued.
+ public:
+  GrappaCalibrationBuffer(std::vector<size_t> dimensions, 
+			  boost::shared_ptr< GrappaWeights<float> > w,
+			  GrappaWeightsCalculator<float>* weights_calculator);
+  virtual ~GrappaCalibrationBuffer() {}
+  // Adds one readout (header m1, samples m2); returns GADGET_OK or GADGET_FAIL.
+  int add_data(ISMRMRD::AcquisitionHeader* m1, hoNDArray< std::complex<float> >* m2);
+
+ private:
+  hoNDArray< std::complex<float> > buffer_;
+  std::vector<size_t> dimensions_;
+  boost::shared_ptr< GrappaWeights<float> > weights_;
+  GrappaWeightsCalculator<float>* weights_calculator_;
+  CalibrationBufferCounter buffer_counter_;
+  // Bookkeeping updated by add_data(): gap between consecutive lines, last line seen, weights state.
+  unsigned int biggest_gap_current_;
+  unsigned int acceleration_factor_;
+  unsigned int last_line_;
+  bool weights_invalid_;
+};
+
+}
+
+#endif
diff --git a/gadgets/grappa/GrappaGadget.cpp b/gadgets/grappa/GrappaGadget.cpp
new file mode 100644
index 0000000..5d9193e
--- /dev/null
+++ b/gadgets/grappa/GrappaGadget.cpp
@@ -0,0 +1,362 @@
+#include "../mri_core/GadgetIsmrmrdReadWrite.h"
+#include "Gadgetron.h"
+#include "GrappaGadget.h"
+#include "GrappaUnmixingGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include <ace/OS_NS_stdlib.h>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+
+namespace Gadgetron{
+
+  GrappaGadget::GrappaGadget()
+    : first_call_(true)   // listed in declaration order (first_call_ is declared first in the class)
+    , image_counter_(0)
+    , image_series_(0)
+    , target_coils_(0)
+    , phase_encoding_resolution_(0.0f), line_offset_(0)   // both were left indeterminate before
+  { }
+
+  GrappaGadget::~GrappaGadget()
+  {
+    // buffers_ and image_data_ are sized separately in initial_setup() (which can fail in between),
+    // so each is walked with its own bound instead of indexing image_data_ by buffers_.size().
+    for (unsigned int i = 0; i < buffers_.size(); i++) {
+      delete buffers_[i];
+      buffers_[i] = 0;
+    }
+    for (unsigned int i = 0; i < image_data_.size(); i++) {
+      if (image_data_[i]) image_data_[i]->release();
+      image_data_[i] = 0;
+    }
+  }
+
+  // Closes the base Gadget first, then the weights calculator; GADGET_FAIL if the latter cannot be closed.
+  int GrappaGadget::close(unsigned long flags) {
+    const int base_result = Gadget::close(flags);
+    GADGET_DEBUG1("Shutting down GRAPPA Gadget\n");
+    const bool calculator_closed = (weights_calculator_.close(flags) >= 0);
+    if (!calculator_closed) {
+      GADGET_DEBUG1("Failed to close down weights calculator\n");
+      return GADGET_FAIL;
+    }
+    return base_result;
+  }
+
+  // Reads matrix size, channel/slice counts, FOV and the ky line offset from the ISMRMRD XML header in mb.
+  int GrappaGadget::process_config(ACE_Message_Block* mb)
+  {
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", e_seq.size());
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+    if (!e_limits.kspace_encoding_step_1().present()) { // get() below must not be called on an absent optional
+      GADGET_DEBUG1("Encoding limits for kspace_encoding_step_1 are missing\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int slices = e_limits.slice().present() ? e_limits.slice().get().maximum() + 1 : 1;
+    dimensions_.push_back(e_space.matrixSize().x());
+    dimensions_.push_back(e_space.matrixSize().y());
+    dimensions_.push_back(e_space.matrixSize().z());
+    dimensions_.push_back((cfg->acquisitionSystemInformation().present() && cfg->acquisitionSystemInformation().get().receiverChannels().present()) ?
+                          cfg->acquisitionSystemInformation().get().receiverChannels().get() : 1);
+    dimensions_.push_back(slices);
+    fov_.push_back(r_space.fieldOfView_mm().x());
+    fov_.push_back(r_space.fieldOfView_mm().y());
+    fov_.push_back(r_space.fieldOfView_mm().z());
+    line_offset_ = (dimensions_[1]>>1)-e_limits.kspace_encoding_step_1().get().center();
+    return GADGET_OK;
+  }
+
+
+  int GrappaGadget::initial_setup()
+  {
+    // One-time setup, invoked by process() on the first acquisition; returns GADGET_OK or GADGET_FAIL.
+    GADGET_DEBUG2("Dimensions %d, %d, %d, %d, %d\n", dimensions_[0], dimensions_[1], dimensions_[2], dimensions_[3], dimensions_[4]);
+    // Image buffers use half of dimensions_[0] along the readout (see TODO) and keep y, z, channels.
+    image_dimensions_.push_back(dimensions_[0] / 2); //TODO: fix this in general
+    image_dimensions_.push_back(dimensions_[1]);
+    image_dimensions_.push_back(dimensions_[2]);
+    image_dimensions_.push_back(dimensions_[3]);
+
+    // dimensions_[4] is the slice count: one weights object, calibration buffer and time stamp per slice.
+    weights_ = std::vector< boost::shared_ptr<GrappaWeights<float> > >(dimensions_[4]);
+
+    buffers_ = std::vector<GrappaCalibrationBuffer* >(dimensions_[4],0);
+    time_stamps_ = std::vector<ACE_UINT32>(dimensions_[4],0);
+
+    //Let's figure out the number of target coils
+    target_coils_ = this->get_int_value("target_coils");
+    if ((target_coils_ <= 0) || (target_coils_ > dimensions_[3])) {
+      target_coils_ = dimensions_[3];
+    }
+
+    GADGET_DEBUG2("Running GRAPPA recon with %d source channels and %d target channels\n", dimensions_[3], target_coils_);
+
+    weights_calculator_.set_number_of_target_coils(target_coils_);
+
+    //Let's figure out if we have channels that are supposed to be uncombined
+    boost::shared_ptr<std::string> uncomb_str = this->get_string_value("uncombined_channels");
+    std::vector<std::string> uncomb;
+    boost::split(uncomb, *uncomb_str, boost::is_any_of(","));
+    for (unsigned int i = 0; i < uncomb.size(); i++) {
+      std::string ch = boost::algorithm::trim_copy(uncomb[i]);
+      if (ch.size() > 0) {
+        unsigned int channel_id = static_cast<unsigned int>(ACE_OS::atoi(ch.c_str()));
+        weights_calculator_.add_uncombined_channel(channel_id);
+      }
+    }
+
+    for (unsigned int i = 0; i < buffers_.size(); i++) {
+      weights_[i] = boost::shared_ptr<GrappaWeights<float> >(new GrappaWeights<float>());
+
+      //Let's set some default GRAPPA weights, so that we have something to work with the first couple of frames.
+      /*
+        std::vector<unsigned int> wdims = image_dimensions_;
+        if (weights_calculator_.get_number_of_uncombined_channels()) {
+        wdims.push_back(weights_calculator_.get_number_of_uncombined_channels()+1);
+        }
+
+        hoNDArray< std::complex<float> > tmp_w;
+        if (!tmp_w.create(&wdims)) {
+        GADGET_DEBUG1("Unable to create temporary array with dimensions\n");
+        return GADGET_FAIL;
+        }
+        tmp_w.clear(std::complex<float>(1.0,0));
+        weights_[i]->update(&tmp_w);
+      */
+
+      buffers_[i] = new GrappaCalibrationBuffer(image_dimensions_,
+                                                weights_[i],
+                                                &weights_calculator_);
+    }
+
+
+    if (weights_calculator_.open() < 0) {
+      GADGET_DEBUG1("Failed to open GrappaWeightsCalculator\n");
+      return GADGET_FAIL;
+    }
+    // One image buffer per slice, zero-filled by create_image_buffer().
+    image_data_ = std::vector< GadgetContainerMessage< hoNDArray< std::complex<float> > >* >(dimensions_[4],0);
+    for (unsigned int i = 0; i < image_data_.size(); i++) {
+      if (create_image_buffer(i) != GADGET_OK) {
+        GADGET_DEBUG1("Unable to create image buffers");
+        return GADGET_FAIL;
+      }
+    }
+
+    image_series_ = this->get_int_value("image_series");
+
+    return GADGET_OK;
+  }
+
+
+  // Stores each readout in the per-slice image buffer and in the calibration buffer; on the last
+  // readout of a slice the buffer, an image header and the slice weights are passed downstream.
+  int GrappaGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    if (first_call_) {
+      if (m1->getObjectPtr()->active_channels != dimensions_[3]) {
+        GADGET_DEBUG1("Detected coil number change. Maybe due to upstream channel reduction\n");
+        dimensions_[3] = m1->getObjectPtr()->active_channels;
+      }
+      if (initial_setup() != GADGET_OK) {
+        GADGET_DEBUG1("Initial Setup Failed\n");
+        m1->release();
+        return GADGET_FAIL;
+      }
+      first_call_ = false;
+    }
+
+    ISMRMRD::AcquisitionHeader* acq_head = m1->getObjectPtr();
+
+    unsigned int samples =  acq_head->number_of_samples;
+    unsigned int line = acq_head->idx.kspace_encode_step_1 + line_offset_;
+    unsigned int partition = acq_head->idx.kspace_encode_step_2;
+    unsigned int slice = acq_head->idx.slice;
+
+    if (samples != image_dimensions_[0]) {
+      GADGET_DEBUG1("GrappaGadget: wrong number of samples received\n");
+      return GADGET_FAIL;
+    }
+
+    if (slice >= image_data_.size()) {
+      GADGET_DEBUG1("Invalid slice number received\n");
+      return GADGET_FAIL;
+    }
+
+    // The copy below indexes the buffer by line/partition/channel, so reject anything outside it.
+    if ((line >= image_dimensions_[1]) || (partition >= image_dimensions_[2]) ||
+        (acq_head->active_channels > image_dimensions_[3])) {
+      GADGET_DEBUG1("Line, partition or channel index outside image buffer\n");
+      return GADGET_FAIL;
+    }
+
+    if (!image_data_[slice]) { // was image_data_[0]; the buffer dereferenced below is the one for this slice
+      if (create_image_buffer(slice) != GADGET_OK) {
+        GADGET_DEBUG1("Failed to allocate new slice buffer\n");
+        return GADGET_FAIL;
+      }
+    }
+
+    std::complex<float>* b = image_data_[slice]->getObjectPtr()->get_data_ptr();
+    std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+
+    size_t offset= 0;
+    //Copy the data for all the channels
+    for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+      offset =
+        c*image_dimensions_[0]*image_dimensions_[1]*image_dimensions_[2] +
+        partition*image_dimensions_[0]*image_dimensions_[1] +
+        line*image_dimensions_[0];
+      memcpy(b+offset,d+c*samples,sizeof(std::complex<float>)*samples);
+    }
+
+    bool is_last_scan_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+    bool is_first_scan_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_FIRST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+
+    if (is_first_scan_in_slice) {
+      time_stamps_[slice] = m1->getObjectPtr()->acquisition_time_stamp;
+    }
+
+    if (is_last_scan_in_slice) {
+      GadgetContainerMessage<GrappaUnmixingJob>* cm0 =
+        new GadgetContainerMessage<GrappaUnmixingJob>();
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 =
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+      /*
+        GadgetContainerMessage< hoNDArray<std::complex<float> > >* cm2 =
+        new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+        std::vector<unsigned int> combined_dims(3,0);
+        combined_dims[0] = image_dimensions_[0];
+        combined_dims[1] = image_dimensions_[1];
+        combined_dims[2] = image_dimensions_[2];
+
+        if (weights_calculator_.get_number_of_uncombined_channels()) {
+        combined_dims.push_back(weights_calculator_.get_number_of_uncombined_channels()+1);
+        }
+
+        if (!cm2->getObjectPtr()->create(&combined_dims)) {
+        GADGET_DEBUG1("Unable to create combined image array\n");
+        return GADGET_FAIL;
+        }
+
+        cm1->cont(cm2);
+      */
+      cm1->getObjectPtr()->matrix_size[0] = image_dimensions_[0];
+      cm1->getObjectPtr()->matrix_size[1] = image_dimensions_[1];
+      cm1->getObjectPtr()->matrix_size[2] = image_dimensions_[2];
+
+      cm1->getObjectPtr()->field_of_view[0] = fov_[0];
+      cm1->getObjectPtr()->field_of_view[1] = fov_[1];
+      cm1->getObjectPtr()->field_of_view[2] = fov_[2];
+
+      cm1->getObjectPtr()->channels       = 1+weights_calculator_.get_number_of_uncombined_channels();
+      cm1->getObjectPtr()->slice              = m1->getObjectPtr()->idx.slice;
+      cm1->getObjectPtr()->acquisition_time_stamp         = time_stamps_[slice];
+
+      memcpy(cm1->getObjectPtr()->position,m1->getObjectPtr()->position,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->read_dir,m1->getObjectPtr()->read_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->phase_dir,m1->getObjectPtr()->phase_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->slice_dir,m1->getObjectPtr()->slice_dir,
+             sizeof(float)*3);
+
+      memcpy(cm1->getObjectPtr()->patient_table_position,m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+      cm1->getObjectPtr()->image_index = ++image_counter_;
+      cm1->getObjectPtr()->image_series_index = image_series_;
+
+      cm0->getObjectPtr()->weights_ = weights_[slice];
+      cm0->cont(cm1);
+      cm1->cont(image_data_[slice]);
+
+      image_data_[slice] = 0;
+      if (create_image_buffer(slice) != GADGET_OK) {
+        GADGET_DEBUG1("Failed to create image buffer");
+        cm0->release(); // the job chain (cm0 -> cm1 -> old buffer) is still owned here
+        return GADGET_FAIL;
+      }
+
+      if (this->next()->putq(cm0) < 0) {
+        GADGET_DEBUG1("Failed to pass image on to next Gadget in chain\n");
+        cm0->release(); // not enqueued, so it must be freed by this gadget
+        return GADGET_FAIL;
+      }
+      /*
+        hoFFT<float>::instance()->ifft(image_data_[slice]->getObjectPtr(),0);
+        hoFFT<float>::instance()->ifft(image_data_[slice]->getObjectPtr(),1);
+        hoFFT<float>::instance()->ifft(image_data_[slice]->getObjectPtr(),2);
+
+        //apply weights
+        float scale_factor = (dimensions_[0] *dimensions_[1] *dimensions_[0] *dimensions_[1])/10;
+
+        int appl_result = weights_[slice]->apply(image_data_[slice]->getObjectPtr(), cm2->getObjectPtr(), scale_factor);
+        if (appl_result < 0) {
+        GADGET_DEBUG2("Failed to apply GRAPPA weights: error code %d\n", appl_result);
+        return GADGET_FAIL;
+        }
+
+        if (this->next()->putq(cm1) < 0) {
+        GADGET_DEBUG1("Failed to pass image on to next Gadget in chain\n");
+        return GADGET_FAIL;
+        }
+        image_data_[slice]->getObjectPtr()->clear(std::complex<float>(0.0f,0.0f));
+      */
+    }
+
+    if (buffers_[slice]->add_data(m1->getObjectPtr(),m2->getObjectPtr()) < 0) {
+      GADGET_DEBUG1("Failed to add incoming data to grappa calibration buffer\n");
+      return GADGET_FAIL;
+    }
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+
+  // (Re)creates the zero-filled image buffer for `slice`, releasing any buffer currently held there.
+  int GrappaGadget::create_image_buffer(unsigned int slice)
+  {
+    if (slice >= image_data_.size()) {
+      return GADGET_FAIL;
+    }
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >*& slot = image_data_[slice];
+    if (slot) {
+      slot->release();
+      slot = 0;
+    }
+    slot = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+    hoNDArray< std::complex<float> >* pixels = slot->getObjectPtr();
+    try {
+      pixels->create(&image_dimensions_);
+    } catch (std::runtime_error &err) {
+      GADGET_DEBUG_EXCEPTION(err,"Unable to create image buffers");
+      return GADGET_FAIL;
+    }
+
+    std::complex<float>* first = pixels->get_data_ptr();
+    std::fill(first, first + pixels->get_number_of_elements(), std::complex<float>(0.0f,0.0f));
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(GrappaGadget)
+}
diff --git a/gadgets/grappa/GrappaGadget.h b/gadgets/grappa/GrappaGadget.h
new file mode 100644
index 0000000..64d8160
--- /dev/null
+++ b/gadgets/grappa/GrappaGadget.h
@@ -0,0 +1,62 @@
+#ifndef GRAPPAGADGET_H
+#define GRAPPAGADGET_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "GrappaCalibrationBuffer.h"
+#include "gadgetron_grappa_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+/**
+ * Plain record of three-component float vectors (position and read, phase and
+ * slice directions) together with an acceleration factor.
+ * NOTE(review): presumably the geometry/acceleration information associated
+ * with a GRAPPA buffer -- no user of this struct is visible here; confirm.
+ */
+struct EXPORTGADGETSGRAPPA GrappaBufferInfo
+{
+  float           position[3];
+  float           read_dir[3];
+  float           phase_dir[3];
+  float           slice_dir[3];
+  unsigned int    acceleration_factor;
+};
+
+/**
+ * Gadget that receives acquisitions (an ISMRMRD::AcquisitionHeader followed by
+ * complex float samples) and feeds them into per-slice GRAPPA calibration
+ * buffers. It also owns the GrappaWeightsCalculator task and per-slice image
+ * buffers.
+ */
+class EXPORTGADGETSGRAPPA GrappaGadget : 
+public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+  
+ public:
+  GADGET_DECLARE(GrappaGadget);
+
+  GrappaGadget();
+  virtual ~GrappaGadget();
+
+ protected:
+  virtual int process_config(ACE_Message_Block* mb);
+  // Handles one acquisition: m1 is the header, m2 the sample data.
+  virtual int process( GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+		  GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2 );
+
+  // (Re)allocates the zero-filled entry of image_data_ for the given slice.
+  virtual int create_image_buffer(unsigned int slice);
+
+  // close() is overridden in this gadget to make sure we wait for the weights calculator.
+  virtual int close(unsigned long flags);
+
+  virtual int initial_setup();
+
+  bool first_call_;
+ private:
+  // Calibration buffers, indexed by slice.
+  std::vector< GrappaCalibrationBuffer* > buffers_;
+  std::vector<unsigned int> fov_;
+  std::vector<size_t> dimensions_;
+  // Dimensions used when creating each entry of image_data_.
+  std::vector<size_t> image_dimensions_;
+  // Image buffers, indexed by slice; entries may be null.
+  std::vector< GadgetContainerMessage<  hoNDArray< std::complex<float> > >* > image_data_;
+  // NOTE(review): presumably one weight set per slice -- confirm in the .cpp.
+  std::vector< boost::shared_ptr<GrappaWeights<float> > > weights_;
+  GrappaWeightsCalculator<float> weights_calculator_;
+  std::vector<ACE_UINT32> time_stamps_;
+  int image_counter_;
+  int image_series_;
+  int target_coils_;
+  float phase_encoding_resolution_;
+  unsigned int line_offset_;
+};
+}
+#endif //GRAPPAGADGET_H
diff --git a/gadgets/grappa/GrappaUnmixingGadget.cpp b/gadgets/grappa/GrappaUnmixingGadget.cpp
new file mode 100644
index 0000000..28853af
--- /dev/null
+++ b/gadgets/grappa/GrappaUnmixingGadget.cpp
@@ -0,0 +1,67 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GrappaUnmixingGadget.h"
+#include "hoNDFFT.h"
+
+namespace Gadgetron{
+
+  GrappaUnmixingGadget::GrappaUnmixingGadget() {
+    // Intentionally empty: the class declares no data members of its own.
+
+  }
+
+  GrappaUnmixingGadget::~GrappaUnmixingGadget() {
+    // Intentionally empty: the class declares no data members of its own.
+  }
+
+  /**
+   * Applies GRAPPA unmixing weights to multi-channel data and forwards the
+   * combined image to the next gadget.
+   *
+   * @param m1 job message; its weights_ pointer supplies the coefficients
+   * @param m2 image header; on success it is passed downstream with the
+   *           combined image attached as its continuation
+   * @param m3 multi-channel data; inverse Fourier transformed in place along
+   *           dimensions 0, 1 and 2 before the weights are applied
+   * @return GADGET_OK on success, GADGET_FAIL on any error
+   *
+   * Message ownership: on success m1 and m3 are released here and m2 (with the
+   * combined image) belongs to the downstream queue. On every failure path m1
+   * itself is left untouched, as the failure returns of this function have
+   * always done; anything that is no longer reachable from m1 at that point is
+   * released here so it cannot leak.
+   * NOTE(review): this relies on the caller releasing the incoming message when
+   * process() fails -- confirm against Gadget::svc.
+   */
+  int GrappaUnmixingGadget::process(GadgetContainerMessage<GrappaUnmixingJob>* m1,
+                                    GadgetContainerMessage<ISMRMRD::ImageHeader>* m2, GadgetContainerMessage<hoNDArray<std::complex<float> > >* m3)
+  {
+    // Cheapest check first: without weights there is nothing to do, so bail
+    // out before allocating or transforming anything.
+    if (!m1->getObjectPtr()->weights_) {
+      GADGET_DEBUG1("Weights are a NULL\n");
+      return GADGET_FAIL;
+    }
+
+    GadgetContainerMessage< hoNDArray<std::complex<float> > >* cm2 =
+			new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+    // The combined image takes the matrix size from the header, with the
+    // channel count appended as a fourth dimension when it is greater than one.
+    std::vector<size_t> combined_dims(3,0);
+    combined_dims[0] = m2->getObjectPtr()->matrix_size[0];
+    combined_dims[1] = m2->getObjectPtr()->matrix_size[1];
+    combined_dims[2] = m2->getObjectPtr()->matrix_size[2];
+
+    if (m2->getObjectPtr()->channels > 1) {
+      combined_dims.push_back(m2->getObjectPtr()->channels);
+    }
+
+    try{cm2->getObjectPtr()->create(&combined_dims);}
+    catch (std::runtime_error &err ){
+      GADGET_DEBUG_EXCEPTION(err,"Unable to create combined image array\n");
+      cm2->release(); // not attached to any chain yet, so it is ours to free
+      return GADGET_FAIL;
+    }
+
+    hoNDFFT<float>::instance()->ifft(m3->getObjectPtr(),0);
+    hoNDFFT<float>::instance()->ifft(m3->getObjectPtr(),1);
+    hoNDFFT<float>::instance()->ifft(m3->getObjectPtr(),2);
+
+    float scale_factor = 1.0;
+    int appl_result = m1->getObjectPtr()->weights_->apply(m3->getObjectPtr(), cm2->getObjectPtr(), scale_factor);
+    if (appl_result < 0) {
+      GADGET_DEBUG2("Failed to apply GRAPPA weights: error code %d\n", appl_result);
+      cm2->release(); // the incoming chain is still intact; only cm2 is ours
+      return GADGET_FAIL;
+    }
+
+    // Only now, with every fallible computation done, rewire the chain:
+    // detach the job from the header and hang the combined image on the header
+    // in place of the multi-channel data.
+    m1->cont(0);
+    m2->cont(cm2);
+
+    if (this->next()->putq(m2) < 0) {
+      // The header/image pair and the multi-channel data are no longer
+      // reachable from m1, so release them here.
+      m2->release();
+      m3->release();
+      return GADGET_FAIL;
+    }
+
+    // m2 now belongs to the downstream gadget; the job and the multi-channel
+    // data are finished with.
+    m1->release();
+    m3->release();
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(GrappaUnmixingGadget)
+}
diff --git a/gadgets/grappa/GrappaUnmixingGadget.h b/gadgets/grappa/GrappaUnmixingGadget.h
new file mode 100644
index 0000000..2fe525f
--- /dev/null
+++ b/gadgets/grappa/GrappaUnmixingGadget.h
@@ -0,0 +1,32 @@
+#ifndef GRAPPAUNMIXINGGADGET_H_
+#define GRAPPAUNMIXINGGADGET_H_
+
+#include "gadgetron_grappa_export.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "GrappaWeights.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+  /**
+   * Job description consumed by GrappaUnmixingGadget: a shared pointer to the
+   * weights that the gadget applies. The pointer may be empty, in which case
+   * the gadget's process() fails.
+   */
+  struct EXPORTGADGETSGRAPPA GrappaUnmixingJob
+  {
+    boost::shared_ptr< GrappaWeights<float> > weights_;
+  };
+
+  /**
+   * Gadget that takes a (GrappaUnmixingJob, ISMRMRD::ImageHeader, complex
+   * float array) message triplet, applies the job's GRAPPA weights to the
+   * array and passes the header with the combined result on to the next
+   * gadget.
+   */
+  class EXPORTGADGETSGRAPPA GrappaUnmixingGadget: public Gadget3<GrappaUnmixingJob, ISMRMRD::ImageHeader, hoNDArray<std::complex<float> > > {
+  public:
+    GADGET_DECLARE(GrappaUnmixingGadget);
+
+    GrappaUnmixingGadget();
+    virtual ~GrappaUnmixingGadget();
+
+  protected:
+    // m1: job carrying the weights, m2: image header, m3: multi-channel data.
+    // Returns GADGET_OK or GADGET_FAIL.
+    virtual int process(GadgetContainerMessage<GrappaUnmixingJob>* m1,
+                        GadgetContainerMessage<ISMRMRD::ImageHeader>* m2, GadgetContainerMessage<hoNDArray<std::complex<float> > >* m3);
+  };
+}
+
+#endif /* GRAPPAUNMIXINGGADGET_H_ */
diff --git a/gadgets/grappa/GrappaWeights.cpp b/gadgets/grappa/GrappaWeights.cpp
new file mode 100644
index 0000000..1a3b42d
--- /dev/null
+++ b/gadgets/grappa/GrappaWeights.cpp
@@ -0,0 +1,112 @@
+#include "../mri_core/GadgetIsmrmrdReadWrite.h"
+#include "Gadgetron.h"
+#include "GrappaWeights.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+/**
+ * Replaces the stored weights with a copy of new_weights, marks them valid and
+ * wakes every thread blocked on the condition variable.
+ *
+ * @param new_weights array to copy from; the internal storage is re-created
+ *                    first when its dimensions differ
+ * @return 0 on success, -1 if the mutex could not be acquired, -2 if the
+ *         internal storage could not be (re)allocated
+ */
+template <class T> int GrappaWeights<T>::
+update(hoNDArray< std::complex<T> >* new_weights)
+{
+  {
+    // Scoped guard: the mutex is released on every way out of this block,
+    // including the allocation-failure return below.
+    ACE_Guard<ACE_Thread_Mutex> guard(mutex_);
+    if (!guard.locked()) {
+      return -1;
+    }
+
+    if (!weights_.dimensions_equal(new_weights)) {
+      try{weights_.create(new_weights->get_dimensions());}
+      catch (std::runtime_error &){
+        return -2;
+      }
+    }
+
+    memcpy(weights_.get_data_ptr(), new_weights->get_data_ptr(),
+           weights_.get_number_of_elements()*sizeof(std::complex<T>));
+
+    weights_are_valid_ = true;
+  }
+
+  // Signal after the mutex has been dropped so woken threads can take it
+  // straight away.
+  cond_.broadcast();
+
+  return 0;
+}
+
+/**
+ * Combines multi-coil data into one output per weight set:
+ *
+ *   out[s][p] = sum over c of  weights[s][c][p] * in[c][p] * scale
+ *
+ * where p runs over the elements of one image, c over the coils and s over
+ * the weight sets. Blocks until a first set of weights has been stored.
+ *
+ * @param data_in  multi-coil input (image elements x coils)
+ * @param data_out output; its last dimension must equal the number of sets
+ * @param scale    factor applied to every product
+ * @return 0 on success; -1 if the mutex could not be acquired; -3 if the
+ *         input is empty or its size does not divide the weights' size;
+ *         -4 if there is no complete weight set; -5 if the last output
+ *         dimension differs from the number of sets; -6, -7, -8 if the
+ *         weights, input or output size is inconsistent with the derived
+ *         layout (-8 also for an empty output)
+ */
+template<class T> int GrappaWeights<T>::
+apply(hoNDArray< std::complex<T> >* data_in,
+      hoNDArray< std::complex<T> >* data_out,
+      T scale)
+{
+  // Scoped guard: every return below releases the mutex again.
+  ACE_Guard<ACE_Thread_Mutex> guard(mutex_);
+  if (!guard.locked()) {
+    return -1;
+  }
+
+  // Re-check the flag after every wake-up rather than trusting a single
+  // return from wait().
+  while (!weights_are_valid_) {
+    GADGET_DEBUG1("Releasing Mutex to Wait for result\n");
+    guard.release();
+    // cond_ is bound to cond_mutex_, which must be held while waiting.
+    cond_mutex_.acquire();
+    // NOTE(review): the valid flag is protected by mutex_, not cond_mutex_, so
+    // a broadcast issued between guard.release() and this wait() can still be
+    // missed. Closing that window needs the flag and the condition to share
+    // one mutex.
+    cond_.wait();
+    cond_mutex_.release();
+    guard.acquire();
+  }
+
+  const size_t weight_elements = weights_.get_number_of_elements();
+  const size_t in_elements = data_in->get_number_of_elements();
+  const size_t out_elements = data_out->get_number_of_elements();
+
+  // An empty input would make the modulo and division below undefined.
+  if (in_elements == 0 || (weight_elements % in_elements) != 0) {
+    return -3;
+  }
+
+  const size_t sets = weight_elements/in_elements;
+
+  if (sets < 1) {
+    return -4;
+  }
+
+  if (data_out->get_size(data_out->get_number_of_dimensions()-1) != sets) {
+    return -5;
+  }
+
+  const size_t image_elements = out_elements/sets;
+
+  // An empty output would lead to a division by zero when deriving the coils.
+  if (image_elements == 0) {
+    return -8;
+  }
+
+  const size_t coils = weight_elements/(sets*image_elements);
+
+  if (weight_elements != (image_elements*coils*sets)) {
+    return -6;
+  }
+
+  if (in_elements != (image_elements*coils)) {
+    return -7;
+  }
+
+  if (out_elements != (image_elements*sets)) {
+    return -8;
+  }
+
+  std::complex<T>* weights_ptr = weights_.get_data_ptr();
+  std::complex<T>* in_ptr = data_in->get_data_ptr();
+  std::complex<T>* out_ptr = data_out->get_data_ptr();
+
+  for (size_t i = 0; i < image_elements*sets; i++) {
+    out_ptr[i] = 0;
+  }
+
+  for (size_t s = 0; s < sets; s++) {
+    for (size_t p = 0; p < image_elements; p++) {
+      for (size_t c = 0; c < coils; c++) {
+        out_ptr[s*image_elements + p] +=
+          weights_ptr[s*image_elements*coils + c*image_elements + p] *
+          in_ptr[c*image_elements + p]*scale;
+      }
+    }
+  }
+
+  return 0;
+}
+
+//Template instantiation
+template class EXPORTGADGETSGRAPPA GrappaWeights<float>;
+template class EXPORTGADGETSGRAPPA GrappaWeights<double>;
+}
diff --git a/gadgets/grappa/GrappaWeights.h b/gadgets/grappa/GrappaWeights.h
new file mode 100644
index 0000000..e9de58f
--- /dev/null
+++ b/gadgets/grappa/GrappaWeights.h
@@ -0,0 +1,37 @@
+#pragma once 
+
+#include "gadgetron_grappa_export.h"
+#include "hoNDArray.h"
+
+#include <ace/Synch.h>
+#include <complex>
+
+namespace Gadgetron{
+
+/**
+ * Holder for a set of complex unmixing weights that is written by update()
+ * and read by apply(). Access to the weights is serialized with a mutex, and
+ * apply() waits on a condition variable until update() has stored a first
+ * valid set.
+ */
+template <class T> class EXPORTGADGETSGRAPPA GrappaWeights
+{
+ public:
+  // Starts without valid weights; the condition variable is bound to cond_mutex_.
+  GrappaWeights()
+  	  : weights_are_valid_(false)
+  	  , cond_(cond_mutex_)
+  	  {
+
+  	  }
+  virtual ~GrappaWeights() {}
+  
+  // Copies new_weights into the internal storage and wakes waiting threads.
+  // Returns 0 on success and a negative value on failure.
+  int update(hoNDArray< std::complex<T> >* new_weights);
+
+  // Applies the stored weights to data_in, writing the scaled result to
+  // data_out. Returns 0 on success and a negative value when the array sizes
+  // are inconsistent with the stored weights.
+  int apply(hoNDArray< std::complex<T> >* data_in,
+	    hoNDArray< std::complex<T> >* data_out, 
+	    T scale = 1.0);
+
+ private:
+  // Serializes access to weights_ and weights_are_valid_.
+  ACE_Thread_Mutex mutex_;
+  // Set once update() has stored a first set of weights.
+  bool weights_are_valid_;
+
+  // Mutex bound to cond_; declared before cond_ because cond_ is constructed from it.
+  ACE_Thread_Mutex cond_mutex_;
+  ACE_Condition_Thread_Mutex cond_;
+  hoNDArray< std::complex<T> > weights_;
+
+};
+}
diff --git a/gadgets/grappa/GrappaWeightsCalculator.cpp b/gadgets/grappa/GrappaWeightsCalculator.cpp
new file mode 100644
index 0000000..67b86a4
--- /dev/null
+++ b/gadgets/grappa/GrappaWeightsCalculator.cpp
@@ -0,0 +1,258 @@
+#include "cuNDFFT.h"
+#include "GrappaWeightsCalculator.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "Gadgetron.h"
+#include "b1_map.h"
+#include "hoNDArray_fileio.h"
+#include "htgrappa.h"
+#include "GadgetronTimer.h"
+#include "GPUTimer.h"
+#include "complext.h"
+
+#include <cuComplex.h>
+
+namespace Gadgetron{
+
+/**
+ * Description of one weights-calculation job as queued by
+ * GrappaWeightsCalculator::add_job() and consumed by its svc() loop.
+ */
+template <class T> class EXPORTGADGETSGRAPPA GrappaWeightsDescription
+{
+
+public:
+	// Pairs of unsigned bounds; passed on to the unmixing calculation.
+	// NOTE(review): presumably [first, last] sample index per dimension -- confirm.
+	std::vector< std::pair<unsigned int, unsigned int> > sampled_region;
+	unsigned int acceleration_factor;
+	// Object whose update() receives the computed weights.
+	boost::shared_ptr<GrappaWeights<T> > destination;
+	// Stored by add_job(); svc() does not read the following two fields.
+	std::vector<unsigned int> uncombined_channel_weights;
+	bool include_uncombined_channels_in_combined_weights;
+};
+
+/**
+ * Service loop of the calculator task. Takes jobs off the queue until a
+ * hang-up message arrives; for each job it estimates coil sensitivities from
+ * the reference data on the GPU, computes the GRAPPA unmixing coefficients and
+ * hands them to the job's destination.
+ *
+ * Each queued job is a GrappaWeightsDescription message followed by a message
+ * holding the reference data.
+ *
+ * @return 0 after a hang-up; -1 if the hang-up could not be re-queued;
+ *         -2 / -3 if a queued message does not have the expected layout;
+ *         GADGET_FAIL if the computation or the destination update fails.
+ *         The dequeued message is released on every one of these paths.
+ */
+template <class T> int GrappaWeightsCalculator<T>::svc(void)  {
+	ACE_TRACE(( ACE_TEXT("GrappaWeightsCalculator::svc") ));
+
+	ACE_Message_Block *mb;
+
+	while (this->getq(mb) >= 0) {
+		if (mb->msg_type() == ACE_Message_Block::MB_HANGUP) {
+			GADGET_DEBUG1("Hanging up in weights calculator\n");
+			// The hang-up is put back on the queue before leaving the loop.
+			if (this->putq(mb) == -1) {
+				mb->release();
+				ACE_ERROR_RETURN( (LM_ERROR,
+						ACE_TEXT("%p\n"),
+						ACE_TEXT("GrappaWeightsCalculator::svc, putq")),
+						-1);
+			}
+			break;
+		}
+
+		GadgetContainerMessage< GrappaWeightsDescription<T> >* mb1
+		= AsContainerMessage< GrappaWeightsDescription<T> >(mb);
+
+		if (!mb1) {
+			mb->release();
+			return -2;
+		}
+
+		GadgetContainerMessage< hoNDArray< std::complex<T> > >* mb2
+		= AsContainerMessage< hoNDArray< std::complex<T> > >(mb1->cont());
+
+		if (!mb2) {
+			mb->release();
+			return -3;
+		}
+
+		// NOTE(review): this cast treats std::complex<T> storage as
+		// float_complext, which only makes sense for T = float (the only
+		// instantiation in this file).
+		hoNDArray<float_complext>* host_data =
+				reinterpret_cast< hoNDArray<float_complext>* >(mb2->getObjectPtr());
+
+		// Copy the image data to the device
+		cuNDArray<float_complext> device_data(host_data);
+		device_data.squeeze();
+
+		// Transform along dimensions 0 and 1.
+		std::vector<size_t> ftdims(2,0); ftdims[1] = 1;
+
+		//Go to image space
+		cuNDFFT<float>::instance()->ifft( &device_data, &ftdims);
+
+		size_t RO = device_data.get_size(0);
+		size_t E1 = device_data.get_size(1);
+		size_t CHA = device_data.get_size(2);
+
+		// Parameters and scratch arrays for the coil map estimation.
+		size_t ks = 5;
+		size_t power = 3;
+
+		cuNDArray<complext<float> > D(RO*E1, ks*ks, CHA);
+		cuNDArray<complext<float> > DH_D(RO*E1, CHA, CHA);
+		cuNDArray<complext<float> > V1(RO*E1, CHA);
+		cuNDArray<complext<float> > U1(RO*E1, ks*ks);
+
+		// Compute CSM
+		cuNDArray<float_complext> csm;
+		csm.create(device_data.get_dimensions());
+		{
+			//GPUTimer timer("GRAPPA CSM");
+			// csm = estimate_b1_map<float,2>( &device_data, target_coils_ );
+
+			estimate_b1_map_2D_NIH_Souheil( &device_data, &csm, ks, power, D, DH_D, V1, U1 );
+
+			//GADGET_DEBUG2("Coils in csm: %d\n", csm->get_size(2));
+		}
+		//Go back to kspace
+		cuNDFFT<float>::instance()->fft(&device_data, &ftdims);
+
+		cuNDArray<complext<float> > unmixing_dev;
+		boost::shared_ptr< std::vector<size_t> > data_dimensions = device_data.get_dimensions();
+
+		// With uncombined channels there is one extra weight set per such
+		// channel in addition to the combined one.
+		if (uncombined_channels_.size() > 0) {
+			data_dimensions->push_back(uncombined_channels_.size()+1);
+		}
+
+		try{unmixing_dev.create(data_dimensions.get());}
+		catch (std::runtime_error &err){
+			GADGET_DEBUG_EXCEPTION(err,"Unable to allocate device memory for unmixing coeffcients\n");
+			mb->release(); // the job was taken off the queue, so it is ours to free
+			return GADGET_FAIL;
+		}
+
+		{
+			//GPUTimer unmix_timer("GRAPPA Unmixing");
+			//GadgetronTimer timer("GRAPPA unmixing", true);
+			std::vector<unsigned int> kernel_size;
+
+			//TODO: Add parameters for kernel size
+			kernel_size.push_back(5);
+			kernel_size.push_back(4);
+			if ( htgrappa_calculate_grappa_unmixing(reinterpret_cast< cuNDArray<complext<float> >* >(&device_data),
+					&csm,
+					(unsigned int)(mb1->getObjectPtr()->acceleration_factor),
+					&kernel_size,
+					&unmixing_dev,
+					&(mb1->getObjectPtr()->sampled_region),
+					&uncombined_channels_) < 0) {
+				GADGET_DEBUG1("GRAPPA unmixing coefficients calculation failed\n");
+				mb->release();
+				return GADGET_FAIL;
+			}
+		}
+
+		if (mb1->getObjectPtr()->destination) {
+			boost::shared_ptr< hoNDArray<complext<float> > > unmixing_host = unmixing_dev.to_host();
+
+			//TODO: This reshaping needs to take uncombined channels into account
+			boost::shared_ptr< std::vector<size_t> > tmp_dims = mb2->getObjectPtr()->get_dimensions();
+			if (uncombined_channels_.size()) tmp_dims->push_back( (size_t) (uncombined_channels_.size()+1) );
+
+			// A failed reshape is only logged; the weights are still handed
+			// over in their current shape.
+			try {
+				unmixing_host->reshape(tmp_dims.get());
+			} catch (std::runtime_error &err){
+				GADGET_DEBUG_EXCEPTION( err, "Reshaping of GRAPPA weights failed \n" );
+
+			}
+
+			if (mb1->getObjectPtr()->destination->update(reinterpret_cast<hoNDArray<std::complex<float> >* >(unmixing_host.get())) < 0) {
+				GADGET_DEBUG1("Update of GRAPPA weights failed\n");
+				mb->release();
+				return GADGET_FAIL;
+			}
+		} else {
+			GADGET_DEBUG1("Undefined GRAPPA weights destination\n");
+			mb->release();
+			return GADGET_FAIL;
+		}
+
+
+		mb->release();
+	}
+
+	return 0;
+}
+
+/**
+ * Shuts the calculator down when called with flags == 1: a hang-up message is
+ * queued and the call then waits for the service thread.
+ *
+ * @param flags only the value 1 triggers the shutdown; anything else is a no-op
+ * @return 0 when nothing was done, -1 if the hang-up could not be queued,
+ *         otherwise the result of wait()
+ */
+template <class T> int GrappaWeightsCalculator<T>::close(unsigned long flags) {
+	ACE_TRACE(( ACE_TEXT("GrappaWeightsCalculator::close") ));
+
+	if (flags != 1) {
+		return 0;
+	}
+
+	ACE_Message_Block *stop_msg = new ACE_Message_Block();
+	stop_msg->msg_type( ACE_Message_Block::MB_HANGUP );
+
+	const bool queued = (this->putq(stop_msg) != -1);
+	if (!queued) {
+		// Never made it onto the queue, so it is still ours to free.
+		stop_msg->release();
+		ACE_ERROR_RETURN( (LM_ERROR,
+				ACE_TEXT("%p\n"),
+				ACE_TEXT("GrappaWeightsCalculator::close, putq")),
+				-1);
+	}
+
+	//GADGET_DEBUG1("Waiting for weights calculator to finish\n");
+	return this->wait();
+}
+
+
+/**
+ * Queues a weights-calculation job for the service thread.
+ *
+ * The reference data is copied, so the caller keeps ownership of ref_data.
+ *
+ * @param ref_data reference data to compute the weights from
+ * @param sampled_region see GrappaWeightsDescription::sampled_region
+ * @param acceleration_factor acceleration factor handed to the calculation
+ * @param destination object that receives the computed weights
+ * @param uncombined_channel_weights stored in the job description
+ * @param include_uncombined_channels_in_combined_weights stored in the job description
+ * @return 0 on success; -1 / -2 if a message could not be allocated; -3 if
+ *         the copy of the reference data could not be allocated; -4 if the
+ *         job could not be put on the queue
+ */
+template <class T> int GrappaWeightsCalculator<T>::
+add_job( hoNDArray< std::complex<T> >* ref_data,
+		std::vector< std::pair<unsigned int, unsigned int> > sampled_region,
+		unsigned int acceleration_factor,
+		boost::shared_ptr< GrappaWeights<T> > destination,
+		std::vector<unsigned int> uncombined_channel_weights,
+		bool include_uncombined_channels_in_combined_weights)
+{
+	GadgetContainerMessage< GrappaWeightsDescription<T> >* mb1 =
+			new GadgetContainerMessage< GrappaWeightsDescription<T> >();
+
+	if (!mb1) {
+		return -1;
+	}
+
+	mb1->getObjectPtr()->sampled_region = sampled_region;
+	mb1->getObjectPtr()->acceleration_factor = acceleration_factor;
+	mb1->getObjectPtr()->destination = destination;
+	mb1->getObjectPtr()->uncombined_channel_weights = uncombined_channel_weights;
+	mb1->getObjectPtr()->include_uncombined_channels_in_combined_weights =
+			include_uncombined_channels_in_combined_weights;
+
+	GadgetContainerMessage< hoNDArray< std::complex<T> > >* mb2 =
+			new GadgetContainerMessage< hoNDArray< std::complex<T> > >();
+
+	if (!mb2) {
+		mb1->release();
+		return -2;
+	}
+
+	// From here on releasing mb1 releases the data message as well.
+	mb1->cont(mb2);
+
+	try{mb2->getObjectPtr()->create(ref_data->get_dimensions().get());}
+	catch (std::runtime_error &){
+		mb1->release();
+		return -3;
+	}
+
+	memcpy(mb2->getObjectPtr()->get_data_ptr(), ref_data->get_data_ptr(),
+			ref_data->get_number_of_elements()*sizeof(std::complex<T>));
+
+	// A job that never reaches the queue would otherwise leak and be reported
+	// as accepted.
+	if (this->putq(mb1) < 0) {
+		mb1->release();
+		return -4;
+	}
+
+	return 0;
+}
+
+// Adds channel_id to the list of uncombined channels. Always returns 0.
+template <class T> int GrappaWeightsCalculator<T>::add_uncombined_channel(unsigned int channel_id)
+		{
+	// Remove any existing entry first so the id is never listed twice.
+	remove_uncombined_channel(channel_id);
+	uncombined_channels_.push_back(channel_id);
+	return 0;
+		}
+
+// Removes every occurrence of channel_id from the list of uncombined channels.
+// Always returns 0, whether or not the id was present.
+template <class T> int GrappaWeightsCalculator<T>::remove_uncombined_channel(unsigned int channel_id)
+		{
+	uncombined_channels_.remove(channel_id);
+	return 0;
+		}
+
+
+
+template class EXPORTGADGETSGRAPPA GrappaWeightsDescription<float>;
+template class EXPORTGADGETSGRAPPA GrappaWeightsCalculator<float>;
+//template class EXPORTGADGETSGRAPPA GrappaWeightsCalculator<double>; //TODO
+//template class EXPORTGADGETSGRAPPA GrappaWeightsDescription<double>;
+
+}
diff --git a/gadgets/grappa/GrappaWeightsCalculator.h b/gadgets/grappa/GrappaWeightsCalculator.h
new file mode 100644
index 0000000..3bfa7ee
--- /dev/null
+++ b/gadgets/grappa/GrappaWeightsCalculator.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include "gadgetron_grappa_export.h"
+#include "GrappaWeights.h"
+
+#include <ace/Task.h>
+#include <list>
+
+namespace Gadgetron{
+
+/**
+ * ACE task that computes GRAPPA weights on its own service thread. Jobs are
+ * queued with add_job() and processed by svc(); close(1) asks the thread to
+ * stop and waits for it.
+ */
+template <class T> class EXPORTGADGETSGRAPPA GrappaWeightsCalculator : public ACE_Task<ACE_MT_SYNCH>
+{
+  typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+ public:
+  GrappaWeightsCalculator() 
+    : inherited()
+  	, target_coils_(0)
+   {
+    ACE_TRACE(( ACE_TEXT("GrappaWeightsCalculator::GrappaWeightsCalculator") ));
+  }
+
+  virtual ~GrappaWeightsCalculator() { }
+
+  // No initialization required; always returns 0.
+  virtual int init(void)
+  {
+    ACE_TRACE(( ACE_TEXT("GrappaWeightsCalculator::init") ));
+    return 0;
+  }
+
+  // Starts a single joinable service thread running svc().
+  virtual int open(void* = 0) 
+  {
+    ACE_TRACE(( ACE_TEXT("GrappaWeightsCalculator::open") ));
+    return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+  }
+
+  virtual int close(unsigned long flags);
+  virtual int svc(void);
+
+  // Queues one calculation; the results are delivered to destination.
+  // Returns 0 on success and a negative value on failure.
+  virtual int add_job( hoNDArray< std::complex<T> >* ref_data,
+		       std::vector< std::pair<unsigned int, unsigned int> > sampled_region,
+		       unsigned int acceleration_factor,
+		       boost::shared_ptr<GrappaWeights<T> > destination,
+		       std::vector<unsigned int> uncombined_channel_weights,
+		       bool include_uncombined_channels_in_combined_weights = true);
+
+  virtual int add_uncombined_channel(unsigned int channel_id);
+  virtual int remove_uncombined_channel(unsigned int channel_id);
+  // Number of entries in the uncombined channel list (converted to int).
+  virtual int get_number_of_uncombined_channels() {
+    return uncombined_channels_.size();
+  }
+
+  virtual int get_number_of_target_coils() {
+	  return target_coils_;
+  }
+
+  virtual void set_number_of_target_coils(int n) {
+	  target_coils_ = n;
+  }
+
+ private:
+  // Channel ids kept uncombined; add_uncombined_channel() keeps them unique.
+  std::list<unsigned int> uncombined_channels_;
+  // Plain stored setting; defaults to 0.
+  int target_coils_;
+};
+}
diff --git a/gadgets/grappa/config/CMakeLists.txt b/gadgets/grappa/config/CMakeLists.txt
new file mode 100644
index 0000000..a06dc16
--- /dev/null
+++ b/gadgets/grappa/config/CMakeLists.txt
@@ -0,0 +1,5 @@
+# The "unoptimized" stream configurations are always installed.
+install (FILES grappa_unoptimized.xml grappa_unoptimized_float.xml DESTINATION config)
+
+# grappa.xml and grappa_float.xml are only installed when Armadillo was found.
+# NOTE(review): presumably because the extra gadgets they use (PCA / coil
+# reduction) depend on Armadillo -- confirm against the mri_core build.
+if(ARMADILLO_FOUND)
+  install (FILES grappa.xml grappa_float.xml DESTINATION config)
+endif(ARMADILLO_FOUND)
diff --git a/gadgets/grappa/config/grappa.xml b/gadgets/grappa/config/grappa.xml
new file mode 100644
index 0000000..84ae791
--- /dev/null
+++ b/gadgets/grappa/config/grappa.xml
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+   <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+ 
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--    
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_float.xml b/gadgets/grappa/config/grappa_float.xml
new file mode 100644
index 0000000..b18039a
--- /dev/null
+++ b/gadgets/grappa/config/grappa_float.xml
@@ -0,0 +1,118 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+   <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    -->
+    
+    <!--
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+     -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    -->
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_unoptimized.xml b/gadgets/grappa/config/grappa_unoptimized.xml
new file mode 100644
index 0000000..bb40ffd
--- /dev/null
+++ b/gadgets/grappa/config/grappa_unoptimized.xml
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+ 
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--    
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/config/grappa_unoptimized_float.xml b/gadgets/grappa/config/grappa_unoptimized_float.xml
new file mode 100644
index 0000000..002d5db
--- /dev/null
+++ b/gadgets/grappa/config/grappa_unoptimized_float.xml
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+     <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>RemoveROOversampling</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Grappa</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaGadget</classname>
+      <property><name>target_coils</name><value>8</value></property>
+    </gadget>
+
+    <gadget>
+      <name>GrappaUnmixing</name>
+      <dll>gadgetron_grappa</dll>
+      <classname>GrappaUnmixingGadget</classname>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+    -->
+    
+    <!--
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+     -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    -->
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/grappa/gadgetron_grappa_export.h b/gadgets/grappa/gadgetron_grappa_export.h
new file mode 100644
index 0000000..457a5bc
--- /dev/null
+++ b/gadgets/grappa/gadgetron_grappa_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_GRAPPA_EXPORT_H_
+#define GADGETRON_GRAPPA_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GRAPPA__)
+#define EXPORTGADGETSGRAPPA __declspec(dllexport)
+#else
+#define EXPORTGADGETSGRAPPA __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSGRAPPA
+#endif
+
+#endif /* GADGETRON_GRAPPA_EXPORT_H_ */
diff --git a/gadgets/gtPlus/CMakeLists.txt b/gadgets/gtPlus/CMakeLists.txt
new file mode 100644
index 0000000..51eb9a9
--- /dev/null
+++ b/gadgets/gtPlus/CMakeLists.txt
@@ -0,0 +1,99 @@
+
+include_directories(   
+    ${CMAKE_SOURCE_DIR}/gadgets/core
+    ${ACE_INCLUDE_DIR} 
+    ${Boost_INCLUDE_DIR}
+    ${ISMRMRD_INCLUDE_DIR}
+    ${ISMRMRD_SCHEMA_DIR}
+    ${ISMRMRD_XSD_INCLUDE_DIR}
+    ${XSD_INCLUDE_DIR}
+    ${FFTW3_INCLUDE_DIR}
+    ${ARMADILLO_INCLUDE_DIRS}
+    ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators
+    ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+    ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+    ${CMAKE_SOURCE_DIR}/gadgets/core
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+    ${HDF5_INCLUDE_DIR}
+    ${HDF5_INCLUDE_DIR}/cpp
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+    ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+    ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+    ${CMAKE_SOURCE_DIR}/gadgets/gtPlus 
+  )
+
+IF (WIN32)
+    ADD_DEFINITIONS(-DTIXML_USE_STL)
+    ADD_DEFINITIONS(-D__BUILD_GADGETS__)
+ENDIF (WIN32)
+
+if(WIN32)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+find_package(Ismrmrd REQUIRED)
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+
+add_library(gadgetronPlus SHARED
+            GtPlusGadgetExport.h
+            GtPlusGadgetImageArray.h
+            GtPlusGadgetImageArray.cpp
+            GtPlusAccumulatorWorkOrderTriggerGadget.h
+            GtPlusAccumulatorWorkOrderTriggerGadget.cpp
+            GtPlusGadgetOpenMP.h
+            GtPlusGadgetOpenMP.cpp
+            GtPlusReconGadget.h
+            GtPlusReconGadget.cpp
+            GtPlusRecon2DTGadget.h
+            GtPlusRecon2DTGadget.cpp
+            GtPlusRecon3DTGadget.h
+            GtPlusRecon3DTGadget.cpp
+            GtPlusRecon2DTGadgetCloud.h
+            GtPlusRecon2DTGadgetCloud.cpp
+            GtPlusRecon2DTCloudPackage.h
+            GadgetCloudJobMessageReadWrite.h
+            GadgetCloudJobMessageReadWrite.cpp
+            GtPlusReconJob2DTGadget.h 
+            GtPlusReconJob2DTGadget.cpp 
+            GtPlusReconJob3DTGadget.h 
+            GtPlusReconJob3DTGadget.cpp 
+            GtPlusReconJob2DTGadgetCloud.h 
+            GtPlusReconJob2DTGadgetCloud.cpp 
+            ${ISMRMRD_XSD_SOURCE} )
+
+target_link_libraries(gadgetronPlus 
+    cpucore 
+    cpucore_math 
+    gtplus 
+    gadgettools 
+    ${MKL_LIBRARIES} 
+    ${Boost_LIBRARIES}
+    ${ISMRMRD_LIBRARIES} ${FFTW3_LIBRARIES} 
+    optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+    ${XERCESC_LIBRARIES} )
+
+    if (CUDA_FOUND)
+        target_link_libraries(gadgetronPlus gpuparallelmri)
+    endif (CUDA_FOUND)
+
+install (FILES 
+        GtPlusGadgetExport.h
+        DESTINATION include)
+
+install(TARGETS gadgetronPlus DESTINATION lib)
+# install(FILES default.xml default_short.xml DESTINATION config)
diff --git a/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp
new file mode 100644
index 0000000..4e8ac2c
--- /dev/null
+++ b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.cpp
@@ -0,0 +1,11 @@
+
+#include "GadgetCloudJobMessageReadWrite.h"
+
+namespace Gadgetron
+{
+    GADGETRON_READER_FACTORY_DECLARE(GtPlusCloudJobMessageReaderCPFL)
+    GADGETRON_WRITER_FACTORY_DECLARE(GtPlusCloudJobMessageWriterCPFL)
+
+    GADGETRON_READER_FACTORY_DECLARE(GtPlus2DTGadgetCloudJobMessageReaderCPFL)
+    GADGETRON_WRITER_FACTORY_DECLARE(GtPlus2DTGadgetCloudJobMessageWriterCPFL)
+}
diff --git a/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h
new file mode 100644
index 0000000..16a946a
--- /dev/null
+++ b/gadgets/gtPlus/GadgetCloudJobMessageReadWrite.h
@@ -0,0 +1,246 @@
+/** \file   GadgetCloudJobMessageReadWrite.h
+    \brief  Implement the reader/writer for the GtPlus cloud job data package.
+            This implementation requires the cloud job type to support the serialize and deserialize functions.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusGadgetExport.h"
+
+#include "GadgetImageMessageReader.h"
+#include "GadgetImageMessageWriter.h"
+#include "gtPlusISMRMRDReconWorker.h"
+#include "GtPlusRecon2DTCloudPackage.h"
+
+namespace Gadgetron
+{
+
+    /// Reads a cloud job from a socket (job id, payload size, payload); JobType must provide deserialize(buf, len).
+    template <typename JobType>
+    class GadgetCloudJobMessageReader : public GadgetMessageReader
+    {
+    public:
+        /// Returns the job id message with the job as its continuation, or 0 on failure (the chain is released first).
+        virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream)
+        {
+            GadgetContainerMessage<int>* jobID = new GadgetContainerMessage<int>();
+            GadgetContainerMessage<JobType>* job = new GadgetContainerMessage<JobType>();
+            jobID->cont(job); // job is now chained to jobID: releasing jobID releases both blocks
+
+            int id = 0;
+            size_t sizeOfJob = 0;
+            ssize_t recv_count = 0;
+
+            if ((recv_count = stream->recv_n(&id, sizeof(int))) <= 0)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageReader, failed to read job id\n")) );
+                jobID->release();
+                return 0;
+            }
+
+            *(jobID->getObjectPtr()) = id;
+
+            if ((recv_count = stream->recv_n(&sizeOfJob, sizeof(size_t))) <= 0)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageReader, failed to read job size\n")) );
+                jobID->release();
+                return 0;
+            }
+
+            hoNDArray<char> jobBuf;
+            try
+            {
+                jobBuf.create(sizeOfJob);
+            }
+            catch(...)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageReader, failed to allocate memory\n")) );
+                jobID->release();
+                return 0;
+            }
+
+            size_t maxBytesPerSend = 512.0*1024*1024; // larger payloads are received in chunks of this size
+            if ( sizeOfJob > maxBytesPerSend )
+            {
+                size_t receivedBytes = 0;
+                size_t receivingBytes = maxBytesPerSend;
+                while ( receivingBytes > 0 )
+                {
+                    if ((recv_count = stream->recv_n(jobBuf.get_data_ptr()+receivedBytes, receivingBytes)) <= 0)
+                    {
+                        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageReader, failed to read data from socket\n")) );
+                        jobID->release();
+                        return 0;
+                    }
+
+                    receivedBytes += receivingBytes;
+                    if ( receivedBytes >= sizeOfJob ) break;
+                    if ( sizeOfJob-receivedBytes < maxBytesPerSend )
+                    {
+                        receivingBytes = sizeOfJob-receivedBytes;
+                    }
+                }
+            }
+            else
+            {
+                if ((recv_count = stream->recv_n(jobBuf.get_data_ptr(), sizeOfJob)) <= 0)
+                {
+                    ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageReader, failed to read data from socket\n")) );
+                    jobID->release();
+                    return 0;
+                }
+            }
+
+            if ( !job->getObjectPtr()->deserialize(jobBuf.get_data_ptr(), sizeOfJob) )
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageReader, failed to deserialize the job\n")) );
+                jobID->release();
+                return 0;
+            }
+            return jobID;
+        }
+    };
+
+    /// Writes a cloud job to a socket: message identifier, job id, payload size, then the serialized payload.
+    template <typename JobType> 
+    class GadgetCloudJobMessageWriter : public GadgetMessageWriter
+    {
+    public:
+        /// Identifier sent ahead of every job; the constructor sets it to GADGET_MESSAGE_CLOUD_JOB.
+        ACE_UINT16 msg_id_;
+
+        GadgetCloudJobMessageWriter() : msg_id_(GADGET_MESSAGE_CLOUD_JOB) {}
+
+        /// mb must be a GadgetContainerMessage<int> (job id) whose cont() is the job; returns 0 on success, -1 on error.
+        virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb) 
+        {
+            GadgetContainerMessage<int>* m1 = 
+                dynamic_cast< GadgetContainerMessage<int>* >(mb);
+
+            GadgetContainerMessage<JobType>* job = 
+                dynamic_cast< GadgetContainerMessage<JobType>* >(mb ? mb->cont() : 0);
+
+            if ( !m1 || !job )
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), GadgetCloudJobMessageWriter invalid image message objects")) );
+                return -1;
+            }
+
+            int jobID = *(m1->getObjectPtr());
+
+            ssize_t send_cnt = 0;
+            GadgetMessageIdentifier id;
+            id.id = msg_id_;
+
+            if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0)
+            {
+                ACE_DEBUG((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send job message identifier\n")));
+                return -1;
+            }
+
+            if ((send_cnt = sock->send_n (&jobID, sizeof(int))) <= 0)
+            {
+                ACE_DEBUG((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send job id\n")));
+                return -1;
+            }
+
+            size_t sizeOfJob=0;
+            char* buf = NULL;
+            if ( !job->getObjectPtr()->serialize(buf, sizeOfJob) )
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudJobMessageWriter, failed to serialize the job\n")) );
+                return -1;
+            }
+
+            if ((send_cnt = sock->send_n (&sizeOfJob, sizeof(size_t))) <= 0)
+            {
+                ACE_DEBUG ((LM_ERROR,
+                    ACE_TEXT ("(%P|%t) Unable to send job size\n")));
+                delete [] buf;
+                return -1;
+            }
+
+            GADGET_DEBUG2("--> send job, size of job : %f MBytes ... \n", sizeOfJob/1024.0/1024);
+
+            size_t maxBytesPerSend = 512.0*1024*1024;
+
+            if ( sizeOfJob > maxBytesPerSend )
+            {
+                size_t sentBytes = 0;
+                size_t sendingBytes = maxBytesPerSend;
+
+                while ( sendingBytes > 0 )
+                {
+                    if ((send_cnt = sock->send_n (buf+sentBytes, sendingBytes)) <= 0)
+                    {
+                        ACE_DEBUG ((LM_ERROR,
+                            ACE_TEXT ("(%P|%t) Unable to send job data\n")));
+                        delete [] buf;
+                        return -1;
+                    }
+
+                    sentBytes += sendingBytes;
+                    if ( sentBytes >= sizeOfJob ) break;
+
+                    if ( sizeOfJob-sentBytes < maxBytesPerSend )
+                    {
+                        sendingBytes = sizeOfJob-sentBytes;
+                    }
+                }
+            }
+            else
+            {
+                if ((send_cnt = sock->send_n (buf, sizeOfJob)) <= 0)
+                {
+                    ACE_DEBUG ((LM_ERROR,
+                        ACE_TEXT ("(%P|%t) Unable to send job data\n")));
+                    delete [] buf;
+                    return -1;
+                }
+            }
+
+            delete [] buf;
+
+            return 0;
+        }
+
+    };
+
+    typedef Gadgetron::gtPlus::gtPlusReconJob2DT< std::complex<float> > GtPlusReconJobTypeCPFL;
+
+    class EXPORTGTPLUSGADGET GtPlusCloudJobMessageReaderCPFL : public GadgetCloudJobMessageReader<GtPlusReconJobTypeCPFL>
+    {
+    public:
+        GADGETRON_READER_DECLARE(GtPlusCloudJobMessageReaderCPFL); // reader class: pairs with GADGETRON_READER_FACTORY_DECLARE in the .cpp
+    };
+
+    class EXPORTGTPLUSGADGET GtPlusCloudJobMessageWriterCPFL : public GadgetCloudJobMessageWriter<GtPlusReconJobTypeCPFL>
+    {
+    public:
+        GADGETRON_WRITER_DECLARE(GtPlusCloudJobMessageWriterCPFL);
+    };
+
+    // gadget level cloud computing
+
+    class EXPORTGTPLUSGADGET GtPlus2DTGadgetCloudJobMessageReaderCPFL : public GadgetCloudJobMessageReader<GtPlusRecon2DTCloudPackageCPFL>
+    {
+    public:
+        GADGETRON_READER_DECLARE(GtPlus2DTGadgetCloudJobMessageReaderCPFL); // reader class: pairs with GADGETRON_READER_FACTORY_DECLARE in the .cpp
+    };
+
+    class EXPORTGTPLUSGADGET GtPlus2DTGadgetCloudJobMessageWriterCPFL : public GadgetCloudJobMessageWriter<GtPlusRecon2DTCloudPackageCPFL>
+    {
+    public:
+
+        typedef GadgetCloudJobMessageWriter<GtPlusRecon2DTCloudPackageCPFL> BaseClass;
+
+        GtPlus2DTGadgetCloudJobMessageWriterCPFL() : BaseClass()
+        {
+            msg_id_ = GADGET_MESSAGE_GADGETCLOUD_JOB;
+        }
+
+        GADGETRON_WRITER_DECLARE(GtPlus2DTGadgetCloudJobMessageWriterCPFL);
+    };
+}
diff --git a/gadgets/gtPlus/GadgetMRIHeaders.cpp b/gadgets/gtPlus/GadgetMRIHeaders.cpp
new file mode 100644
index 0000000..72b6205
--- /dev/null
+++ b/gadgets/gtPlus/GadgetMRIHeaders.cpp
@@ -0,0 +1,262 @@
+
+#include "GadgetMRIHeaders.h"
+
+// --------------------------------------------------------------------
+
+LoopCounters::LoopCounters() 
+{
+    line = 0;
+    acquisition = 0;
+    slice = 0;
+    partition = 0;
+    echo = 0;
+    phase = 0;
+    repetition = 0;
+    set = 0;
+    segment = 0;
+    channel = 0;
+}
+
+LoopCounters::~LoopCounters() {}
+
+void LoopCounters::dump()
+{
+    std::cout << "[Line Cha Slice Partition Echo Phase Rep Set Seg] = [" 
+                    << line 
+                    << " " << channel 
+                    << " " << slice 
+                    << " " << partition 
+                    << " " << echo 
+                    << " " << phase 
+                    << " " << repetition 
+                    << " " << set 
+                    << " " << segment << "]" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+GadgetMessageAcquisition::GadgetMessageAcquisition() 
+{
+    flags = 0;
+    meas_uid = 0;
+    scan_counter = 0;
+    time_stamp = 0;
+    pmu_time_stamp = 0;
+    samples = 0;
+    channels = 0;
+    centre_column = 0;
+    position[0] = 0.0f; position[1] = 0.0f; position[2] = 0.0f;
+    quarternion[0] = 1.0f; quarternion[1] = 0.0f; quarternion[2] = 0.0f; quarternion[3] = 0.0f;
+    table_position = 0.0f;
+}
+
+GadgetMessageAcquisition::~GadgetMessageAcquisition() {}
+
+float GadgetMessageAcquisition::get_position(unsigned int index) 
+{
+    if (index < 3) 
+    {
+        return position[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageAcquisition::set_position(unsigned int index, float pos)
+{
+    if (index < 3)
+    {
+        position[index] = pos;
+    }
+}
+
+float GadgetMessageAcquisition::get_quarternion(unsigned int index) 
+{
+    if (index < 4) 
+    {
+        return quarternion[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageAcquisition::set_quarternion(unsigned int index, float quar)
+{
+    if (index < 4) 
+    {
+        quarternion[index] = quar;
+    }
+}
+
+void GadgetMessageAcquisition::dump()
+{
+    std::cout << "GadgetMessageAcquisition" << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    std::cout << "flags            : " << flags << std::endl;
+    std::cout << "meas_uid         : " << meas_uid << std::endl;
+    std::cout << "scan_counter     : " << scan_counter << std::endl;
+    std::cout << "time_stamp       : " << time_stamp << std::endl;
+    std::cout << "pmu_time_stamp   : " << pmu_time_stamp << std::endl;
+    std::cout << "samples          : " << samples << std::endl;
+    std::cout << "channels         : " << channels << std::endl;
+    std::cout << "position         : " << position[0] << " " << position[1] << " " << position[2] << std::endl;
+    std::cout << "quarternion      : " << quarternion[0] << " " << quarternion[1] << " " << quarternion[2] << " " << quarternion[3] << std::endl;
+    std::cout << "table_position   : " << table_position << std::endl;
+    std::cout << "idx     : ";            idx.dump();
+    std::cout << "min_idx : ";            min_idx.dump();
+    std::cout << "max_idx : ";            max_idx.dump();
+    std::cout << "----------------------------------------------------------" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+GadgetMessageImage::GadgetMessageImage()
+{
+    flags = 0;
+
+    matrix_size[0] = 0;
+    matrix_size[1] = 0;
+    matrix_size[2] = 0;
+
+    channels = 0;
+
+    position[0] = 0.0f;
+    position[1] = 0.0f;
+    position[2] = 0.0f;
+
+    quarternion[0] = 1.0f;
+    quarternion[1] = 0.0f;
+    quarternion[2] = 0.0f;
+    quarternion[3] = 0.0f;
+
+    table_position = 0.0f;
+
+    time_stamp = 0;
+    pmu_time_stamp = 0;
+    image_format = 0;
+    image_type = 0;
+    image_index = 0;
+    image_series_index = 0;
+}
+
+GadgetMessageImage::~GadgetMessageImage() {}
+
+void GadgetMessageImage::copy(GadgetMessageImage& aMessageImage)
+{
+    flags = aMessageImage.flags;
+
+    matrix_size[0] = aMessageImage.matrix_size[0];
+    matrix_size[1] = aMessageImage.matrix_size[1];
+    matrix_size[2] = aMessageImage.matrix_size[2];
+
+    channels = aMessageImage.channels;
+
+    position[0] = aMessageImage.position[0];
+    position[1] = aMessageImage.position[1];
+    position[2] = aMessageImage.position[2];
+
+    quarternion[0] = aMessageImage.quarternion[0];
+    quarternion[1] = aMessageImage.quarternion[1];
+    quarternion[2] = aMessageImage.quarternion[2];
+    quarternion[3] = aMessageImage.quarternion[3];
+
+    table_position = aMessageImage.table_position;
+
+    time_stamp = aMessageImage.time_stamp;
+    pmu_time_stamp = aMessageImage.pmu_time_stamp;
+    image_format = aMessageImage.image_format;
+    image_type = aMessageImage.image_type;
+    image_index = aMessageImage.image_index;
+    image_series_index = aMessageImage.image_series_index;
+}
+
+ACE_UINT16 GadgetMessageImage::get_matrix_size(unsigned int index) 
+{
+    if (index < 3) 
+    {
+        return matrix_size[index];
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+void GadgetMessageImage::set_matrix_size(unsigned int index, ACE_UINT16 size)
+{
+    if (index < 3) 
+    {
+        matrix_size[index] = size;
+    }
+}
+
+float GadgetMessageImage::get_position(unsigned int index) 
+{
+    if (index < 3) 
+    {
+        return position[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageImage::set_position(unsigned int index, float pos)
+{
+    if (index < 3)
+    {
+        position[index] = pos;
+    }
+}
+
+float GadgetMessageImage::get_quarternion(unsigned int index)
+{
+    if (index < 4)
+    {
+        return quarternion[index];
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+void GadgetMessageImage::set_quarternion(unsigned int index, float quar)
+{
+    if (index < 4)
+    {
+        quarternion[index] = quar;
+    }
+}
+
+void GadgetMessageImage::dumpInfo()
+{
+    std::cout << "flags                 : " << flags << std::endl;
+    std::cout << "matrix_size           : " << matrix_size[0] << " " << matrix_size[1] << " " << matrix_size[2] << std::endl;
+    std::cout << "channels              : " << channels << std::endl;
+    std::cout << "position              : " << position[0] << " " << position[1] << " " << position[2] << std::endl;
+    std::cout << "quarternion           : " << quarternion[0] << " " << quarternion[1] << " " << quarternion[2] << " " << quarternion[3] << std::endl;
+    std::cout << "table_position        : " << table_position << std::endl;
+    std::cout << "data_idx_min          : ";   data_idx_min.dump();
+    std::cout << "data_idx_max          : ";   data_idx_max.dump();
+    std::cout << "data_idx_current      : ";   data_idx_current.dump();
+    std::cout << "time_stamp            : " << time_stamp << std::endl;
+    std::cout << "pmu_time_stamp        : " << pmu_time_stamp << std::endl;
+    std::cout << "image_format          : " << image_format << std::endl;
+    std::cout << "image_type            : " << image_type << std::endl;
+    std::cout << "image_index           : " << image_index << std::endl;
+    std::cout << "image_series_index    : " << image_series_index << std::endl;
+}
+
+void GadgetMessageImage::dump()
+{
+    std::cout << "GadgetMessageImage" << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    dumpInfo();
+    std::cout << "----------------------------------------------------------" << std::endl;
+}
diff --git a/gadgets/gtPlus/GadgetMRIHeadersExt.cpp b/gadgets/gtPlus/GadgetMRIHeadersExt.cpp
new file mode 100644
index 0000000..9c0d23a
--- /dev/null
+++ b/gadgets/gtPlus/GadgetMRIHeadersExt.cpp
@@ -0,0 +1,428 @@
+
+#include "GadgetMRIHeadersExt.h"
+#include "GadgetIsmrmrdReadWrite.h"
+// #include <iostream>
+
+// --------------------------------------------------------------------
+
+GadgetMessageImageExt::GadgetMessageImageExt() : ISMRMRD::ImageHeader()
+{
+    time_stamps.clear();
+    pmu_time_stamps.clear();
+}
+
+GadgetMessageImageExt::~GadgetMessageImageExt() { }
+
+void GadgetMessageImageExt::set_matrix_size(unsigned int index, ACE_UINT16 size)
+{
+    if (index < 3) 
+    {
+        matrix_size[index] = size;
+    }
+
+    if ( index == 1 )
+    {
+        time_stamps.clear();
+        time_stamps.resize(matrix_size[1], -1);
+        pmu_time_stamps.clear();
+        pmu_time_stamps.resize(matrix_size[1], -1);
+    }
+}
+
+void GadgetMessageImageExt::copy(GadgetMessageImageExt& aMessageImage)
+{
+    flags = aMessageImage.flags;
+
+    matrix_size[0] = aMessageImage.matrix_size[0];
+    matrix_size[1] = aMessageImage.matrix_size[1];
+    matrix_size[2] = aMessageImage.matrix_size[2];
+
+    channels = aMessageImage.channels;
+
+    position[0] = aMessageImage.position[0];
+    position[1] = aMessageImage.position[1];
+    position[2] = aMessageImage.position[2];
+
+    read_dir[0] = aMessageImage.read_dir[0];
+    read_dir[1] = aMessageImage.read_dir[1];
+    read_dir[2] = aMessageImage.read_dir[2];
+
+    phase_dir[0] = aMessageImage.phase_dir[0];
+    phase_dir[1] = aMessageImage.phase_dir[1];
+    phase_dir[2] = aMessageImage.phase_dir[2];
+
+    slice_dir[0] = aMessageImage.slice_dir[0];
+    slice_dir[1] = aMessageImage.slice_dir[1];
+    slice_dir[2] = aMessageImage.slice_dir[2];
+
+    patient_table_position[0] = aMessageImage.patient_table_position[0];
+    patient_table_position[1] = aMessageImage.patient_table_position[1];
+    patient_table_position[2] = aMessageImage.patient_table_position[2];
+
+    acquisition_time_stamp = aMessageImage.acquisition_time_stamp;
+
+    physiology_time_stamp[0] = aMessageImage.physiology_time_stamp[0];
+    physiology_time_stamp[1] = aMessageImage.physiology_time_stamp[1];
+    physiology_time_stamp[2] = aMessageImage.physiology_time_stamp[2];
+
+    image_data_type = aMessageImage.image_data_type;
+    image_type = aMessageImage.image_type;
+    image_index = aMessageImage.image_index;
+    image_series_index = aMessageImage.image_series_index;
+
+    memcpy(user_int, aMessageImage.user_int, sizeof(int32_t)*ISMRMRD_USER_INTS);
+    memcpy(user_float, aMessageImage.user_float, sizeof(float)*ISMRMRD_USER_FLOATS);
+
+    time_stamps = aMessageImage.time_stamps;
+    pmu_time_stamps = aMessageImage.pmu_time_stamps;
+}
+
+void GadgetMessageImageExt::dump()
+{
+    std::cout << "GadgetMessageImageExt" << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    //dumpInfo();
+    std::cout << "----------------------------------------------------------" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+// [Col Line Cha Slice Partition Echo Phase Rep Set Seg]
+//   0   1    2   3     4         5    6     7   8   9
+// store a scan with 10 dimensions
+GadgetMessageImageArray::GadgetMessageImageArray() // creates an empty array: no images, all sizes zero
+:   imageArray_(0),
+    kSpace_centre_col_no(0), 
+    kSpace_centre_line_no(0), 
+    kSpace_centre_partition_no(0), 
+    kSpace_max_acquired_col_no(0), 
+    kSpace_max_acquired_line_no(0), 
+    kSpace_max_acquired_partition_no(0)
+{
+    for ( unsigned int ii=0; ii<10; ii++ ) matrix_size[ii] = 0; // dump() and copy() read matrix_size, so it must not stay uninitialised
+}
+
+// Builds an array for the given 10 dimension sizes; one GadgetMessageImageExt is allocated per
+// combination of dimensions 3..9. Members are zero-initialised first so the destructor stays safe.
+GadgetMessageImageArray::GadgetMessageImageArray(int aSize[10])
+:   imageArray_(0), kSpace_centre_col_no(0), kSpace_centre_line_no(0), kSpace_centre_partition_no(0),
+    kSpace_max_acquired_col_no(0), kSpace_max_acquired_line_no(0), kSpace_max_acquired_partition_no(0)
+{
+    try
+    {
+        for ( unsigned int ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+        unsigned int len = 1;
+        for ( unsigned int ii=3; ii<10; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+        if ( len > 0 )
+        {
+            imageArray_ = new GadgetMessageImageExt[len];
+        }
+
+        kSpace_centre_col_no = matrix_size[0]/2;
+        kSpace_centre_line_no = matrix_size[1]/2;
+        kSpace_centre_partition_no = matrix_size[4]/2;
+        kSpace_max_acquired_col_no = matrix_size[0]-1;
+        kSpace_max_acquired_line_no = matrix_size[1]-1;
+        kSpace_max_acquired_partition_no = matrix_size[4]-1;
+    }
+    catch(...)
+    {
+        std::cout << "Failed in allocate imageArray_" << std::endl;
+    }
+}
+
+GadgetMessageImageArray::~GadgetMessageImageArray()
+{
+    if (imageArray_)
+    {
+        delete [] imageArray_;
+    }
+}
+
+void GadgetMessageImageArray::resize(int aSize[10])
+{
+    try
+    {
+        unsigned int ii;
+        for ( ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+
+        unsigned int len = 1;
+        for ( ii=3; ii<10; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        if ( imageArray_ ) 
+        {
+            delete [] imageArray_;
+            imageArray_ = NULL;
+        }
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GadgetMessageImageExt[len];
+        }
+
+        kSpace_centre_col_no = matrix_size[0]/2;
+        kSpace_centre_line_no = matrix_size[1]/2;
+        kSpace_centre_partition_no = matrix_size[4]/2;
+
+        kSpace_max_acquired_col_no = matrix_size[0]-1;
+        kSpace_max_acquired_line_no = matrix_size[1]-1;
+        kSpace_max_acquired_partition_no = matrix_size[4]-1;
+    }
+    catch(...)
+    {
+        std::cout << "Failed in resize GadgetMessageImageArray " << std::endl;
+    }
+}
+
+// Deep copy of imageArray: dimension sizes, k-space bookkeeping and every image header.
+void GadgetMessageImageArray::copy(GadgetMessageImageArray& imageArray)
+{
+    if ( this == &imageArray ) return; // self-copy would free the data about to be read
+
+    delete [] imageArray_; // deleting a null pointer is a no-op
+    imageArray_ = NULL;    // keeps the destructor safe when nothing is re-allocated below
+
+    unsigned int ii;
+    for ( ii=0; ii<10; ii++ )
+    {
+        matrix_size[ii] = imageArray.matrix_size[ii];
+    }
+
+    unsigned int len = 1;
+    for ( ii=3; ii<10; ii++ )
+    {
+        len *= matrix_size[ii];
+    }
+
+    kSpace_centre_col_no = imageArray.kSpace_centre_col_no;
+    kSpace_centre_line_no = imageArray.kSpace_centre_line_no;
+    kSpace_centre_partition_no = imageArray.kSpace_centre_partition_no;
+    kSpace_max_acquired_col_no = imageArray.kSpace_max_acquired_col_no;
+    kSpace_max_acquired_line_no = imageArray.kSpace_max_acquired_line_no;
+    kSpace_max_acquired_partition_no = imageArray.kSpace_max_acquired_partition_no;
+
+    if ( len == 0 || !imageArray.imageArray_ ) return; // source holds no images: nothing to copy
+    imageArray_ = new GadgetMessageImageExt[len];
+    for ( unsigned int i=0; i<len; i++ )
+    {
+        imageArray_[i] = imageArray.imageArray_[i];
+    }
+}
+
+int GadgetMessageImageArray::get_offset(int slc, int par, int eco, int phs, int rep, int set, int seg)
+{
+    int offset = seg*matrix_size[8]*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + set*matrix_size[7]*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + rep*matrix_size[6]*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + phs*matrix_size[5]*matrix_size[4]*matrix_size[3]
+                    + eco*matrix_size[4]*matrix_size[3]
+                    + par*matrix_size[3]
+                    + slc;
+    return offset;
+}
+
+void GadgetMessageImageArray::extractMessageImageArrayForSLC(int slc, GadgetMessageImageArray& imageArray)
+{
+    if ( slc >= matrix_size[3] )
+    {
+        std::cout << "extractMessageImageArrayForSLC error - slc >= matrix_size[3] " << std::endl;
+        return;
+    }
+
+    int aSize[10];
+
+    unsigned int ii;
+    for ( ii=0; ii<10; ii++ )
+    {
+        aSize[ii] = matrix_size[ii];
+    }
+
+    aSize[3] = 1;
+
+    imageArray.resize(aSize);
+
+    imageArray.kSpace_centre_col_no = kSpace_centre_col_no;
+    imageArray.kSpace_centre_line_no = kSpace_centre_line_no;
+    imageArray.kSpace_centre_partition_no = kSpace_centre_partition_no;
+    imageArray.kSpace_max_acquired_col_no = kSpace_max_acquired_col_no;
+    imageArray.kSpace_max_acquired_line_no = kSpace_max_acquired_line_no;
+    imageArray.kSpace_max_acquired_partition_no = kSpace_max_acquired_partition_no;
+
+    int par, eco, phs, rep, set, seg;
+
+    int PAR = matrix_size[4];
+    int ECO = matrix_size[5];
+    int PHS = matrix_size[6];
+    int REP = matrix_size[7];
+    int SET = matrix_size[8];
+    int SEG = matrix_size[9];
+
+    for ( seg=0; seg<SEG; seg++ )
+    {
+        for ( set=0; set<SET; set++ )
+        {
+            for ( rep=0; rep<REP; rep++ )
+            {
+                for ( phs=0; phs<PHS; phs++ )
+                {
+                    for ( eco=0; eco<ECO; eco++ )
+                    {
+                        for ( par=0; par<PAR; par++ )
+                        {
+                            int offset = this->get_offset(slc, par, eco, phs, rep, set, seg);
+                            int offsetSLC = imageArray.get_offset(0, par, eco, phs, rep, set, seg);
+
+                            imageArray.imageArray_[offsetSLC] = imageArray_[offset];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Copies all images of repetition rep into imageArray, which is resized to hold a single repetition.
+// Prints an error and leaves imageArray untouched when rep is out of range.
+void GadgetMessageImageArray::extractMessageImageArrayForREP(int rep, GadgetMessageImageArray& imageArray)
+{
+    if ( rep < 0 || rep >= matrix_size[7] )
+    {
+        std::cout << "extractMessageImageArrayForREP error - rep < 0 or rep >= matrix_size[7] " << std::endl;
+        return;
+    }
+
+    int aSize[10];
+    unsigned int ii;
+    for ( ii=0; ii<10; ii++ )
+    {
+        aSize[ii] = matrix_size[ii];
+    }
+    aSize[7] = 1; // the destination keeps every dimension except repetition, which collapses to one
+
+    imageArray.resize(aSize);
+
+    imageArray.kSpace_centre_col_no = kSpace_centre_col_no;
+    imageArray.kSpace_centre_line_no = kSpace_centre_line_no;
+    imageArray.kSpace_centre_partition_no = kSpace_centre_partition_no;
+    imageArray.kSpace_max_acquired_col_no = kSpace_max_acquired_col_no;
+    imageArray.kSpace_max_acquired_line_no = kSpace_max_acquired_line_no;
+    imageArray.kSpace_max_acquired_partition_no = kSpace_max_acquired_partition_no;
+
+    int par, eco, phs, slc, set, seg;
+
+    int SLC = matrix_size[3];
+    int PAR = matrix_size[4];
+    int ECO = matrix_size[5];
+    int PHS = matrix_size[6];
+    int SET = matrix_size[8];
+    int SEG = matrix_size[9];
+
+    for ( seg=0; seg<SEG; seg++ )
+    {
+        for ( set=0; set<SET; set++ )
+        {
+            for ( slc=0; slc<SLC; slc++ )
+            {
+                for ( phs=0; phs<PHS; phs++ )
+                {
+                    for ( eco=0; eco<ECO; eco++ )
+                    {
+                        for ( par=0; par<PAR; par++ )
+                        {
+                            int offset = this->get_offset(slc, par, eco, phs, rep, set, seg);
+                            int offsetREP = imageArray.get_offset(slc, par, eco, phs, 0, set, seg);
+
+                            imageArray.imageArray_[offsetREP] = imageArray_[offset];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void GadgetMessageImageArray::dump()
+{
+    unsigned int ii;
+    std::cout << "GadgetMessageImageArray" << std::endl;
+    std::cout << "==========================================================" << std::endl;
+    std::cout << "matrix_size           : ";
+    for ( ii=0; ii<10; ii++ )
+    {
+        std::cout << matrix_size[ii] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    std::cout << "kSpace_centre_col_no             : " << kSpace_centre_col_no << std::endl;
+    std::cout << "kSpace_max_acquired_col_no       : " << kSpace_max_acquired_col_no << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    std::cout << "kSpace_centre_line_no            : " << kSpace_centre_line_no << std::endl;
+    std::cout << "kSpace_max_acquired_line_no      : " << kSpace_max_acquired_line_no << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    std::cout << "kSpace_centre_partition_no       : " << kSpace_centre_partition_no << std::endl;
+    std::cout << "kSpace_max_acquired_partition_no : " << kSpace_max_acquired_partition_no << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    if ( imageArray_ )
+    {
+        int slc, par, eco, phs, rep, set, seg;
+        for ( seg=0; seg<matrix_size[9]; seg++ )
+        {
+            for ( set=0; set<matrix_size[8]; set++ )
+            {
+                for ( rep=0; rep<matrix_size[7]; rep++ )
+                {
+                    for ( phs=0; phs<matrix_size[6]; phs++ )
+                    {
+                        for ( eco=0; eco<matrix_size[5]; eco++ )
+                        {
+                            for ( par=0; par<matrix_size[4]; par++ )
+                            {
+                                for ( slc=0; slc<matrix_size[3]; slc++ )
+                                {
+                                    int offset = get_offset(slc, par, eco, phs, rep, set, seg);
+                                    std::cout << "[Slice Partition Echo Phase Rep Set Seg] = [" 
+                                                << " " << slc 
+                                                << " " << par 
+                                                << " " << eco 
+                                                << " " << phs 
+                                                << " " << rep 
+                                                << " " << set 
+                                                << " " << seg << "]" << std::endl;
+
+                                    imageArray_[offset].dump();
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+    std::cout << "==========================================================" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+KSpaceBuffer::KSpaceBuffer() 
+: isIPAT(false) 
+{
+
+}
+
+KSpaceBuffer::~KSpaceBuffer()
+{
+
+}
diff --git a/gadgets/gtPlus/GadgetMRIHeadersExt.h b/gadgets/gtPlus/GadgetMRIHeadersExt.h
new file mode 100644
index 0000000..cf66c55
--- /dev/null
+++ b/gadgets/gtPlus/GadgetMRIHeadersExt.h
@@ -0,0 +1,231 @@
+#ifndef GADGETMRIHEADERSEXT_H
+#define GADGETMRIHEADERSEXT_H
+
+#include "gadgetronMrRecon_export.h"
+#include "GadgetronMrReconCommon.h"
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd.h"
+#include "core/basic/Common.h"
+#include <algorithm/MrRecon/basic/MrReconNDArray.h>
+
+#include <vector>
+
+/** @name OS and compiler version */
+//@{
+#ifdef _WIN32
+    // assume the Microsoft Visual C++ compiler if on Windows
+    #define GADGETRON_FTK_VISUAL_CPP
+#elif defined WIN32
+    #define GADGETRON_FTK_VISUAL_CPP
+#elif defined WINDOWS
+    #define GADGETRON_FTK_VISUAL_CPP
+#else
+    // not the visual studio, maybe gcc
+    #define NOT_WIN32
+    #define GADGETRON_FTK_DEPRECATED
+#endif
+
+#ifdef GADGETRON_FTK_VISUAL_CPP
+    #if _MSC_VER >= 1300 // vc 7 or higher, only vc6 does not support template very well
+        #define GADGETRON_FTK_TEMPLATE_SUPPORT
+    #else
+        #ifndef GADGETRON_FTK_OLD_VC_FLAG
+            #define GADGETRON_FTK_OLD_VC_FLAG // vc 6 flag
+        #endif
+    #endif
+#elif defined NOT_WIN32 // gcc or others
+    #define GADGETRON_FTK_TEMPLATE_SUPPORT
+#endif
+
+// settings specific for microsoft compiler
+#ifdef GADGETRON_FTK_VISUAL_CPP
+    // disable warnings on 255 char debug symbols
+    #pragma warning (disable : 4786)
+
+    // disable warnings on exporting classes in DLL which has STL members
+    #pragma warning (disable : 4251)
+
+    // disable warnings on using 'this' in initializer list
+    #pragma warning (disable : 4355)
+
+    // disable warnings when specifying functions with a throw specifier
+    #pragma warning( disable : 4290 )
+
+    // disable warnings for implicit conversions
+    //#pragma warning( disable : 4244 )
+
+    // disable warnings for unknown pragma
+    #pragma warning( disable : 4068 )
+    
+    // disable warnings for unsafe functions
+    #pragma warning( disable : 4996 )
+
+    // disable warnings for warning C4275: non dll-interface class 
+    // 'std::_Complex_base<float>' used as base for dll-interface 
+    //class 'std::complex<float>'
+    #pragma warning( disable : 4275 )
+
+    /// disable warning for constant conditional expression
+    #pragma warning( disable : 4127)
+
+    /// disable warning for unreachable code
+    #pragma warning( disable : 4702)
+
+    /// 'identifier' : decorated name length exceeded, name was truncated
+    /// The decorated name was longer than the maximum the compiler allows (247), 
+    /// and was truncated. To avoid this warning and the truncation, reduce the number of arguments or name length of identifiers used.
+    #pragma warning( disable : 4503)
+
+    #pragma warning( disable : 4267)
+    #pragma warning( disable : 4244)
+    #pragma warning( disable : 4996)
+
+    // debug functionality
+    // #include <crtdbg.h>
+
+    // make code portable between VSS 6.0 and .NET
+    #if _MSC_VER >= 1300 // check for .NET
+    #define GADGETRON_FTK_DEPRECATED __declspec(deprecated)
+    #else
+    #define GADGETRON_FTK_DEPRECATED
+    #endif
+
+#endif
+//@}
+
+// -----------------------------------------------------------------
+// info zone
+
+enum PATRefScanMode
+{
+    PAT_REF_SCAN_UNDEFINED      = 0x01, // e.g. if no PAT is selected
+    PAT_REF_SCAN_INPLACE        = 0x02, // sequence supplies inplace reference lines
+    PAT_REF_SCAN_EXTRA          = 0x04, // sequence supplies extra reference lines
+    PAT_REF_SCAN_PRESCAN        = 0x08, // sequence does not supply reference lines, the data must have been acquired with a previous measurement
+    PAT_REF_SCAN_INTRINSIC_AVE  = 0x10, // The sequence contains intrinsic ref.lines due to sharing e.g. in the averages dimension
+    PAT_REF_SCAN_INTRINSIC_REP  = 0x20, // The sequence contains intrinsic ref.lines due to sharing e.g. in the repetition or phases dimension (i.e., TSENSE)
+    PAT_REF_SCAN_INTRINSIC_PHS  = 0x40, // The sequence contains intrinsic ref.lines due to sharing e.g. in the repetition or phases dimension (i.e., TSENSE)
+    PAT_REF_SCAN_INPLACE_LET    = 0x80  // A single (L)ong (E)cho (T)rain acquires reference lines and imaging lines
+};
+
+struct LoopCounters
+{
+    ACE_UINT16 line;
+    ACE_UINT16 acquisition;
+    ACE_UINT16 slice;
+    ACE_UINT16 partition;
+    ACE_UINT16 echo;
+    ACE_UINT16 phase;
+    ACE_UINT16 repetition;
+    ACE_UINT16 set;
+    ACE_UINT16 segment;
+    ACE_UINT16 channel;
+};
+
+#define MDH_FREEHDRPARA         4
+#define MDH_FREEHDRPARAOFFSET   4
+
+// aushIceProgramPara
+
+// in the user_int
+#define WIP_INDEX_TR                                MDH_FREEHDRPARAOFFSET+0
+#define WIP_INDEX_TE                                MDH_FREEHDRPARAOFFSET+1
+#define WIP_INDEX_FOV                               MDH_FREEHDRPARAOFFSET+2
+#define WIP_INDEX_SliceThickness                    MDH_FREEHDRPARAOFFSET+3
+
+// in the user_float
+#define WIP_INDEX_BaseResolution                    0
+#define WIP_INDEX_KernelSelection                   1
+#define WIP_INDEX_MoCoRecon                         2
+#define WIP_INDEX_AcceFactor                        3
+#define WIP_INDEX_NumRepForMoCo                     4
+#define WIP_INDEX_NumOfLine                         5
+
+// -----------------------------------------------------------------
+
+// [Col Line Cha Slice Partition Echo Phase Rep Set Seg]
+#ifdef VDimMrRecon
+    #undef VDimMrRecon
+#endif // VDimMrRecon
+#define  VDimMrRecon 10
+
+#ifdef BufferLengthMrRecon
+    #undef BufferLengthMrRecon
+#endif // BufferLengthMrRecon
+#define  BufferLengthMrRecon 2048
+
+struct  EXPORTGADGETSMRRECON GadgetMessageImageExt : public ISMRMRD::ImageHeader
+{
+    // fields added to store the time_stamp and pmu_time_stamp for every incoming read-out line
+    // if one line is not acquired, the corresponding time is -1
+    std::vector<int>     time_stamps;
+    std::vector<int>     pmu_time_stamps;
+
+    GadgetMessageImageExt();
+    ~GadgetMessageImageExt();
+
+    void copy(GadgetMessageImageExt& aMessageImage);
+    void set_matrix_size(unsigned int index, ACE_UINT16 size);
+    void dump();
+}; 
+
+// [Col Line Cha Slice Partition Echo Phase Rep Set Seg]
+//   0   1    2   3     4         5    6     7   8   9
+// store a scan with 10 dimensions
+struct  EXPORTGADGETSMRRECON GadgetMessageImageArray
+{
+    // size of the image array
+    ACE_UINT16 matrix_size[10];
+
+    // kspace center column number
+    ACE_UINT16 kSpace_centre_col_no;
+    // kspace max acquired col number
+    ACE_UINT16 kSpace_max_acquired_col_no;
+
+    // kspace center line number
+    ACE_UINT16 kSpace_centre_line_no;
+    // kspace max acquired line number
+    ACE_UINT16 kSpace_max_acquired_line_no;
+
+    // kspace center partition number
+    ACE_UINT16 kSpace_centre_partition_no;
+    // kspace max acquired partition number
+    ACE_UINT16 kSpace_max_acquired_partition_no;
+
+    // message information for every 2D image [Slice Partition Echo Phase Rep Set Seg]
+    GadgetMessageImageExt* imageArray_;
+
+    GadgetMessageImageArray();
+    GadgetMessageImageArray(int aSize[10]);
+    ~GadgetMessageImageArray();
+
+    void resize(int aSize[10]);
+    void copy(GadgetMessageImageArray& imageArray);
+    int get_offset(int slc, int par, int eco, int phs, int rep, int set, int seg);
+    void extractMessageImageArrayForSLC(int slc, GadgetMessageImageArray& imageArray);
+    void extractMessageImageArrayForREP(int rep, GadgetMessageImageArray& imageArray);
+
+    void dump();
+};
+
+struct EXPORTGADGETSMRRECON KSpaceBuffer
+{
+    typedef FTK_NAMESPACE_NAME::MrReconNDArray< std::complex<float>, VDimMrRecon > MrReconBufferType;
+
+    // kspace data
+    MrReconBufferType buffer_;
+
+    // reference ACS data
+    MrReconBufferType ref_;
+
+    // other data, e.g. AIF data
+    MrReconBufferType other_;
+
+    // whether it is ipat or pat with separate ref
+    bool isIPAT;
+
+    KSpaceBuffer();
+    ~KSpaceBuffer();
+};
+
+#endif  //GADGETMRIHEADERSEXT_H
diff --git a/gadgets/gtPlus/GadgetronMrReconCommon.h b/gadgets/gtPlus/GadgetronMrReconCommon.h
new file mode 100644
index 0000000..853d61f
--- /dev/null
+++ b/gadgets/gtPlus/GadgetronMrReconCommon.h
@@ -0,0 +1,90 @@
+#ifndef GADGETRONMRRECONCOMMON_H
+#define GADGETRONMRRECONCOMMON_H
+
+/** @name OS and compiler version */
+//@{
+#ifdef _WIN32
+    // assume the Microsoft Visual C++ compiler if on Windows
+    #define GADGETRON_FTK_VISUAL_CPP
+#elif defined WIN32
+    #define GADGETRON_FTK_VISUAL_CPP
+#elif defined WINDOWS
+    #define GADGETRON_FTK_VISUAL_CPP
+#else
+    // not the visual studio, maybe gcc
+    #define NOT_WIN32
+    #define GADGETRON_FTK_DEPRECATED
+#endif
+
+#ifdef GADGETRON_FTK_VISUAL_CPP
+    #if _MSC_VER >= 1300 // vc 7 or higher, only vc6 does not support template very well
+        #define GADGETRON_FTK_TEMPLATE_SUPPORT
+    #else
+        #ifndef GADGETRON_FTK_OLD_VC_FLAG
+            #define GADGETRON_FTK_OLD_VC_FLAG // vc 6 flag
+        #endif
+    #endif
+#elif defined NOT_WIN32 // gcc or others
+    #define GADGETRON_FTK_TEMPLATE_SUPPORT
+#endif
+
+// settings specific for microsoft compiler
+#ifdef GADGETRON_FTK_VISUAL_CPP
+    // disable warnings on 255 char debug symbols
+    #pragma warning (disable : 4786)
+
+    // disable warnings on exporting classes in DLL which has STL members
+    #pragma warning (disable : 4251)
+
+    // disable warnings on using 'this' in initializer list
+    #pragma warning (disable : 4355)
+
+    // disable warnings when specifying functions with a throw specifier
+    #pragma warning( disable : 4290 )
+
+    // disable warnings for implicit conversions
+    //#pragma warning( disable : 4244 )
+
+    // disable warnings for unknown pragma
+    #pragma warning( disable : 4068 )
+    
+    // disable warnings for unsafe functions
+    #pragma warning( disable : 4996 )
+
+    // disable warnings for warning C4275: non dll-interface class 
+    // 'std::_Complex_base<float>' used as base for dll-interface 
+    //class 'std::complex<float>'
+    #pragma warning( disable : 4275 )
+
+    /// disable warning for constant conditional expression
+    #pragma warning( disable : 4127)
+
+    /// disable warning for unreachable code
+    #pragma warning( disable : 4702)
+
+    /// 'identifier' : decorated name length exceeded, name was truncated
+    /// The decorated name was longer than the maximum the compiler allows (247), 
+    /// and was truncated. To avoid this warning and the truncation, reduce the number of arguments or name length of identifiers used.
+    #pragma warning( disable : 4503)
+
+    #pragma warning( disable : 4267)
+    #pragma warning( disable : 4244)
+    #pragma warning( disable : 4996)
+
+    // warning C4305: 'argument' : truncation
+    #pragma warning( disable : 4305)
+
+    // debug functionality
+    // #include <crtdbg.h>
+
+    // make code portable between VSS 6.0 and .NET
+    #if _MSC_VER >= 1300 // check for .NET
+    #define GADGETRON_FTK_DEPRECATED __declspec(deprecated)
+    #else
+    #define GADGETRON_FTK_DEPRECATED
+    #endif
+
+#endif
+//@}
+
+#endif  // GADGETRONMRRECONCOMMON_H
diff --git a/gadgets/gtPlus/GtPlusAccumulatorGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorGadget.cpp
new file mode 100644
index 0000000..2b0c813
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorGadget.cpp
@@ -0,0 +1,1168 @@
+#include "GtPlusAccumulatorGadget.h"
+
+namespace Gadgetron
+{
+
+// --------------------------------------------------------------------
+
+GadgetMessageImageExt::GadgetMessageImageExt() : ISMRMRD::ImageHeader()
+{
+    time_stamps.clear();
+    pmu_time_stamps.clear();
+}
+
+GadgetMessageImageExt::~GadgetMessageImageExt() { }
+
+// Set image matrix dimension `index` (0..2) to `size`; larger indices are ignored.
+// Setting dimension 1 also rebuilds both time-stamp vectors with matrix_size[1] entries of -1.
+void GadgetMessageImageExt::set_matrix_size(unsigned int index, ACE_UINT16 size)
+{
+    if ( index >= 3 )
+    {
+        return;
+    }
+
+    matrix_size[index] = size;
+    if ( index != 1 ) return;
+
+    time_stamps.assign(matrix_size[1], -1);
+    pmu_time_stamps.assign(matrix_size[1], -1);
+}
+
+// Copy a fixed set of image header fields and both time-stamp vectors from
+// aMessageImage into this object.
+// aMessageImage is only read.
+// Members that are not assigned below keep their current values.
+void GadgetMessageImageExt::copy(GadgetMessageImageExt& aMessageImage)
+{
+    const GadgetMessageImageExt& src = aMessageImage;
+
+    // scalar header fields
+    flags = src.flags;
+    channels = src.channels;
+    acquisition_time_stamp = src.acquisition_time_stamp;
+    image_data_type = src.image_data_type;
+    image_type = src.image_type;
+    image_index = src.image_index;
+    image_series_index = src.image_series_index;
+
+    // matrix size, entries 0..2
+    for ( unsigned int d=0; d<3; d++ )
+    {
+        matrix_size[d] = src.matrix_size[d];
+    }
+
+    // geometry: position, the three direction vectors and the table position
+    for ( unsigned int d=0; d<3; d++ )
+    {
+        position[d] = src.position[d];
+        read_dir[d] = src.read_dir[d];
+        phase_dir[d] = src.phase_dir[d];
+        slice_dir[d] = src.slice_dir[d];
+        patient_table_position[d] = src.patient_table_position[d];
+    }
+
+    // physiology time stamps, entries 0..2 only
+    for ( unsigned int d=0; d<3; d++ )
+    {
+        physiology_time_stamp[d] = src.physiology_time_stamp[d];
+    }
+
+    // user-defined integer and float parameters, copied as raw memory
+    memcpy(user_int, src.user_int, sizeof(int32_t)*ISMRMRD_USER_INTS);
+    memcpy(user_float, src.user_float, sizeof(float)*ISMRMRD_USER_FLOATS);
+
+    // per-line time stamps
+    time_stamps = src.time_stamps;
+    pmu_time_stamps = src.pmu_time_stamps;
+}
+
+void GadgetMessageImageExt::dump()
+{
+    std::cout << "GadgetMessageImageExt" << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    //dumpInfo();
+    std::cout << "----------------------------------------------------------" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+// Empty array: no images, all ten dimensions zero (copy() reads the source's matrix_size).
+GadgetMessageImageArray::GadgetMessageImageArray() : imageArray_(0)
+{
+    for ( unsigned int ii=0; ii<10; ii++ ) matrix_size[ii] = 0;
+}
+
+// Build an array with the ten dimensions given in aSize and allocate one image header
+// per combination of dimensions 3..9. If that product is zero, or if allocation throws,
+// the array holds no images.
+GadgetMessageImageArray::GadgetMessageImageArray(int aSize[10])
+:   imageArray_(0) // must start null: the destructor deletes whatever this pointer holds
+{
+    try
+    {
+        unsigned int len = 1;
+        for ( unsigned int ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+            if ( ii >= 3 ) len *= matrix_size[ii];
+        }
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GadgetMessageImageExt[len];
+        }
+    }
+    catch(...)
+    {
+        // allocation failure is only reported; imageArray_ stays null
+        std::cout << "Failed in allocate imageArray_" << std::endl;
+    }
+}
+
+GadgetMessageImageArray::~GadgetMessageImageArray()
+{
+    if (imageArray_)
+    {
+        delete [] imageArray_;
+    }
+}
+
+// Take over the ten dimensions in aSize, free the current images and allocate a fresh,
+// default-constructed header for every combination of dimensions 3..9.
+// Existing image headers are discarded, not carried over.
+void GadgetMessageImageArray::resize(int aSize[10])
+{
+    try
+    {
+        unsigned int numImages = 1;
+        for ( unsigned int dim=0; dim<10; dim++ )
+        {
+            matrix_size[dim] = aSize[dim];
+            if ( dim >= 3 ) numImages *= matrix_size[dim];
+        }
+
+        // drop the old storage first; the pointer is nulled so a failed or skipped
+        // allocation below cannot leave it dangling
+        if ( imageArray_ != NULL )
+        {
+            delete [] imageArray_;
+            imageArray_ = NULL;
+        }
+
+        if ( numImages != 0 )
+        {
+            imageArray_ = new GadgetMessageImageExt[numImages];
+        }
+    }
+    catch(...)
+    {
+        std::cout << "Failed in resize GadgetMessageImageArray " << std::endl;
+    }
+}
+
+void GadgetMessageImageArray::copy(GadgetMessageImageArray& imageArray)
+{
+    // Self-copy would free the very array we are about to read from.
+    if ( this == &imageArray ) return;
+
+    // Release the old storage and null the pointer so that a zero-length source
+    // does not leave a dangling pointer for the destructor to delete again.
+    delete [] imageArray_;
+    imageArray_ = NULL;
+
+    unsigned int len = 1;
+    for ( unsigned int ii=0; ii<10; ii++ )
+    {
+        matrix_size[ii] = imageArray.matrix_size[ii];
+        if ( ii >= 3 ) len *= matrix_size[ii];
+    }
+
+    // Nothing to copy when the array is empty or the source holds no images.
+    if ( len == 0 || !imageArray.imageArray_ ) return;
+
+    imageArray_ = new GadgetMessageImageExt[len];
+    for ( unsigned int i=0; i<len; i++ )
+    {
+        imageArray_[i] = imageArray.imageArray_[i];
+    }
+}
+
+// Linear index into imageArray_ for one 2D image (slc fastest, seg slowest), nested form.
+int GadgetMessageImageArray::get_offset(int slc, int e2, int con, int phs, int rep, int set, int seg)
+{
+    int offset = seg;
+    offset = offset*matrix_size[8] + set;
+    offset = offset*matrix_size[7] + rep;
+    offset = offset*matrix_size[6] + phs;
+    offset = offset*matrix_size[5] + con;
+    offset = offset*matrix_size[4] + e2;
+    return offset*matrix_size[3] + slc;
+}
+
+// Copy the image headers of slice slc into imageArray, which is first resized to this
+// array's dimensions with the slice dimension (index 3) set to 1.
+void GadgetMessageImageArray::extractMessageImageArrayForSLC(int slc, GadgetMessageImageArray& imageArray)
+{
+    // a negative slc would pass the upper-bound test alone and index before the array
+    if ( slc < 0 || slc >= matrix_size[3] )
+    {
+        std::cout << "extractMessageImageArrayForSLC error - slc >= matrix_size[3] " << std::endl;
+        return;
+    }
+
+    // resizing the target would free the source when both are the same object
+    if ( !imageArray_ || &imageArray == this )
+    {
+        std::cout << "extractMessageImageArrayForSLC error - no source images or target is the source " << std::endl;
+        return;
+    }
+
+    int aSize[10];
+    for ( unsigned int ii=0; ii<10; ii++ ) aSize[ii] = matrix_size[ii];
+    aSize[3] = 1;
+
+    // resize() only prints on allocation failure, so check that the target has storage
+    imageArray.resize(aSize);
+    if ( !imageArray.imageArray_ ) return;
+
+    const int E2 = matrix_size[4], CON = matrix_size[5], PHS = matrix_size[6];
+    const int REP = matrix_size[7], SET = matrix_size[8], SEG = matrix_size[9];
+
+    for ( int seg=0; seg<SEG; seg++ )
+    {
+        for ( int set=0; set<SET; set++ )
+        {
+            for ( int rep=0; rep<REP; rep++ )
+            {
+                for ( int phs=0; phs<PHS; phs++ )
+                {
+                    for ( int con=0; con<CON; con++ )
+                    {
+                        for ( int e2=0; e2<E2; e2++ )
+                        {
+                            int offset = this->get_offset(slc, e2, con, phs, rep, set, seg);
+                            int offsetSLC = imageArray.get_offset(0, e2, con, phs, rep, set, seg);
+
+                            imageArray.imageArray_[offsetSLC] = imageArray_[offset];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Copy the image headers of repetition rep into imageArray, which is first resized to
+// this array's dimensions with the repetition dimension (index 7) set to 1.
+void GadgetMessageImageArray::extractMessageImageArrayForREP(int rep, GadgetMessageImageArray& imageArray)
+{
+    // a negative rep would pass the upper-bound test alone and index before the array
+    if ( rep < 0 || rep >= matrix_size[7] )
+    {
+        std::cout << "extractMessageImageArrayForREP error - rep >= matrix_size[7] " << std::endl;
+        return;
+    }
+
+    // resizing the target would free the source when both are the same object
+    if ( !imageArray_ || &imageArray == this )
+    {
+        std::cout << "extractMessageImageArrayForREP error - no source images or target is the source " << std::endl;
+        return;
+    }
+
+    int aSize[10];
+    for ( unsigned int ii=0; ii<10; ii++ ) aSize[ii] = matrix_size[ii];
+    aSize[7] = 1;
+
+    // resize() only prints on allocation failure, so check that the target has storage
+    imageArray.resize(aSize);
+    if ( !imageArray.imageArray_ ) return;
+
+    const int SLC = matrix_size[3], E2 = matrix_size[4], CON = matrix_size[5];
+    const int PHS = matrix_size[6], SET = matrix_size[8], SEG = matrix_size[9];
+
+    for ( int seg=0; seg<SEG; seg++ )
+    {
+        for ( int set=0; set<SET; set++ )
+        {
+            for ( int slc=0; slc<SLC; slc++ )
+            {
+                for ( int phs=0; phs<PHS; phs++ )
+                {
+                    for ( int con=0; con<CON; con++ )
+                    {
+                        for ( int e2=0; e2<E2; e2++ )
+                        {
+                            int offset = this->get_offset(slc, e2, con, phs, rep, set, seg);
+                            int offsetREP = imageArray.get_offset(slc, e2, con, phs, 0, set, seg);
+
+                            imageArray.imageArray_[offsetREP] = imageArray_[offset];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void GadgetMessageImageArray::dump()
+{
+    unsigned int ii;
+    std::cout << "GadgetMessageImageArray" << std::endl;
+    std::cout << "==========================================================" << std::endl;
+    std::cout << "matrix_size           : ";
+    for ( ii=0; ii<10; ii++ )
+    {
+        std::cout << matrix_size[ii] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    if ( imageArray_ )
+    {
+        int slc, e2, con, phs, rep, set, seg;
+        for ( seg=0; seg<matrix_size[9]; seg++ )
+        {
+            for ( set=0; set<matrix_size[8]; set++ )
+            {
+                for ( rep=0; rep<matrix_size[7]; rep++ )
+                {
+                    for ( phs=0; phs<matrix_size[6]; phs++ )
+                    {
+                        for ( con=0; con<matrix_size[5]; con++ )
+                        {
+                            for ( e2=0; e2<matrix_size[4]; e2++ )
+                            {
+                                for ( slc=0; slc<matrix_size[3]; slc++ )
+                                {
+                                    int offset = get_offset(slc, e2, con, phs, rep, set, seg);
+                                    std::cout << "[Slice E2 Contrast Phase Rep Set Seg] = [" 
+                                                << " " << slc 
+                                                << " " << e2 
+                                                << " " << con 
+                                                << " " << phs 
+                                                << " " << rep 
+                                                << " " << set 
+                                                << " " << seg << "]" << std::endl;
+
+                                    imageArray_[offset].dump();
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+    std::cout << "==========================================================" << std::endl;
+}
+
+// --------------------------------------------------------------------
+
+KSpaceBuffer::KSpaceBuffer() 
+{
+
+}
+
+KSpaceBuffer::~KSpaceBuffer()
+{
+
+}
+
+// --------------------------------------------------------------------
+
+GtPlusAccumulatorGadget::GtPlusAccumulatorGadget()
+    : messageImage_(0)
+    , kspaceBuffer_(0)
+    , image_counter_(0)
+    , image_series_(0)
+    , triggered_(false)
+{
+
+}
+
+GtPlusAccumulatorGadget::~GtPlusAccumulatorGadget()
+{
+    if (messageImage_) delete messageImage_;
+    if (kspaceBuffer_) delete kspaceBuffer_;
+}
+
+// maximum-1, but never below zero; computed on a signed value so that the clamp still
+// works when the result is stored in an unsigned counter
+static int oneBelowOrZero(int maximum)
+{
+    return ( maximum > 0 ) ? maximum-1 : 0;
+}
+
+// Read the acquisition setup from the ISMRMRD XML header carried in mb: matrix sizes,
+// field of view, encoding limits, acceleration factors and calibration mode.
+// Returns GADGET_OK on success, or GADGET_FAIL when the header does not contain exactly
+// one encoding space, has no kspace_encoding_step_1 limits, or has no parallel imaging
+// section / calibration mode.
+int GtPlusAccumulatorGadget::process_config(ACE_Message_Block* mb)
+{
+    // allocate the kspace buffer (plain new throws on failure, it does not return null)
+    if ( kspaceBuffer_ == NULL )
+    {
+        kspaceBuffer_ = new KSpaceBuffer;
+    }
+
+    // image series
+    image_series_ = this->get_int_value("image_series");
+
+    // pass the xml file
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    // seq object
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1)
+    {
+        // size() is not an int; cast it to match the %d conversion
+        GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size()));
+        GADGET_DEBUG1("This simple GtPlusAccumulatorGadget only supports one encoding space\n");
+        return GADGET_FAIL;
+    }
+
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    GADGET_MSG("Matrix size: " << e_space.matrixSize().x() << " " << e_space.matrixSize().y() << " " << e_space.matrixSize().z());
+    GADGET_MSG("Recon size: " << r_space.matrixSize().x() << " " << r_space.matrixSize().y() << " " << r_space.matrixSize().z());
+
+    meas_max_ro_ = e_space.matrixSize().x()/2;
+
+    field_of_view_[0] = r_space.fieldOfView_mm().x();
+    field_of_view_[1] = r_space.fieldOfView_mm().y();
+    field_of_view_[2] = r_space.fieldOfView_mm().z();
+    GADGET_MSG("field_of_view_ is " << field_of_view_[0] << " " << field_of_view_[1] << " " << field_of_view_[2]);
+
+    // the first phase encoding limits are mandatory
+    if ( !e_limits.kspace_encoding_step_1().present() )
+    {
+        meas_max_idx_.kspace_encode_step_1 = 0;
+        std::cout << "Setting number of kspace_encode_step_1 to 0" << std::endl;
+        return GADGET_FAIL;
+    }
+    meas_max_idx_.kspace_encode_step_1 = e_limits.kspace_encoding_step_1().get().maximum();
+
+    // the second phase encoding limits are optional and must not be read when absent;
+    // without them the centre is 0 and the maximum index is 0
+    const bool hasEncode2 = e_limits.kspace_encoding_step_2().present();
+    meas_max_idx_.kspace_encode_step_2 = hasEncode2 ? e_limits.kspace_encoding_step_2().get().maximum() : 0;
+
+    kspaceBuffer_->kSpaceCentreEncode1_ = e_limits.kspace_encoding_step_1().get().center();
+    GADGET_MSG("kSpaceCentreEncode1_ is " << kspaceBuffer_->kSpaceCentreEncode1_);
+
+    kspaceBuffer_->kSpaceCentreEncode2_ = hasEncode2 ? e_limits.kspace_encoding_step_2().get().center() : 0;
+    GADGET_MSG("kSpaceCentreEncode2_ is " << kspaceBuffer_->kSpaceCentreEncode2_);
+
+    kspaceBuffer_->kSpaceMaxEncode1_ = e_limits.kspace_encoding_step_1().get().maximum()+1;
+    GADGET_MSG("kSpaceMaxEncode1_ is " << kspaceBuffer_->kSpaceMaxEncode1_);
+
+    kspaceBuffer_->kSpaceMaxEncode2_ = (hasEncode2 ? e_limits.kspace_encoding_step_2().get().maximum() : 0)+1;
+    GADGET_MSG("kSpaceMaxEncode2_ is " << kspaceBuffer_->kSpaceMaxEncode2_);
+
+    if (e_limits.set().present())
+    {
+        meas_max_idx_.set = oneBelowOrZero(e_limits.set().get().maximum());
+    }
+    else
+    {
+        meas_max_idx_.set = 0;
+    }
+
+    if (e_limits.phase().present())
+    {
+        meas_max_idx_.phase = oneBelowOrZero(e_limits.phase().get().maximum());
+    }
+    else
+    {
+        meas_max_idx_.phase = 0;
+    }
+
+    if (e_limits.contrast().present())
+    {
+        meas_max_idx_.contrast = oneBelowOrZero(e_limits.contrast().get().maximum());
+    }
+    else
+    {
+        meas_max_idx_.contrast = 0;
+    }
+
+    if (e_limits.slice().present())
+    {
+        meas_max_idx_.slice = e_limits.slice().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.slice = 0;
+    }
+
+    if (e_limits.repetition().present())
+    {
+        meas_max_idx_.repetition = e_limits.repetition().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.repetition = 0;
+    }
+
+    // the segment limit is not used: the maximum segment index is 0 whether or not it is present
+    meas_max_idx_.segment = 0;
+
+    // find out the PAT mode
+    ISMRMRD::ismrmrdHeader::parallelImaging_optional p_imaging_type = cfg->parallelImaging();
+    if ( !p_imaging_type.present() )
+    {
+        GADGET_DEBUG1("Parallel imaging section not found in the XML header\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::parallelImagingType p_imaging = *p_imaging_type;
+
+    kspaceBuffer_->AccelFactE1_ = (unsigned int)(p_imaging.accelerationFactor().kspace_encoding_step_1());
+    kspaceBuffer_->AccelFactE2_ = (unsigned int)(p_imaging.accelerationFactor().kspace_encoding_step_2());
+    GADGET_MSG("AccelFactE1 is " << kspaceBuffer_->AccelFactE1_);
+    GADGET_MSG("AccelFactE2 is " << kspaceBuffer_->AccelFactE2_);
+
+    // the calibration mode is read through an optional, so check it before dereferencing
+    if ( !p_imaging.calibrationMode().present() )
+    {
+        GADGET_DEBUG1("Calibration mode not found in the XML header\n");
+        return GADGET_FAIL;
+    }
+    ISMRMRD::calibrationModeType calib = *(p_imaging.calibrationMode());
+    kspaceBuffer_->CalibMode_ = calib;
+
+    // find out the calibration mode
+    if ( kspaceBuffer_->CalibMode_ == ISMRMRD::calibrationModeType::separate )
+    {
+        GADGET_MSG("Calibration mode is separate");
+    }
+
+    if ( kspaceBuffer_->CalibMode_ == ISMRMRD::calibrationModeType::embedded )
+    {
+        GADGET_MSG("Calibration mode is embedded");
+    }
+
+    if ( kspaceBuffer_->CalibMode_ == ISMRMRD::calibrationModeType::interleaved )
+    {
+        GADGET_MSG("Calibration mode is interleaved");
+
+        if ( p_imaging.interleavingDimension().present() )
+        {
+            kspaceBuffer_->InterleaveDim_ = *(p_imaging.interleavingDimension());
+            GADGET_MSG("InterleaveDim is " << kspaceBuffer_->InterleaveDim_);
+        }
+    }
+
+    return GADGET_OK;
+}
+
+// Route one incoming read-out to the kspace buffer and/or the ref, phase-correction, noise
+// and "other" lists, as classified by checkStatus().
+//   m1 : acquisition header of the read-out
+//   m2 : samples of the read-out
+// Each list receives its own ReadOutBuffer item built from the header, the samples and the
+// reflect flag. Returns GADGET_OK after releasing m1; returns GADGET_FAIL, without releasing
+// m1 here, when the status check or the kspace store fails.
+int GtPlusAccumulatorGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    // logic to control whether to store kspace and ref data
+    bool bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther;
+    if ( !checkStatus(m1->getObjectPtr()->flags, m1->getObjectPtr()->number_of_samples, bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther) )
+    {
+        GADGET_DEBUG1("Failed check readout status\n");
+        return GADGET_FAIL;
+    }
+
+    // store kspace read out
+    if ( bIsKSpace )
+    {
+        if ( !storeImageData(m1, m2, bIsReflect) )
+        {
+            GADGET_DEBUG1("Failed to store image data\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // store ref read out
+    if ( bIsRef )
+    {
+        ReadOutBuffer item;
+        item.acqHead_ = *(m1->getObjectPtr());
+        item.data_ = *(m2->getObjectPtr());
+        item.isReflect_ = bIsReflect;
+
+        refBuffer_.push_back(item);
+    }
+
+    // store phaseCorr read out
+    if ( bIsPhaseCorr )
+    {
+        ReadOutBuffer item;
+        item.acqHead_ = *(m1->getObjectPtr());
+        item.data_ = *(m2->getObjectPtr());
+        item.isReflect_ = bIsReflect;
+
+        phaseCorrBuffer_.push_back(item);
+    }
+
+    // store noise read out
+    if ( bIsNoise )
+    {
+        ReadOutBuffer item;
+        item.acqHead_ = *(m1->getObjectPtr());
+        item.data_ = *(m2->getObjectPtr());
+        item.isReflect_ = bIsReflect;
+
+        noiseBuffer_.push_back(item);
+    }
+
+    // store other read out
+    if ( bIsOther )
+    {
+        ReadOutBuffer item;
+        item.acqHead_ = *(m1->getObjectPtr());
+        item.data_ = *(m2->getObjectPtr());
+        item.isReflect_ = bIsReflect;
+
+        otherBuffer_.push_back(item);
+    }
+
+    m1->release();
+    return GADGET_OK;
+}
+
+// Decide from the acquisition flags and the sample count where a read-out belongs.
+//   flag    : ISMRMRD acquisition flag bits of the read-out
+//   samples : number of samples of the read-out
+// All six bool outputs are overwritten. Noise and phase-correction read-outs are reported
+// through bIsNoise / bIsPhaseCorr only; the rest is classified by calibration mode, or as
+// "other" when no calibration flag is set and samples != meas_max_ro_. Always returns true.
+bool GtPlusAccumulatorGadget::checkStatus(uint64_t flag, int samples, bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther)
+{
+    const bool is_ref = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_PARALLEL_CALIBRATION).isSet(flag);
+    const bool is_ref_kspace = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_PARALLEL_CALIBRATION_AND_IMAGING).isSet(flag);
+
+    bIsNoise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(flag);
+    bIsPhaseCorr = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_PHASECORR_DATA).isSet(flag);
+    bIsReflect = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_REVERSE).isSet(flag);
+
+    bIsKSpace = false;
+    bIsRef = false;
+    bIsOther = false;
+
+    if ( bIsNoise || bIsPhaseCorr )
+    {
+        return true;
+    }
+
+    // in interleaved mode, only store the image data
+    if ( kspaceBuffer_->CalibMode_==ISMRMRD::calibrationModeType::interleaved )
+    {
+        bIsKSpace = true;
+        bIsRef = false;
+    }
+
+    // in embedded, kspace stores only the undersampled lines
+    // ref stores all lines used for references
+    if ( kspaceBuffer_->CalibMode_==ISMRMRD::calibrationModeType::embedded )
+    {
+        if ( is_ref_kspace )
+        {
+            // flagged for calibration and imaging: goes to both stores
+            bIsKSpace = true;
+            bIsRef = true;
+        }
+        else if ( is_ref )
+        {
+            // flagged for calibration only
+            bIsKSpace = false;
+            bIsRef = true;
+        }
+        else
+        {
+            // neither calibration flag set: imaging only
+            bIsKSpace = true;
+            bIsRef = false;
+        }
+    }
+
+    // in separate mode
+    if ( kspaceBuffer_->CalibMode_==ISMRMRD::calibrationModeType::separate || kspaceBuffer_->CalibMode_==ISMRMRD::calibrationModeType::external )
+    {
+        if ( is_ref )
+        {
+            bIsKSpace = false;
+            bIsRef = true;
+        }
+        else
+        {
+            bIsKSpace = true;
+            bIsRef = false;
+        }
+    }
+
+    // store other data, e.g. AIF
+    // only for tpat
+    if ( !(is_ref || is_ref_kspace) && (samples != meas_max_ro_) )
+    {
+        bIsOther = true;
+        bIsKSpace = false;
+        bIsRef = false;
+    }
+
+    return true;
+}
+
+// Copy one imaging readout (all channels) into the kspace buffer.
+//
+// On the first call (kspace buffer still empty) the 10D buffer
+// [RO E1 Cha Slice E2 Con Phase Rep Set Seg], the matching reflect buffer and
+// the per-image message array are allocated from the measurement limits.
+//
+// m1        : acquisition header of the readout
+// m2        : readout samples, stored channel after channel
+// isReflect : value written to the reflect buffer for this readout line
+// Returns false if the allocation fails, if the readout does not fit into the
+// allocated buffer, or if fillImageInfo() fails; returns true otherwise.
+bool GtPlusAccumulatorGadget::storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect)
+{
+    try
+    {
+        unsigned int ii;
+        int samples =  m1->getObjectPtr()->number_of_samples;
+        ISMRMRD::EncodingCounters idx = m1->getObjectPtr()->idx;
+
+        if ( kspaceBuffer_->buffer_.get_number_of_elements() <= 0 )
+        {
+            meas_max_channel_ = m1->getObjectPtr()->active_channels;
+
+            int E1 = 2*kspaceBuffer_->kSpaceCentreEncode1_;
+            int E2 = 2*kspaceBuffer_->kSpaceCentreEncode2_;
+
+            // find the loop counter boundary and allocate the buffer
+            GADGET_MSG("[RO E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+                               << meas_max_ro_ 
+                               << " " << E1 
+                               << " " << meas_max_channel_ 
+                               << " " << meas_max_idx_.slice+1 
+                               << " " << E2 
+                               << " " << meas_max_idx_.contrast+1 
+                               << " " << meas_max_idx_.phase+1 
+                               << " " << meas_max_idx_.repetition+1 
+                               << " " << meas_max_idx_.set+1 
+                               << " " << meas_max_idx_.segment+1 << "]");
+
+            // start from a clean list, so that a retry after a failed allocation
+            // does not append a second set of dimensions
+            dimensions_.clear();
+            dimensions_.push_back(meas_max_ro_);
+            dimensions_.push_back(E1);
+            dimensions_.push_back(meas_max_channel_);
+            dimensions_.push_back(meas_max_idx_.slice+1);
+            dimensions_.push_back(E2);
+            dimensions_.push_back(meas_max_idx_.contrast+1);
+            dimensions_.push_back(meas_max_idx_.phase+1);
+            dimensions_.push_back(meas_max_idx_.repetition+1);
+            dimensions_.push_back(meas_max_idx_.set+1);
+            dimensions_.push_back(meas_max_idx_.segment+1);
+
+            unsigned int N = dimensions_.size();
+            for ( ii=0; ii<N; ii++ )
+            {
+                GADGET_MSG("dimensions_[" << ii << "] = " << dimensions_[ii]);
+            }
+
+            // allocate data buffer
+            try
+            {
+                kspaceBuffer_->buffer_.create(&dimensions_);
+
+                // one reflect flag per readout line : RO and Cha dimensions collapse to 1
+                std::vector<unsigned int> reflect_dimensions_(dimensions_);
+                reflect_dimensions_[0] = 1;
+                reflect_dimensions_[2] = 1;
+                kspaceBuffer_->reflect_.create(&reflect_dimensions_);
+            }
+            catch(...)
+            {
+                GADGET_DEBUG1("Failed create buffer\n");
+                return false;
+            }
+
+            // allocate message buffer
+            int matrix_size[10];
+            for ( ii=0; ii<10; ii++ )
+            {
+                matrix_size[ii] = dimensions_[ii];
+            }
+
+            if (!(messageImage_ = new GadgetMessageImageArray(matrix_size))) 
+            {
+                GADGET_DEBUG1("Failed create buffer\n");
+                return false;
+            }
+        }
+
+        // everything below indexes dimensions_[0..9]
+        if ( dimensions_.size() < 10 )
+        {
+            GADGET_DEBUG1("Buffer dimensions are not initialized\n");
+            return false;
+        }
+
+        if (samples != static_cast<int>(dimensions_[0])) 
+        {
+            GADGET_DEBUG1("Wrong number of samples received\n");
+            return false;
+        }
+
+        const unsigned int numOfChannels = m1->getObjectPtr()->active_channels;
+
+        // position of this readout line; the channel index pos[2] is set per channel below
+        std::vector<unsigned int> pos(10);
+        pos[0] = 0;
+        pos[1] = idx.kspace_encode_step_1;
+        pos[2] = 0;
+        pos[3] = idx.slice;
+        pos[4] = idx.kspace_encode_step_2;
+        pos[5] = idx.contrast;
+        pos[6] = idx.phase;
+        pos[7] = idx.repetition;
+        pos[8] = idx.set;
+        pos[9] = idx.segment;
+
+        // the destination of the raw memcpy below is computed from these indices,
+        // so reject a readout that lies outside the allocated buffer
+        bool inRange = ( numOfChannels <= dimensions_[2] );
+        for ( ii=1; ii<10 && inRange; ii++ )
+        {
+            if ( ii != 2 && pos[ii] >= dimensions_[ii] )
+            {
+                inRange = false;
+            }
+        }
+
+        if ( !inRange )
+        {
+            GADGET_DEBUG1("Readout index is out of the range of the allocated buffer\n");
+            return false;
+        }
+
+        // the source array must hold 'samples' points for every channel
+        const unsigned long numOfElementsNeeded = static_cast<unsigned long>(numOfChannels) * static_cast<unsigned long>(samples);
+        if ( m2->getObjectPtr()->get_number_of_elements() < numOfElementsNeeded )
+        {
+            GADGET_DEBUG1("Readout data has fewer elements than channels x samples\n");
+            return false;
+        }
+
+        std::complex<float>* b = kspaceBuffer_->buffer_.begin();
+        std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+
+        //Copy the data for all the channels
+        for ( unsigned int c=0; c<numOfChannels; c++ )
+        {
+            pos[2] = c;
+            int offsetBuffer = kspaceBuffer_->buffer_.calculate_offset(pos);
+
+            memcpy(b+offsetBuffer, d+c*samples, sizeof(std::complex<float>)*samples);
+        }
+
+        // the reflect flag is stored once per readout line (channel index 0),
+        // and only when at least one channel was copied
+        if ( numOfChannels > 0 )
+        {
+            pos[2] = 0;
+            int offsetReflect = kspaceBuffer_->reflect_.calculate_offset(pos);
+            kspaceBuffer_->reflect_.at(offsetReflect) = isReflect;
+        }
+
+        if ( !fillImageInfo(m1, messageImage_, m1->getObjectPtr()->idx) )
+        {
+            GADGET_DEBUG1("Failed in fillImageInfo(m1, messageImage_, m1->getObjectPtr()->idx)\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorGadget::storeImageData(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+// Pack a list of dynamically buffered readouts into one 10D array.
+//
+// readOutBuffer : buffered readouts (header, samples, reflect flag)
+// buf           : output data array [RO E1 Cha Slice E2 Contrast Phase Rep Set Seg],
+//                 sized from the largest counters found in readOutBuffer
+// reflectBuf    : output reflect flags, same dimensions as buf except RO=1 and Cha=1
+// Returns false if an array cannot be created or any exception is caught.
+bool GtPlusAccumulatorGadget::
+fillBuffer(ReadOutBufferType& readOutBuffer, BufferType& buf, ReflectBufferType& reflectBuf)
+{
+    try
+    {
+        const unsigned int numOfReadOuts = readOutBuffer.size();
+
+        // pass 1 : find the maximal dimension of all buffered ICE readout
+        ISMRMRD::EncodingCounters max_idx;
+        max_idx.kspace_encode_step_1 = 0;
+        max_idx.average = 0;
+        max_idx.slice = 0;
+        max_idx.kspace_encode_step_2 = 0;
+        max_idx.contrast = 0;
+        max_idx.phase = 0;
+        max_idx.repetition = 0;
+        max_idx.set = 0;
+        max_idx.segment = 0;
+        int max_channel = 0;
+        int max_col = 0;
+
+        unsigned int n;
+        for ( n=0; n<numOfReadOuts; n++ )
+        {
+            const ISMRMRD::AcquisitionHeader& head = readOutBuffer[n].acqHead_;
+            const ISMRMRD::EncodingCounters& cur = head.idx;
+
+            if ( max_col < head.number_of_samples ) { max_col = head.number_of_samples; }
+            if ( max_channel < head.active_channels ) { max_channel = head.active_channels; }
+
+            if ( max_idx.kspace_encode_step_1 < cur.kspace_encode_step_1 ) { max_idx.kspace_encode_step_1 = cur.kspace_encode_step_1; }
+            if ( max_idx.slice < cur.slice ) { max_idx.slice = cur.slice; }
+            if ( max_idx.kspace_encode_step_2 < cur.kspace_encode_step_2 ) { max_idx.kspace_encode_step_2 = cur.kspace_encode_step_2; }
+            if ( max_idx.contrast < cur.contrast ) { max_idx.contrast = cur.contrast; }
+            if ( max_idx.phase < cur.phase ) { max_idx.phase = cur.phase; }
+            if ( max_idx.repetition < cur.repetition ) { max_idx.repetition = cur.repetition; }
+            if ( max_idx.set < cur.set ) { max_idx.set = cur.set; }
+            if ( max_idx.segment < cur.segment ) { max_idx.segment = cur.segment; }
+        }
+
+        GADGET_MSG("[RO E1 Cha Slice E2 Contrast Phase Rep Set Seg] = [" 
+                               << max_col 
+                               << " " << max_idx.kspace_encode_step_1+1 
+                               << " " << max_channel 
+                               << " " << max_idx.slice+1 
+                               << " " << max_idx.kspace_encode_step_2+1 
+                               << " " << max_idx.contrast+1 
+                               << " " << max_idx.phase+1 
+                               << " " << max_idx.repetition+1 
+                               << " " << max_idx.set+1 
+                               << " " << max_idx.segment+1 << "]");
+
+        // allocate buffer for data; counters are zero based, hence the +1
+        std::vector<unsigned int> dims(10);
+        dims[0] = max_col;
+        dims[1] = max_idx.kspace_encode_step_1+1;
+        dims[2] = max_channel;
+        dims[3] = max_idx.slice+1;
+        dims[4] = max_idx.kspace_encode_step_2+1;
+        dims[5] = max_idx.contrast+1;
+        dims[6] = max_idx.phase+1;
+        dims[7] = max_idx.repetition+1;
+        dims[8] = max_idx.set+1;
+        dims[9] = max_idx.segment+1;
+
+        try
+        {
+            buf.create(&dims);
+
+            // reflect flags are kept per readout line : RO and Cha collapse to 1
+            std::vector<unsigned int> reflect_dims(dims);
+            reflect_dims[0] = 1;
+            reflect_dims[2] = 1;
+            reflectBuf.create(&reflect_dims);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Failed create buffer\n");
+            return false;
+        }
+
+        // pass 2 : copy the data
+        std::complex<float>* dst = buf.begin();
+        std::vector<unsigned int> pos(10);
+
+        for ( n=0; n<numOfReadOuts; n++ )
+        {
+            const ISMRMRD::AcquisitionHeader& head = readOutBuffer[n].acqHead_;
+            const ISMRMRD::EncodingCounters& cur = head.idx;
+            const int samples = head.number_of_samples;
+            std::complex<float>* src = const_cast<std::complex<float>*>(readOutBuffer[n].data_.begin());
+
+            // everything but the channel index is fixed for one readout
+            pos[0] = 0;
+            pos[1] = cur.kspace_encode_step_1;
+            pos[3] = cur.slice;
+            pos[4] = cur.kspace_encode_step_2;
+            pos[5] = cur.contrast;
+            pos[6] = cur.phase;
+            pos[7] = cur.repetition;
+            pos[8] = cur.set;
+            pos[9] = cur.segment;
+
+            int cha;
+            for ( cha=0; cha<head.active_channels; cha++ )
+            {
+                pos[2] = cha;
+                int offsetBuffer = buf.calculate_offset(pos);
+                memcpy(dst+offsetBuffer, src+cha*samples, sizeof(std::complex<float>)*samples);
+
+                // the reflect flag lives at channel index 0
+                pos[2] = 0;
+                offsetBuffer = reflectBuf.calculate_offset(pos);
+                reflectBuf.at(offsetBuffer) = readOutBuffer[n].isReflect_;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorGadget::fillBuffer(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+// Fill the per-image header entry that corresponds to the encoding counters idx.
+//
+// m1           : acquisition header the information is taken from
+// messageImage : image header array to fill
+// idx          : encoding counters selecting the entry (via get_offset) and the
+//                time stamp slot (kspace_encode_step_1)
+// The full header is only written for an acquisition flagged ACQ_FIRST_IN_SLICE;
+// the two per-line time stamps are written for every acquisition.
+// Returns false if an exception is caught, true otherwise.
+bool GtPlusAccumulatorGadget::fillImageInfo(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetMessageImageArray* messageImage, const ISMRMRD::EncodingCounters& idx)
+{
+    try
+    {
+        const ISMRMRD::AcquisitionHeader& acq = *(m1->getObjectPtr());
+
+        // locate the image header entry for this acquisition
+        int offset = messageImage->get_offset(idx.slice, idx.kspace_encode_step_2, idx.contrast, idx.phase, idx.repetition, idx.set, idx.segment);
+        GadgetMessageImageExt& img = messageImage->imageArray_[offset];
+
+        // if it is the first acq in a slice, fill in all information
+        if ( ISMRMRD::FlagBit(ISMRMRD::ACQ_FIRST_IN_SLICE).isSet(acq.flags) )
+        {
+            unsigned int d;
+
+            img.version = acq.version;
+            img.flags = acq.flags;
+            img.measurement_uid = acq.measurement_uid;
+
+            // matrix size goes through the setter instead of direct assignment
+            for ( d=0; d<3; d++ )
+            {
+                img.set_matrix_size(d, dimensions_[d]);
+            }
+
+            for ( d=0; d<3; d++ )
+            {
+                img.field_of_view[d] = field_of_view_[d];
+            }
+
+            img.channels = acq.active_channels;
+
+            // geometry of the slice; the quaternion field is intentionally not copied
+            for ( d=0; d<3; d++ )
+            {
+                img.position[d] = acq.position[d];
+                img.read_dir[d] = acq.read_dir[d];
+                img.phase_dir[d] = acq.phase_dir[d];
+                img.slice_dir[d] = acq.slice_dir[d];
+                img.patient_table_position[d] = acq.patient_table_position[d];
+            }
+
+            img.average = acq.idx.average;
+            img.slice = acq.idx.slice;
+            img.contrast = acq.idx.contrast;
+            img.phase = acq.idx.phase;
+            img.repetition = acq.idx.repetition;
+            img.set = acq.idx.set;
+
+            img.acquisition_time_stamp = acq.acquisition_time_stamp;
+
+            for ( d=0; d<3; d++ )
+            {
+                img.physiology_time_stamp[d] = acq.physiology_time_stamp[d];
+            }
+
+            img.image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+            img.image_type = ISMRMRD::TYPE_MAGNITUDE;
+
+            img.image_index = ++image_counter_;
+            img.image_series_index = image_series_;
+
+            // need to store the free user parameters
+            memcpy(img.user_int, acq.user_int, sizeof(int32_t)*8);
+            memcpy(img.user_float, acq.user_float, sizeof(float)*8);
+        }
+
+        // whether or not this acq is the first in a slice, we need to fill the TimeStamps and PMUTimeStamps
+        img.time_stamps[idx.kspace_encode_step_1] = acq.acquisition_time_stamp;
+        img.pmu_time_stamps[idx.kspace_encode_step_1] = acq.physiology_time_stamp[0];
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorGadget::fillImageInfo(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+// Called when the gadget is closed. On the first call, all accumulated data is
+// packed into one message (image header array followed by a KSpaceBuffer) and
+// passed to the next gadget; later calls only forward to BaseClass::close().
+//
+// flags : forwarded unchanged to BaseClass::close()
+// Returns GADGET_FAIL if one of the dynamic buffers cannot be filled or the
+// message cannot be queued; otherwise the result of BaseClass::close(flags).
+int GtPlusAccumulatorGadget::close(unsigned long flags)
+{
+    if ( !triggered_ )
+    {
+        triggered_ = true;
+
+        GADGET_MSG("GtPlusAccumulatorGadget - trigger next gadget ... ");
+
+        GadgetContainerMessage<GadgetMessageImageArray>* cm1 = 
+            new GadgetContainerMessage<GadgetMessageImageArray>();
+
+        GadgetContainerMessage< KSpaceBuffer >* cm2 = 
+            new GadgetContainerMessage< KSpaceBuffer >();
+
+        // cm2 is chained to cm1, so cm1->release() below is used to release both
+        cm1->cont(cm2);
+
+        // copy the image content
+        cm2->getObjectPtr()->buffer_ = kspaceBuffer_->buffer_;
+        cm2->getObjectPtr()->reflect_ = kspaceBuffer_->reflect_;
+
+        // copy the message image array
+        // NOTE(review): within this file messageImage_ is only seen to be allocated
+        // by storeImageData() on the first imaging readout, so it may still be
+        // unset here when no imaging readout was received - confirm constructor
+        if ( messageImage_ )
+        {
+            cm1->getObjectPtr()->copy(*messageImage_);
+        }
+        else
+        {
+            GADGET_MSG("GtPlusAccumulatorGadget - no image message array available ... ");
+        }
+
+        if (!refBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorGadget - ref signal found : " << refBuffer_.size());
+
+            if ( !fillBuffer(refBuffer_, kspaceBuffer_->ref_, kspaceBuffer_->refReflect_) )
+            {
+                GADGET_DEBUG1("fillBuffer(refBuffer_) failed ... \n");
+                cm1->release();
+                return GADGET_FAIL;
+            }
+
+            cm2->getObjectPtr()->ref_ = kspaceBuffer_->ref_;
+            cm2->getObjectPtr()->refReflect_ = kspaceBuffer_->refReflect_;
+        }
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, kspaceBuffer_->phaseCorr_, kspaceBuffer_->phaseCorrReflect_) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return GADGET_FAIL;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = kspaceBuffer_->phaseCorr_;
+            cm2->getObjectPtr()->phaseCorrReflect_ = kspaceBuffer_->phaseCorrReflect_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            // the reflect flags of noise readouts are not kept
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, kspaceBuffer_->noise_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return GADGET_FAIL;
+            }
+
+            cm2->getObjectPtr()->noise_ = kspaceBuffer_->noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            // the reflect flags of other readouts are not kept
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(otherBuffer_, kspaceBuffer_->other_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return GADGET_FAIL;
+            }
+
+            cm2->getObjectPtr()->other_ = kspaceBuffer_->other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            // the message was not queued, so it is still owned here
+            cm1->release();
+            return GADGET_FAIL;
+        }
+    }
+
+    return BaseClass::close(flags);
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorGadget.h b/gadgets/gtPlus/GtPlusAccumulatorGadget.h
new file mode 100644
index 0000000..e45ecee
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorGadget.h
@@ -0,0 +1,198 @@
+#pragma once
+
+#include <complex>
+#include "GtPlusExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+// the buffered kspace is stored as a 10 dimensional array
+// readout
+// kspace_encode_step_1
+// channel
+// slice
+// kspace_encode_step_2
+// contrast
+// phase
+// repetition
+// set
+// segment
+// in the order of [RO E1 CHA SLC E2 CON PHS REP SET SEG]
+
+namespace Gadgetron
+{
+
+// Image header extended with per-readout-line time stamps.
+// It derives from ISMRMRD::ImageHeader, so all standard image header fields are available.
+struct  EXPORTGTPLUS GadgetMessageImageExt : public ISMRMRD::ImageHeader
+{
+    // fields added to store the time_stamp and pmu_time_stamp for every incoming read-out line
+    // if one line is not acquired, the corresponding time is -1
+    // the accumulator indexes both vectors with the kspace_encode_step_1 counter of the line
+    std::vector<int>     time_stamps;
+    std::vector<int>     pmu_time_stamps;
+
+    GadgetMessageImageExt();
+    ~GadgetMessageImageExt();
+
+    // copy the content of aMessageImage into this object
+    void copy(GadgetMessageImageExt& aMessageImage);
+    // set the matrix size along dimension 'index'
+    // NOTE(review): presumably also resizes the time stamp vectors - confirm in the implementation
+    void set_matrix_size(unsigned int index, ACE_UINT16 size);
+    // print the content (implementation not shown here)
+    void dump();
+}; 
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+//   0  1  2   3  4  5    6     7   8   9
+// store a scan with 10 dimensions
+// Array of image headers, one entry per 2D image of a 10 dimensional scan.
+struct  EXPORTGTPLUS GadgetMessageImageArray
+{
+    // size of the image array
+    ACE_UINT16 matrix_size[10];
+
+    // message information for every 2D image [Slice E2 Contrast Phase Rep Set Seg]
+    // raw pointer to the header entries; entries are addressed with get_offset()
+    GadgetMessageImageExt* imageArray_;
+
+    GadgetMessageImageArray();
+    // aSize holds the 10 dimension sizes in the order [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    GadgetMessageImageArray(int aSize[10]);
+    ~GadgetMessageImageArray();
+
+    void resize(int aSize[10]);
+    // copy the content of imageArray into this object
+    void copy(GadgetMessageImageArray& imageArray);
+    // index into imageArray_ for one 2D image
+    // the accumulator passes the counters slice, kspace_encode_step_2, contrast, phase, repetition, set, segment
+    int get_offset(int slc, int par, int eco, int phs, int rep, int set, int seg);
+    // NOTE(review): by their names these extract the entries of one slice / one repetition - confirm in the implementation
+    void extractMessageImageArrayForSLC(int slc, GadgetMessageImageArray& imageArray);
+    void extractMessageImageArrayForREP(int rep, GadgetMessageImageArray& imageArray);
+
+    void dump();
+};
+
+// Container for all data accumulated for one scan : imaging kspace, calibration,
+// noise, phase correction and other readouts, plus the kspace properties.
+// This is the object sent to the next gadget when the accumulator is closed.
+struct EXPORTGTPLUS KSpaceBuffer
+{
+    typedef hoNDArray< std::complex<float> > BufferType;
+    typedef hoNDArray< unsigned short > ReflectBufferType;
+
+    // reflect buffer shows whether a readout line is reflected or not
+    // it has the dimensions of the data buffer, with the RO and Cha dimensions set to 1
+
+    // kspace data
+    BufferType buffer_;
+    ReflectBufferType reflect_;
+
+    // reference ACS data
+    BufferType ref_;
+    ReflectBufferType refReflect_;
+
+    // noise data
+    BufferType noise_;
+
+    // phase correction data
+    BufferType phaseCorr_;
+    ReflectBufferType phaseCorrReflect_;
+
+    // other data, e.g. AIF data
+    BufferType other_;
+
+    // properties of kspace
+    // kspace center readout number
+    unsigned int kSpaceCentreRO_;
+    // kspace center number for the first encoding dimension
+    // the accumulator allocates 2*kSpaceCentreEncode1_ lines along E1
+    unsigned int kSpaceCentreEncode1_;
+    // kspace center number for the second encoding dimension
+    // the accumulator allocates 2*kSpaceCentreEncode2_ lines along E2
+    unsigned int kSpaceCentreEncode2_;
+
+    // kspace max acquired readout number
+    unsigned int kSpaceMaxRO_;
+    // kspace max acquired number for the first encoding dimension
+    unsigned int kSpaceMaxEncode1_;
+    // kspace max acquired number for the second encoding dimension
+    unsigned int kSpaceMaxEncode2_;
+
+    // acceleration rate along the E1 and E2 dimensions
+    unsigned int AccelFactE1_;
+    unsigned int AccelFactE2_;
+
+    // mode of calibration
+    // checkStatus() distinguishes interleaved, embedded, separate and external
+    ISMRMRD::calibrationModeType::value CalibMode_;
+    ISMRMRD::interleavingDimensionType::value InterleaveDim_;
+
+    KSpaceBuffer();
+    ~KSpaceBuffer();
+};
+
+// -----------------------------------------------------------------------------------------------------------
+
+// One buffered readout : a copy of its header, a copy of its samples and its reflect flag.
+// Used for readouts whose final array size is only known once all of them have arrived.
+struct ReadOutBuffer
+{
+    // acquisition header of the readout
+    ISMRMRD::AcquisitionHeader acqHead_;
+    // samples of the readout, stored channel after channel
+    hoNDArray< std::complex<float> > data_;
+    // whether the readout was acquired in the reverse direction
+    bool isReflect_;
+};
+
+// Gadget accumulating all readouts of a scan.
+// Imaging readouts are copied into a pre-allocated kspace buffer; other kinds of
+// readouts are buffered dynamically. When the gadget is closed, everything is
+// packed into one message and passed to the next gadget.
+class EXPORTGTPLUS GtPlusAccumulatorGadget : public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+public:
+    GADGET_DECLARE(GtPlusAccumulatorGadget);
+
+    typedef std::complex<float> ValueType;
+
+    typedef Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< ValueType > > BaseClass;
+
+    typedef std::vector< ReadOutBuffer > ReadOutBufferType;
+    typedef hoNDArray< std::complex<float> > BufferType;
+    typedef hoNDArray< unsigned short > ReflectBufferType;
+
+    GtPlusAccumulatorGadget();
+    ~GtPlusAccumulatorGadget();
+
+    // on the first call, send the accumulated data to the next gadget
+    virtual int close(unsigned long flags);
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, GadgetContainerMessage< hoNDArray< ValueType > > * m2);
+
+    // check the status of incoming readout
+    // bIsKSpace: whether this data is for image
+    // bIsRef: whether this data is for calibration signal
+    // bIsNoise: whether this data is a noise scan
+    // bIsPhaseCorr: whether this data is for phase correction
+    // bIsReflect: whether this data is acquired in the reverse direction (for EPI and similar scans)
+    // bIsOther: other scans
+    virtual bool checkStatus(uint64_t flag, int samples, bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther);
+
+    // store the image data
+    // the kspace buffer and the image message array are allocated on the first call
+    virtual bool storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect);
+
+    // fill the dynamically buffered data
+    // buf and reflectBuf are created with the maximal dimensions found in readOutBuffer
+    virtual bool fillBuffer(ReadOutBufferType& readOutBuffer, BufferType& buf, ReflectBufferType& reflectBuf);
+
+    // fill the per 2D image info
+    virtual bool fillImageInfo(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetMessageImageArray* messageImage, const ISMRMRD::EncodingCounters& idx);
+
+    // buffer for per 2D image information
+    // allocated in storeImageData() when the first imaging readout arrives
+    GadgetMessageImageArray* messageImage_;
+
+    // buffer for image kspace data
+    // if the partial fourier is used, the kspace center is put at the center of buffer
+    // this means zeros will be padded accordingly
+    KSpaceBuffer* kspaceBuffer_;
+
+    // dynamic buffer for other kspace data
+    // these are converted to arrays by fillBuffer() when the gadget is closed
+    ReadOutBufferType refBuffer_;
+    ReadOutBufferType noiseBuffer_;
+    ReadOutBufferType phaseCorrBuffer_;
+    ReadOutBufferType otherBuffer_;
+
+    // dimension for image kspace
+    // [RO E1 Cha Slice E2 Con Phase Rep Set Seg]
+    std::vector<unsigned int> dimensions_;
+
+    // field of view [mm]
+    float field_of_view_[3];
+
+    // image_counter_ is incremented for every image header filled in fillImageInfo()
+    int image_counter_;
+    // series index written into every image header
+    int image_series_;
+
+    // whether the next gadget has been triggered
+    bool triggered_;
+
+    // number of readout samples of an imaging line; used as the RO size of the kspace buffer
+    int meas_max_ro_;
+    // maximal encoding counters of the measurement; used to size the kspace buffer
+    ISMRMRD::EncodingCounters meas_max_idx_;
+    // number of channels, taken from the first imaging readout
+    int meas_max_channel_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.cpp
new file mode 100644
index 0000000..0cf2b4f
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.cpp
@@ -0,0 +1,615 @@
+
+#include "GtPlusAccumulatorIRT2DGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron
+{
+
+GtPlusAccumulatorIRT2DGadget::GtPlusAccumulatorIRT2DGadget() : prev_rep_(-1), cur_rep_(-1), num_scan_buffered_(0)
+{
+
+}
+
+GtPlusAccumulatorIRT2DGadget::~GtPlusAccumulatorIRT2DGadget()
+{
+
+}
+
+int GtPlusAccumulatorIRT2DGadget::process_config(ACE_Message_Block* mb)
+{
+    return BaseClass::process_config(mb);
+}
+
+bool GtPlusAccumulatorIRT2DGadget::
+copyBufferForREP(BufferType& buf, int rep, BufferType& bufREP)
+{   // Copies the data at repetition index 'rep' of buf into bufREP, which is created here with dimension 7 set to 1; returns false on failure.
+    try
+    {
+        boost::shared_ptr< std::vector<unsigned int> > dims = buf.get_dimensions();
+        // NOTE: dimsREP is a copy of the shared_ptr, so it refers to the same vector as dims
+        boost::shared_ptr< std::vector<unsigned int> > dimsREP = dims;
+        (*dimsREP)[7] = 1;
+
+        try
+        {
+            bufREP.create(dimsREP);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Failed create buffer for REP \n");
+            return false;
+        }
+
+        // copy the memory over; the ten dimensions are read in the order [RO E1 CHA SLC E2 CON PHS REP SET SEG]
+        int RO = (*dims)[0];
+        int E1 = (*dims)[1];
+        int CHA = (*dims)[2];
+        int SLC = (*dims)[3];
+        int E2 = (*dims)[4];
+        int CON = (*dims)[5];
+        int PHS = (*dims)[6];
+        int REP = (*dims)[7]; // reads 1: element 7 of the shared vector was overwritten above; not used below
+        int SET = (*dims)[8];
+        int SEG = (*dims)[9];
+
+        int e2, con, phs, slc, set, seg;
+
+        std::vector<unsigned int> pos(10);
+
+        for ( seg=0; seg<SEG; seg++ )
+        {
+            for ( set=0; set<SET; set++ )
+            {
+                for ( slc=0; slc<SLC; slc++ )
+                {
+                    for ( phs=0; phs<PHS; phs++ )
+                    {
+                        for ( con=0; con<CON; con++ )
+                        {
+                            for ( e2=0; e2<E2; e2++ )
+                            {
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = slc;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = rep;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBuffer = buf.calculate_offset(pos);
+
+                                // same position in the destination, whose repetition index is always 0
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = slc;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = 0;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBufferREP = bufREP.calculate_offset(pos);
+
+                                // copy the image content: RO*E1*CHA complex samples starting at the two offsets
+                                memcpy(bufREP.begin()+offsetBufferREP, buf.begin()+offsetBuffer, sizeof(std::complex<float>)*RO*E1*CHA);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorIRT2DGadget::copyBufferForREP(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorIRT2DGadget::
+copyReflectBufferForREP(ReflectBufferType& buf, int rep, ReflectBufferType& bufREP)
+{   // Copies the reflect flags at repetition index 'rep' of buf into bufREP, which is created here with dimension 7 set to 1; returns false on failure.
+    try
+    {
+        boost::shared_ptr< std::vector<unsigned int> > dims = buf.get_dimensions();
+        // NOTE: dimsREP is a copy of the shared_ptr, so it refers to the same vector as dims
+        boost::shared_ptr< std::vector<unsigned int> > dimsREP = dims;
+        (*dimsREP)[7] = 1;
+
+        try
+        {
+            bufREP.create(dimsREP);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Failed create buffer for REP \n");
+            return false;
+        }
+
+        // copy the memory over; the ten dimensions are read in the order [RO E1 CHA SLC E2 CON PHS REP SET SEG]
+        int RO = (*dims)[0];
+        int E1 = (*dims)[1];
+        int CHA = (*dims)[2];
+        int SLC = (*dims)[3];
+        int E2 = (*dims)[4];
+        int CON = (*dims)[5];
+        int PHS = (*dims)[6];
+        // (*dims)[7] (REP) is not read: only the single repetition 'rep' is copied
+        int SET = (*dims)[8];
+        int SEG = (*dims)[9];
+
+        int e2, con, phs, slc, set, seg;
+
+        std::vector<unsigned int> pos(10);
+
+        for ( seg=0; seg<SEG; seg++ )
+        {
+            for ( set=0; set<SET; set++ )
+            {
+                for ( slc=0; slc<SLC; slc++ )
+                {
+                    for ( phs=0; phs<PHS; phs++ )
+                    {
+                        for ( con=0; con<CON; con++ )
+                        {
+                            for ( e2=0; e2<E2; e2++ )
+                            {
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = slc;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = rep;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBuffer = buf.calculate_offset(pos);
+
+                                // same position in the destination, whose repetition index is always 0
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = slc;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = 0;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBufferREP = bufREP.calculate_offset(pos);
+
+                                // copy the flags; assumes ReflectBufferType stores unsigned short elements - TODO confirm
+                                memcpy(bufREP.begin()+offsetBufferREP, buf.begin()+offsetBuffer, sizeof(unsigned short)*RO*E1*CHA);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorIRT2DGadget::copyReflectBufferForREP(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorIRT2DGadget::triggerREP(int rep)
+{   // Packs the buffered k-space, image info and any ref/phaseCorr/noise/other data into a new message and queues it on the next gadget; returns false on failure.
+    try
+    {
+        GadgetContainerMessage<GadgetMessageImageArray>* cm1 =
+            new GadgetContainerMessage<GadgetMessageImageArray>();
+
+        GadgetContainerMessage< KSpaceBuffer >* cm2 =
+            new GadgetContainerMessage< KSpaceBuffer >();
+
+        cm1->cont(cm2);
+
+        // copy the kspace data for this REP; the image buffer is always read at repetition index 0, 'rep' is only used for the other buffers below
+        if ( !copyBufferForREP(kspaceBuffer_->buffer_, 0, cm2->getObjectPtr()->buffer_) )
+        {
+            GADGET_DEBUG1("Unable to copyBufferForREP\n");
+            cm1->release();
+            return false;
+        }
+
+        if ( !copyReflectBufferForREP(kspaceBuffer_->reflect_, 0, cm2->getObjectPtr()->reflect_) )
+        {
+            GADGET_DEBUG1("Unable to copyReflectBufferForREP\n");
+            cm1->release();
+            return false;
+        }
+
+        // fill buffer with zeros, ready for next REP
+        kspaceBuffer_->buffer_.fill(0);
+        kspaceBuffer_->reflect_.fill(0);
+
+        // copy the message image array for this REP
+        GadgetMessageImageArray aMessageArray;
+        messageImage_->extractMessageImageArrayForREP(0, aMessageArray);
+        cm1->getObjectPtr()->copy(aMessageArray);
+
+        if (!refBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorIRT2DGadget - ref signal found : " << refBuffer_.size());
+
+            BufferType refCurr;
+            ReflectBufferType refReflectCurr;
+            if ( !fillBuffer(refBuffer_, refCurr, refReflectCurr) )
+            {
+                GADGET_DEBUG1("fillBuffer(refBuffer_, refCurr, refReflectCurr) failed ... \n");
+                cm1->release();
+                return false; // this function returns bool; a non-zero GADGET_FAIL would convert to true
+            }
+
+            if ( !copyBufferForREP(refCurr, rep, cm2->getObjectPtr()->ref_) )
+            {
+                GADGET_DEBUG1("Unable to copyBufferForREP(refCurr, rep, cm2->getObjectPtr()->ref_)\n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyReflectBufferForREP(refReflectCurr, rep, cm2->getObjectPtr()->refReflect_) )
+            {
+                GADGET_DEBUG1("Unable to copyReflectBufferForREP(refReflectCurr, rep, cm2->getObjectPtr()->refReflect_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorIRT2DGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            BufferType phsCorrCurr;
+            ReflectBufferType phsCorrReflectCurr;
+            if ( !fillBuffer(phaseCorrBuffer_, phsCorrCurr, phsCorrReflectCurr) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_, phsCorrCurr, phsCorrReflectCurr) failed ... \n");
+                cm1->release();
+                return false; // this function returns bool; a non-zero GADGET_FAIL would convert to true
+            }
+
+            if ( !copyBufferForREP(phsCorrCurr, rep, cm2->getObjectPtr()->phaseCorr_) )
+            {
+                GADGET_DEBUG1("Unable to copyBufferForREP(phsCorrCurr, rep, cm2->getObjectPtr()->phaseCorr_)\n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyReflectBufferForREP(phsCorrReflectCurr, rep, cm2->getObjectPtr()->phaseCorrReflect_) )
+            {
+                GADGET_DEBUG1("Unable to copyReflectBufferForREP(phsCorrReflectCurr, rep, cm2->getObjectPtr()->phaseCorrReflect_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorIRT2DGadget - noise signal found : " << noiseBuffer_.size());
+
+            BufferType noiseCurr;
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, noiseCurr, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_, noiseCurr, tmpBuf) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyBufferForREP(noiseCurr, rep, cm2->getObjectPtr()->noise_) )
+            {
+                GADGET_DEBUG1("Unable to copyBufferForREP(noiseCurr, rep, cm2->getObjectPtr()->noise_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorIRT2DGadget - other signal found : " << otherBuffer_.size());
+
+            BufferType otherCurr;
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(otherBuffer_, otherCurr, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_, otherCurr, tmpBuf) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyBufferForREP(otherCurr, rep, cm2->getObjectPtr()->other_) )
+            {
+                GADGET_DEBUG1("Unable to copyBufferForREP(otherCurr, rep, cm2->getObjectPtr()->other_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0)
+        {
+            return false; // NOTE(review): cm1 is not released on this path - confirm who owns it when putq fails
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorIRT2DGadget::triggerREP(rep) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorIRT2DGadget::
+storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect)
+{
+    try
+    {
+        unsigned int ii;
+        int samples =  m1->getObjectPtr()->number_of_samples;
+        ISMRMRD::EncodingCounters idx = m1->getObjectPtr()->idx;
+
+        if ( kspaceBuffer_->buffer_.get_number_of_elements() <= 0 )
+        {
+            meas_max_channel_ = m1->getObjectPtr()->active_channels;
+
+            int E1 = 2*kspaceBuffer_->kSpaceCentreEncode1_;
+            int E2 = 2*kspaceBuffer_->kSpaceCentreEncode2_;
+
+            // find the loop counter boundary and allocate the buffer
+            GADGET_MSG("[RO E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+                               << meas_max_ro_ 
+                               << " " << E1 
+                               << " " << meas_max_channel_ 
+                               << " " << meas_max_idx_.slice+1 
+                               << " " << E2 
+                               << " " << meas_max_idx_.contrast+1 
+                               << " " << meas_max_idx_.phase+1 
+                               << " " << 1 
+                               << " " << meas_max_idx_.set+1 
+                               << " " << meas_max_idx_.segment+1 << "]");
+
+            dimensions_.push_back(meas_max_ro_);
+            dimensions_.push_back(E1);
+            dimensions_.push_back(meas_max_channel_);
+            dimensions_.push_back(meas_max_idx_.slice+1);
+            dimensions_.push_back(E2);
+            dimensions_.push_back(meas_max_idx_.contrast+1);
+            dimensions_.push_back(meas_max_idx_.phase+1);
+            dimensions_.push_back(1);
+            dimensions_.push_back(meas_max_idx_.set+1);
+            dimensions_.push_back(meas_max_idx_.segment+1);
+
+            unsigned int N = dimensions_.size();
+            for ( ii=0; ii<N; ii++ )
+            {
+                GADGET_MSG("dimensions_[" << ii << "] = " << dimensions_[ii]);
+            }
+
+            // allocate data buffer
+            try
+            {
+                kspaceBuffer_->buffer_.create(&dimensions_);
+
+                std::vector<unsigned int> reflect_dimensions_(dimensions_);
+                reflect_dimensions_[0] = 1;
+                reflect_dimensions_[2] = 1;
+                kspaceBuffer_->reflect_.create(&reflect_dimensions_);
+            }
+            catch(...)
+            {
+                GADGET_DEBUG1("Failed create buffer\n");
+                return false;
+            }
+
+            // allocate message buffer
+            int matrix_size[10];
+            for ( ii=0; ii<10; ii++ )
+            {
+                matrix_size[ii] = dimensions_[ii];
+            }
+
+            if (!(messageImage_ = new GadgetMessageImageArray(matrix_size))) 
+            {
+                GADGET_DEBUG1("Failed create buffer\n");
+                return false;
+            }
+        }
+
+        std::complex<float>* b = kspaceBuffer_->buffer_.begin();
+        std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+        if (samples != static_cast<int>(dimensions_[0])) 
+        {
+            GADGET_DEBUG1("Wrong number of samples received\n");
+            return false;
+        }
+
+        //Copy the data for all the channels
+        std::vector<unsigned int> pos(10);
+        for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) 
+        {
+            pos[0] = 0;
+            pos[1] = idx.kspace_encode_step_1;
+            pos[2] = c;
+            pos[3] = idx.slice;
+            pos[4] = idx.kspace_encode_step_2;
+            pos[5] = idx.contrast;
+            pos[6] = idx.phase;
+            pos[7] = 0;
+            pos[8] = idx.set;
+            pos[9] = idx.segment;
+            int offsetBuffer = kspaceBuffer_->buffer_.calculate_offset(pos);
+
+            memcpy(b+offsetBuffer, d+c*samples, sizeof(std::complex<float>)*samples);
+
+            pos[2] = 0;
+            offsetBuffer = kspaceBuffer_->reflect_.calculate_offset(pos);
+            kspaceBuffer_->reflect_.at(offsetBuffer) = isReflect;
+        }
+
+        idx.repetition = 0;
+        if ( !fillImageInfo(m1, messageImage_, idx) )
+        {
+            GADGET_DEBUG1("Failed in fillImageInfo(m1, messageImage_, idx)\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorIRT2DGadget::storeImageData(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusAccumulatorIRT2DGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, 
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{   // Classifies one readout via checkStatus, stores it in the matching buffer, and calls triggerREP when ACQ_LAST_IN_SLICE is set; releases m1 on success.
+    // check whether a new REP starts
+    bool isLastScanInSlice = ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+
+    bool isNewRep = false;
+    cur_rep_ = m1->getObjectPtr()->idx.repetition;
+
+    if ( prev_rep_==-1 )
+    {
+        prev_rep_ = cur_rep_;
+    }
+    else
+    {
+        if ( cur_rep_!=prev_rep_ )
+        {
+            isNewRep = true;
+        }
+    }
+    // NOTE(review): prev_rep_ only advances at the trigger below, so isNewRep stays true for every readout of a new repetition - confirm the per-readout clear() calls are intended
+    bool bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther;
+    if ( !checkStatus(m1->getObjectPtr()->flags, m1->getObjectPtr()->number_of_samples, bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther) )
+    {
+        GADGET_DEBUG1("Failed check readout status\n");
+        return GADGET_FAIL;
+    }
+
+    // store kspace read out
+    if ( bIsKSpace )
+    {
+        if ( !storeImageData(m1, m2, bIsReflect) )
+        {
+            GADGET_DEBUG1("Failed to store image data\n");
+            return GADGET_FAIL;
+        }
+
+        num_scan_buffered_++;
+    }
+
+    // store ref read out
+    if ( bIsRef )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+
+        if ( isNewRep )
+        {
+            refBuffer_.clear();
+        }
+
+        refBuffer_.push_back(item);
+    }
+
+    // store phaseCorr read out
+    if ( bIsPhaseCorr )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+
+        if ( isNewRep )
+        {
+            phaseCorrBuffer_.clear();
+        }
+
+        phaseCorrBuffer_.push_back(item);
+    }
+
+    // store noise read out
+    if ( bIsNoise )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+
+        if ( isNewRep )
+        {
+            noiseBuffer_.clear();
+        }
+
+        noiseBuffer_.push_back(item);
+    }
+
+    // store other read out
+    if ( bIsOther )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+
+        if ( isNewRep )
+        {
+            otherBuffer_.clear();
+        }
+
+        otherBuffer_.push_back(item);
+    }
+
+    // the last scan in a slice marks the buffered data as complete, so it is sent out here
+    if ( isLastScanInSlice )
+    {
+        // GADGET_MSG("Repetition " << prev_rep_ << " is complete ... ");
+
+        if ( !triggerREP(prev_rep_) ) 
+        {
+            GADGET_DEBUG1("Unable to trigger this rep ... \n");
+            return GADGET_FAIL;
+        }
+
+        prev_rep_ = cur_rep_;
+
+        GADGET_ERROR_MSG("GtPlusAccumulatorIRT2DGadget - trigger next gadget for REP " << prev_rep_ << " - scan buffered - " << num_scan_buffered_ << " ... ");
+        num_scan_buffered_ = 0;
+    }
+
+    m1->release();
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorIRT2DGadget)
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.h b/gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.h
new file mode 100644
index 0000000..51e2270
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorIRT2DGadget.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "GtPlusAccumulatorGadget.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUS GtPlusAccumulatorIRT2DGadget : public GtPlusAccumulatorGadget
+{
+public:
+    GADGET_DECLARE(GtPlusAccumulatorIRT2DGadget);
+
+    typedef GtPlusAccumulatorGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+    typedef BaseClass::ReadOutBufferType ReadOutBufferType;
+    typedef BaseClass::BufferType BufferType;
+    typedef BaseClass::ReflectBufferType ReflectBufferType;
+
+    GtPlusAccumulatorIRT2DGadget();
+    ~GtPlusAccumulatorIRT2DGadget();
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+
+    // here, every 2D kspace is stored and sent out for every new repetition
+    virtual bool storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect);
+
+    virtual bool triggerREP(int rep);
+    virtual int process(Gadgetron::GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > > * m2);
+
+    virtual bool copyBufferForREP(BufferType& buf, int rep, BufferType& bufREP);
+    virtual bool copyReflectBufferForREP(ReflectBufferType& buf, int rep, ReflectBufferType& bufREP);
+
+    int prev_rep_;
+    int cur_rep_;
+
+    int num_scan_buffered_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.cpp
new file mode 100644
index 0000000..cf1ab36
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.cpp
@@ -0,0 +1,55 @@
+#include "GtPlusAccumulatorPerfAIFGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron
+{
+
+GtPlusAccumulatorPerfAIFGadget::GtPlusAccumulatorPerfAIFGadget() : cur_rep_(0)
+{
+
+}
+
+GtPlusAccumulatorPerfAIFGadget::~GtPlusAccumulatorPerfAIFGadget()
+{
+
+}
+
+int GtPlusAccumulatorPerfAIFGadget::process_config(ACE_Message_Block* mb)
+{
+    return BaseClass::process_config(mb);
+}
+
+int GtPlusAccumulatorPerfAIFGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, 
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{   // Counts finished repetitions in cur_rep_, hands the readout to the base accumulator, then stamps cur_rep_ on the newest 'other' readout.
+    bool bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther;
+    if ( !checkStatus(m1->getObjectPtr()->flags, m1->getObjectPtr()->number_of_samples, bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther) )
+    {
+        GADGET_DEBUG1("Failed check readout status\n");
+        return GADGET_FAIL;
+    }
+
+    // Last scan for measurement of the first slice can indicate the number of repetition
+    bool is_last_scan_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+    if ( is_last_scan_in_slice && m1->getObjectPtr()->idx.slice==0 && !bIsOther )
+    {
+        GADGET_MSG("Repetition " << cur_rep_ << " is complete ... ");
+        cur_rep_++;
+    }
+    // propagate a failure of the base accumulator instead of reporting success; m1 is not touched after this call
+    if ( BaseClass::process(m1, m2) != GADGET_OK ) { return GADGET_FAIL; }
+
+    // if the other data is stored, need to correct the repetition
+    if ( bIsOther )
+    {
+        if ( !otherBuffer_.empty() )
+        {
+            otherBuffer_[otherBuffer_.size()-1].acqHead_.idx.repetition = cur_rep_;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorPerfAIFGadget)
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.h b/gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.h
new file mode 100644
index 0000000..48bf802
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorPerfAIFGadget.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "GtPlusAccumulatorGadget.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUS GtPlusAccumulatorPerfAIFGadget : public GtPlusAccumulatorGadget
+{
+public:
+    GADGET_DECLARE(GtPlusAccumulatorPerfAIFGadget);
+
+    typedef GtPlusAccumulatorGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+    typedef BaseClass::ReadOutBufferType ReadOutBufferType;
+    typedef BaseClass::BufferType BufferType;
+    typedef BaseClass::ReflectBufferType ReflectBufferType;
+
+    GtPlusAccumulatorPerfAIFGadget();
+    ~GtPlusAccumulatorPerfAIFGadget();
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+
+    virtual int process(Gadgetron::GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > > * m2);
+
+    int cur_rep_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorSLCGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorSLCGadget.cpp
new file mode 100644
index 0000000..1ce603c
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorSLCGadget.cpp
@@ -0,0 +1,403 @@
+#include "GtPlusAccumulatorSLCGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron
+{
+
+GtPlusAccumulatorSLCGadget::GtPlusAccumulatorSLCGadget() : prev_slc_(-1), cur_slc_(-1)
+{
+
+}
+
+GtPlusAccumulatorSLCGadget::~GtPlusAccumulatorSLCGadget()
+{
+
+}
+
+int GtPlusAccumulatorSLCGadget::process_config(ACE_Message_Block* mb)
+{
+    return BaseClass::process_config(mb);
+}
+
+bool GtPlusAccumulatorSLCGadget::
+copyBufferForSLC(BufferType& buf, int slc, BufferType& bufSLC)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<unsigned int> > dims = buf.get_dimensions();
+
+        boost::shared_ptr< std::vector<unsigned int> > dimsSLC = dims;
+        (*dimsSLC)[3] = 1;
+
+        try
+        {
+            bufSLC.create(dimsSLC);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Failed create buffer for SLC \n");
+            return false;
+        }
+
+        // copy the memory over
+        int RO = (*dims)[0];
+        int E1 = (*dims)[1];
+        int CHA = (*dims)[2];
+        int SLC = (*dims)[3];
+        int E2 = (*dims)[4];
+        int CON = (*dims)[5];
+        int PHS = (*dims)[6];
+        int REP = (*dims)[7];
+        int SET = (*dims)[8];
+        int SEG = (*dims)[9];
+
+        int e2, con, phs, rep, set, seg;
+
+        std::vector<unsigned int> pos(10);
+
+        for ( seg=0; seg<SEG; seg++ )
+        {
+            for ( set=0; set<SET; set++ )
+            {
+                for ( rep=0; rep<REP; rep++ )
+                {
+                    for ( phs=0; phs<PHS; phs++ )
+                    {
+                        for ( con=0; con<CON; con++ )
+                        {
+                            for ( e2=0; e2<E2; e2++ )
+                            {
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = slc;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = rep;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBuffer = buf.calculate_offset(pos);
+
+                                // buffer slc
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = 0;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = rep;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBufferSLC = bufSLC.calculate_offset(pos);
+
+                                // copy the image content
+                                memcpy(bufSLC.begin()+offsetBufferSLC, buf.begin()+offsetBuffer, sizeof(std::complex<float>)*RO*E1*CHA);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorSLCGadget::copyBufferForSLC(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorSLCGadget::
+copyReflectBufferForSLC(ReflectBufferType& buf, int slc, ReflectBufferType& bufSLC)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<unsigned int> > dims = buf.get_dimensions();
+
+        boost::shared_ptr< std::vector<unsigned int> > dimsSLC = dims;
+        (*dimsSLC)[3] = 1;
+
+        try
+        {
+            bufSLC.create(dimsSLC);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Failed create buffer for SLC \n");
+            return false;
+        }
+
+        // copy the memory over
+        int RO = (*dims)[0];
+        int E1 = (*dims)[1];
+        int CHA = (*dims)[2];
+        int SLC = (*dims)[3];
+        int E2 = (*dims)[4];
+        int CON = (*dims)[5];
+        int PHS = (*dims)[6];
+        int REP = (*dims)[7];
+        int SET = (*dims)[8];
+        int SEG = (*dims)[9];
+
+        int e2, con, phs, rep, set, seg;
+
+        std::vector<unsigned int> pos(10);
+
+        for ( seg=0; seg<SEG; seg++ )
+        {
+            for ( set=0; set<SET; set++ )
+            {
+                for ( rep=0; rep<REP; rep++ )
+                {
+                    for ( phs=0; phs<PHS; phs++ )
+                    {
+                        for ( con=0; con<CON; con++ )
+                        {
+                            for ( e2=0; e2<E2; e2++ )
+                            {
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = slc;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = rep;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBuffer = buf.calculate_offset(pos);
+
+                                // buffer slc
+                                pos[0] = 0;
+                                pos[1] = 0;
+                                pos[2] = 0;
+                                pos[3] = 0;
+                                pos[4] = e2;
+                                pos[5] = con;
+                                pos[6] = phs;
+                                pos[7] = rep;
+                                pos[8] = set;
+                                pos[9] = seg;
+                                int offsetBufferSLC = bufSLC.calculate_offset(pos);
+
+                                // copy the image content
+                                memcpy(bufSLC.begin()+offsetBufferSLC, buf.begin()+offsetBuffer, sizeof(unsigned short)*RO*E1*CHA);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorSLCGadget::copyReflectBufferForSLC(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorSLCGadget::triggerSLC(int slc)
+{   // Packs the accumulated buffers of slice slc into a new message chain and queues it on the next gadget; returns false on any failure.
+    try
+    {
+        GadgetContainerMessage<GadgetMessageImageArray>* cm1 = 
+            new GadgetContainerMessage<GadgetMessageImageArray>();
+
+        GadgetContainerMessage< KSpaceBuffer >* cm2 = 
+            new GadgetContainerMessage< KSpaceBuffer >();
+        // chain cm2 behind cm1; the error paths below release only cm1 (assumes release() also frees the continuation -- confirm)
+        cm1->cont(cm2);
+
+        // copy the kspace data for this SLC
+        if ( !copyBufferForSLC(kspaceBuffer_->buffer_, slc, cm2->getObjectPtr()->buffer_) ) 
+        {
+            GADGET_DEBUG1("Unable to copyBufferForSLC\n");
+            cm1->release();
+            return false;
+        }
+
+        if ( !copyReflectBufferForSLC(kspaceBuffer_->reflect_, slc, cm2->getObjectPtr()->reflect_) ) 
+        {
+            GADGET_DEBUG1("Unable to copyReflectBufferForSLC\n");
+            cm1->release();
+            return false;
+        }
+
+        // copy the message image array for this SLC
+        GadgetMessageImageArray aMessageArraySLC;
+        messageImage_->extractMessageImageArrayForSLC(slc, aMessageArraySLC);
+        cm1->getObjectPtr()->copy(aMessageArraySLC);
+
+        if (!refBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorSLCGadget - ref signal found : " << refBuffer_.size());
+            // fillBuffer() converts the stored ref readouts into refCurr/refReflectCurr (defined elsewhere); the part for this slice is then copied out
+            BufferType refCurr;
+            ReflectBufferType refReflectCurr;
+            if ( !fillBuffer(refBuffer_, refCurr, refReflectCurr) )
+            {
+                GADGET_DEBUG1("fillBuffer(refBuffer_, refCurr, refReflectCurr) failed ... \n");
+                cm1->release();
+                return false; // was GADGET_FAIL: an int status (-1 in Gadgetron's Gadget.h) that converts to true in this bool function
+            }
+
+            if ( !copyBufferForSLC(refCurr, slc, cm2->getObjectPtr()->ref_) ) 
+            {
+                GADGET_DEBUG1("Unable to copyBufferForSLC(refCurr, slc, cm2->getObjectPtr()->ref_)\n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyReflectBufferForSLC(refReflectCurr, slc, cm2->getObjectPtr()->refReflect_) ) 
+            {
+                GADGET_DEBUG1("Unable to copyBufferForSLC(refReflectCurr, slc, cm2->getObjectPtr()->refReflect_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorSLCGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            BufferType phsCorrCurr;
+            ReflectBufferType phsCorrReflectCurr;
+            if ( !fillBuffer(phaseCorrBuffer_, phsCorrCurr, phsCorrReflectCurr) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_, phsCorrCurr, phsCorrReflectCurr) failed ... \n");
+                cm1->release();
+                return false; // was GADGET_FAIL: an int status (-1 in Gadgetron's Gadget.h) that converts to true in this bool function
+            }
+
+            if ( !copyBufferForSLC(phsCorrCurr, slc, cm2->getObjectPtr()->phaseCorr_) ) 
+            {
+                GADGET_DEBUG1("Unable to copyBufferForSLC(phsCorrCurr, slc, cm2->getObjectPtr()->phaseCorr_)\n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyReflectBufferForSLC(phsCorrReflectCurr, slc, cm2->getObjectPtr()->phaseCorrReflect_) ) 
+            {
+                GADGET_DEBUG1("Unable to copyBufferForSLC(phsCorrReflectCurr, slc, cm2->getObjectPtr()->phaseCorrReflect_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorSLCGadget - noise signal found : " << noiseBuffer_.size());
+            // the reflect output of fillBuffer() (tmpBuf) is not forwarded for noise data
+            BufferType noiseCurr;
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, noiseCurr, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_, noiseCurr, tmpBuf) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyBufferForSLC(noiseCurr, slc, cm2->getObjectPtr()->noise_) ) 
+            {
+                GADGET_DEBUG1("Unable to copyBufferForSLC(noiseCurr, slc, cm2->getObjectPtr()->noise_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_MSG("GtPlusAccumulatorSLCGadget - other signal found : " << otherBuffer_.size());
+            // the reflect output of fillBuffer() (tmpBuf) is not forwarded for other data
+            BufferType otherCurr;
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(otherBuffer_, otherCurr, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_, otherCurr, tmpBuf) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            if ( !copyBufferForSLC(otherCurr, slc, cm2->getObjectPtr()->other_) ) 
+            {
+                GADGET_DEBUG1("Unable to copyBufferForSLC(otherCurr, slc, cm2->getObjectPtr()->other_)\n");
+                cm1->release();
+                return false;
+            }
+        }
+
+        // send to next gadget; NOTE(review): cm1 is not released when putq fails -- confirm whether the chain leaks here
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {   // NOTE(review): cm1 is scoped to the try block, so it cannot be released here if an exception escapes after allocation
+        GADGET_DEBUG1("Errors in triggerSLC(slc) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusAccumulatorSLCGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, 
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{   // Hands the readout to the base class and triggers the previous slice whenever the slice index changes.
+    cur_slc_ = m1->getObjectPtr()->idx.slice;
+    // NOTE(review): the slice index is read before the base call (presumably because the base class may release m1); the base return value is ignored
+    BaseClass::process(m1, m2);
+    // prev_slc_ == -1 means no slice has been recorded yet (presumably its initial value -- confirm in the constructor)
+    if ( prev_slc_==-1 )
+    {
+        prev_slc_ = cur_slc_;
+    }
+
+    // if a new slice comes, it indicates the previous one is complete and can be sent out
+    if ( cur_slc_!=prev_slc_ )
+    {
+        GADGET_MSG("Slice " << prev_slc_ << " is complete ... ");
+
+        // send out prev slice
+        GADGET_MSG("GtPlusAccumulatorSLCGadget - trigger next gadget for SLC " << prev_slc_ << " ... ");
+        if ( !triggerSLC(prev_slc_) ) 
+        {
+            GADGET_DEBUG1("Unable to trigger this slc ... \n");
+            return GADGET_FAIL;
+        }
+        // from now on the slice of the current readout is the one being accumulated
+        prev_slc_ = cur_slc_;
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusAccumulatorSLCGadget::close(unsigned long flags)
+{   // Sends out the slice still pending (cur_slc_) if nothing was triggered yet, then delegates to BaseClass::close(flags).
+    // the last slice is still not sent out yet
+    if ( !triggered_ )
+    {
+        GADGET_MSG("GtPlusAccumulatorSLCGadget - trigger next gadget for SLC " << cur_slc_ << " ... ");
+
+        if ( !triggerSLC(cur_slc_) ) 
+        {
+            GADGET_DEBUG1("Unable to trigger this slc ... \n");
+            return GADGET_FAIL;
+        }
+
+        triggered_ = true;
+    }
+    // triggered_ is not declared in this class (presumably inherited); it is set again unconditionally before the base call
+    // the base class shall do nothing
+    triggered_ = true;
+    return BaseClass::close(flags);
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorSLCGadget) // presumably exposes the gadget to the framework's factory -- confirm against the macro definition
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorSLCGadget.h b/gadgets/gtPlus/GtPlusAccumulatorSLCGadget.h
new file mode 100644
index 0000000..dd9178c
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorSLCGadget.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "GtPlusAccumulatorGadget.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUS GtPlusAccumulatorSLCGadget : public GtPlusAccumulatorGadget
+{   // Accumulator that forwards data one slice at a time: a slice is sent on when the next slice starts, the last one in close().
+public:
+    GADGET_DECLARE(GtPlusAccumulatorSLCGadget);
+    // shorthand for the parent accumulator and the types it defines
+    typedef GtPlusAccumulatorGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+    typedef BaseClass::ReadOutBufferType ReadOutBufferType;
+    typedef BaseClass::BufferType BufferType;
+    typedef BaseClass::ReflectBufferType ReflectBufferType;
+
+    GtPlusAccumulatorSLCGadget();
+    ~GtPlusAccumulatorSLCGadget();
+    // triggers the slice that is still pending (if any) and then calls BaseClass::close(flags)
+    virtual int close(unsigned long flags);
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    // per-slice extraction helpers used by triggerSLC(): copy the data of slice slc from buf into bufSLC; false signals failure
+    virtual bool copyBufferForSLC(BufferType& buf, int slc, BufferType& bufSLC);
+    virtual bool copyReflectBufferForSLC(ReflectBufferType& buf, int slc, ReflectBufferType& bufSLC);
+    // builds the message chain for slice slc and puts it on the next gadget's queue; false on failure
+    virtual bool triggerSLC(int slc);
+    // accumulates one readout via the base class and triggers the previous slice when the slice index changes
+    virtual int process(Gadgetron::GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > > * m2);
+    // slice index of the previous / the current readout; process() compares prev_slc_ against -1 to detect the first readout
+    int prev_slc_;
+    int cur_slc_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.cpp b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.cpp
new file mode 100644
index 0000000..79aef32
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.cpp
@@ -0,0 +1,2282 @@
+#include "GtPlusAccumulatorWorkOrderTriggerGadget.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusAccumulatorWorkOrderTriggerGadget::GtPlusAccumulatorWorkOrderTriggerGadget()
+: image_counter_(0), image_series_(100), first_kspace_scan_(true), triggered_in_close_(false), triggered_in_process_(false), triggered_in_process_last_acq_(false), 
+    prev_dim1_(-1), curr_dim1_(-1), prev_dim2_(-1), curr_dim2_(-1), count_dim1_(0), verboseMode_(false), other_kspace_matching_Dim_(DIM_NONE)
+{   // Sets the trigger bookkeeping to its initial state; image_series_ and verboseMode_ are overwritten later in process_config().
+    space_matrix_offset_E1_ = 0;
+    space_matrix_offset_E2_ = 0;
+    // start without a previous acquisition header and with an all-zero reference max-index record
+    gtPlusISMRMRDReconUtil<ValueType>().clearAcquisitionHeaderISMRMRD(prev_acq_header_);
+    memset(&meas_max_idx_ref_, 0, sizeof(ISMRMRD::EncodingCounters));
+}
+
+GtPlusAccumulatorWorkOrderTriggerGadget::~GtPlusAccumulatorWorkOrderTriggerGadget()
+{
+    // intentionally empty: no explicit cleanup is performed here
+}
+
+// Reads the gadget parameters and the ISMRMRD xml header held in mb (trigger dimensions, calibration mode, matrix/FOV sizes, encoding limits); returns GADGET_OK or GADGET_FAIL.
+int GtPlusAccumulatorWorkOrderTriggerGadget::process_config(ACE_Message_Block* mb)
+{
+    // gadget parameters
+    image_series_ = this->get_int_value("image_series");
+    // one (triggerDim1, triggerDim2, numOfKSpace_triggerDim1) triple per calibration mode
+    noacceleration_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("noacceleration_triggerDim1")));
+    noacceleration_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("noacceleration_triggerDim2")));
+    noacceleration_numOfKSpace_triggerDim1_ = this->get_int_value("noacceleration_numOfKSpace_triggerDim1"); 
+
+    interleaved_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("interleaved_triggerDim1")));
+    interleaved_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("interleaved_triggerDim2")));
+    interleaved_numOfKSpace_triggerDim1_ = this->get_int_value("interleaved_numOfKSpace_triggerDim1"); 
+
+    embedded_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("embedded_triggerDim1")));
+    embedded_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("embedded_triggerDim2")));
+    embedded_numOfKSpace_triggerDim1_ = this->get_int_value("embedded_numOfKSpace_triggerDim1");
+
+    separate_triggerDim1_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("separate_triggerDim1")));
+    separate_triggerDim2_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("separate_triggerDim2")));
+    separate_numOfKSpace_triggerDim1_ = this->get_int_value("separate_numOfKSpace_triggerDim1");
+
+    other_kspace_matching_Dim_ = gtPlus_util_.getISMRMRDDimFromName(*(this->get_string_value("other_kspace_matching_Dim")));
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // ---------------------------------------------------------------------------------------------------------
+    // parse the xml file
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    // seq object
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1)
+    {
+        GADGET_DEBUG2("Number of encoding spaces: %d\n", e_seq.size());
+        GADGET_DEBUG1("This simple GtPlusAccumulatorWorkOrderTriggerGadget only supports one encoding space\n");
+        return GADGET_FAIL;
+    }
+
+    // find out the PAT mode; the parallelImaging element is optional, so it must be present before it is dereferenced
+    ISMRMRD::ismrmrdHeader::parallelImaging_optional p_imaging_type = cfg->parallelImaging();
+    if ( !p_imaging_type.present() ) { GADGET_DEBUG1("parallelImaging section is missing in the xml header\n"); return GADGET_FAIL; }
+    ISMRMRD::parallelImagingType p_imaging = *p_imaging_type;
+    workOrder_.acceFactorE1_ = (size_t)(p_imaging.accelerationFactor().kspace_encoding_step_1());
+    workOrder_.acceFactorE2_ = (size_t)(p_imaging.accelerationFactor().kspace_encoding_step_2());
+    GADGET_CONDITION_MSG(verboseMode_, "acceFactorE1_ is " << workOrder_.acceFactorE1_);
+    GADGET_CONDITION_MSG(verboseMode_, "acceFactorE2_ is " << workOrder_.acceFactorE2_);
+    if ( !p_imaging.calibrationMode().present() ) { GADGET_DEBUG1("calibrationMode is missing in the xml header\n"); return GADGET_FAIL; }
+    ISMRMRD::calibrationModeType calib = *(p_imaging.calibrationMode());
+    if ( calib == ISMRMRD::calibrationModeType::interleaved )
+    {
+        workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_interleaved;
+        GADGET_CONDITION_MSG(verboseMode_, "Calibration mode is interleaved");
+
+        if ( p_imaging.interleavingDimension().present() )
+        {
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::phase )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_Phase;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::repetition )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_Repetition;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::average )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_Average;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::contrast )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_Contrast;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::other )
+            {
+                workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_other1;
+            }
+
+            GADGET_CONDITION_MSG(verboseMode_, "InterleaveDim is " << gtPlus_util_.getISMRMRDDimName(workOrder_.InterleaveDim_));
+        }
+    }
+
+    if ( calib == ISMRMRD::calibrationModeType::embedded )
+    {
+        workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_embedded;
+        GADGET_CONDITION_MSG(verboseMode_, "Calibration mode is embedded");
+    }
+
+    if ( calib == ISMRMRD::calibrationModeType::separate )
+    {
+        workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_separate;
+        GADGET_CONDITION_MSG(verboseMode_, "Calibration mode is separate");
+    }
+
+    if ( calib == ISMRMRD::calibrationModeType::external )
+    {
+        workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_external;
+    }
+    // calibration mode 'other' is currently mapped onto interleaved mode along phase with acceFactorE1_ forced to 2 (both branches below do the same)
+    if ( calib == ISMRMRD::calibrationModeType::other && workOrder_.acceFactorE1_==1 && workOrder_.acceFactorE2_==1 )
+    {
+        // workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_noacceleration;
+        workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_interleaved;
+        workOrder_.acceFactorE1_=2;
+        workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_Phase;
+    }
+
+    if ( calib == ISMRMRD::calibrationModeType::other && (workOrder_.acceFactorE1_>1 || workOrder_.acceFactorE2_>1) )
+    {
+        //workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_other;
+        workOrder_.CalibMode_ = Gadgetron::gtPlus::ISMRMRD_interleaved;
+        workOrder_.acceFactorE1_=2;
+        workOrder_.InterleaveDim_ = Gadgetron::gtPlus::DIM_Phase;
+    }
+
+    // ---------------------------------------------------------------------------------------------------------
+
+    // find out the encoding space 
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    matrix_size_encoding_[0] = e_space.matrixSize().x();
+    matrix_size_encoding_[1] = e_space.matrixSize().y();
+    matrix_size_encoding_[2] = e_space.matrixSize().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Encoding matrix size: " << matrix_size_encoding_[0] << " " << matrix_size_encoding_[1] << " " << matrix_size_encoding_[2]);
+
+    field_of_view_encoding_[0] = e_space.fieldOfView_mm().x();
+    field_of_view_encoding_[1] = e_space.fieldOfView_mm().y();
+    field_of_view_encoding_[2] = e_space.fieldOfView_mm().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Encoding field_of_view : " << field_of_view_encoding_[0] << " " << field_of_view_encoding_[1] << " " << field_of_view_encoding_[2]);
+
+    // find the recon space
+    matrix_size_recon_[0] = r_space.matrixSize().x();
+    matrix_size_recon_[1] = r_space.matrixSize().y();
+    matrix_size_recon_[2] = r_space.matrixSize().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Recon matrix size : " << matrix_size_recon_[0] << " " << matrix_size_recon_[1] << " " << matrix_size_recon_[2]);
+
+    field_of_view_recon_[0] = r_space.fieldOfView_mm().x();
+    field_of_view_recon_[1] = r_space.fieldOfView_mm().y();
+    field_of_view_recon_[2] = r_space.fieldOfView_mm().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Recon field_of_view :  " << field_of_view_recon_[0] << " " << field_of_view_recon_[1] << " " << field_of_view_recon_[2]);
+
+    // ---------------------------------------------------------------------------------------------------------
+    // handle partial fourier
+    //workOrder_.kSpaceCenterEncode1_ = e_limits.kspace_encoding_step_1().get().center();
+    //GADGET_CONDITION_MSG(verboseMode_, "kSpaceCenterEncode1_ is " << workOrder_.kSpaceCenterEncode1_);
+
+    //workOrder_.kSpaceCenterEncode2_ = e_limits.kspace_encoding_step_2().get().center();
+    //GADGET_CONDITION_MSG(verboseMode_, "kSpaceCenterEncode2_ is " << workOrder_.kSpaceCenterEncode2_);
+    // both limits are optional in the header: fall back to 0 instead of calling get() on an absent element
+    workOrder_.kSpaceMaxEncode1_ = e_limits.kspace_encoding_step_1().present() ? e_limits.kspace_encoding_step_1().get().maximum() : 0;
+    GADGET_CONDITION_MSG(verboseMode_, "kSpaceMaxEncode1_ is " << workOrder_.kSpaceMaxEncode1_);
+
+    workOrder_.kSpaceMaxEncode2_ = e_limits.kspace_encoding_step_2().present() ? e_limits.kspace_encoding_step_2().get().maximum() : 0;
+    GADGET_CONDITION_MSG(verboseMode_, "kSpaceMaxEncode2_ is " << workOrder_.kSpaceMaxEncode2_);
+
+    space_size_[1] = workOrder_.kSpaceMaxEncode1_+1;
+    space_size_[2] = workOrder_.kSpaceMaxEncode2_+1;
+
+    // if partial fourier or asymmetric echo is used, correct the kSpaceCenter
+    //if ( space_size_[1]-matrix_size_encoding_[1] > workOrder_.acceFactorE1_ )
+    //{
+    //    GADGET_CONDITION_MSG(verboseMode_, "Partial fourier along E1 ... ");
+    //    //if ( GT_ABS(matrix_size_encoding_[1]/workOrder_.acceFactorE1_ - std::floor(matrix_size_encoding_[1]/workOrder_.acceFactorE1_)) > FLT_EPSILON )
+    //    //{
+    //    //    GADGET_WARN_MSG("matrix_size_[1] is not multiplied by acceFactorE1_ ... ");
+    //    //    matrix_size_encoding_[1] = (std::floor(matrix_size_encoding_[1]/workOrder_.acceFactorE1_)+1)*workOrder_.acceFactorE1_;
+    //    //}
+
+    //    if ( 2*workOrder_.kSpaceCenterEncode1_ > (matrix_size_encoding_[1]+1) )
+    //    {
+    //        space_matrix_offset_E1_ = 0;
+
+    //        workOrder_.start_E2_ = 0;
+    //        workOrder_.end_E2_ = matrix_size_encoding_[1];
+    //    }
+    //    else
+    //    {
+    //        space_matrix_offset_E1_ = space_size_[1] - matrix_size_encoding_[1];
+
+    //        workOrder_.start_E1_ = space_matrix_offset_E1_;
+    //        workOrder_.end_E1_ = workOrder_.kSpaceMaxEncode1_;
+    //    }
+    //}
+    //else
+    //{
+    //    space_matrix_offset_E1_ = 0;
+    //}
+
+    //if ( space_size_[2]-matrix_size_encoding_[2] > workOrder_.acceFactorE2_ )
+    //{
+    //    GADGET_CONDITION_MSG(verboseMode_, "Partial fourier along E2 ... ");
+    //    //if ( GT_ABS(matrix_size_encoding_[2]/workOrder_.acceFactorE2_ - std::floor(matrix_size_encoding_[2]/workOrder_.acceFactorE2_)) > FLT_EPSILON )
+    //    //{
+    //    //    GADGET_WARN_MSG("matrix_size_[2] is not multiplied by acceFactorE2_ ... ");
+    //    //    matrix_size_[2] = (std::floor(matrix_size_[2]/workOrder_.acceFactorE2_)+1)*workOrder_.acceFactorE2_;
+    //    //}
+
+    //    if ( 2*workOrder_.kSpaceCenterEncode2_ > (matrix_size_encoding_[2]+1) )
+    //    {
+    //        space_matrix_offset_E2_ = 0;
+
+    //        workOrder_.start_E2_ = 0;
+    //        workOrder_.end_E2_ = matrix_size_encoding_[2];
+    //    }
+    //    else
+    //    {
+    //        space_matrix_offset_E2_ = space_size_[2] - matrix_size_encoding_[2];
+
+    //        workOrder_.start_E2_ = space_matrix_offset_E2_;
+    //        workOrder_.end_E2_ = workOrder_.kSpaceMaxEncode2_;
+    //    }
+    //}
+    //else
+    //{
+    //    space_matrix_offset_E2_ = 0;
+    //}
+
+    // ---------------------------------------------------------------------------------------------------------
+    // encoding limits
+
+    meas_max_ro_ = e_space.matrixSize().x()/2;
+
+    if (e_limits.kspace_encoding_step_1().present()) 
+    {
+        meas_max_idx_.kspace_encode_step_1 = e_limits.kspace_encoding_step_1().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.kspace_encode_step_1 = 0;
+        std::cout << "Setting number of kspace_encode_step_1 to 0" << std::endl;
+        return GADGET_FAIL;
+    }
+
+    space_size_[0] = meas_max_ro_;
+    // set, phase, contrast and average store maximum()-1, clamped at 0; slice and repetition store maximum() unchanged
+    if (e_limits.set().present())
+    {
+        if ( e_limits.set().get().maximum() > 0 )
+            meas_max_idx_.set = e_limits.set().get().maximum() - 1;
+        else
+            meas_max_idx_.set = 0;
+
+        if ( meas_max_idx_.set < 0 ) meas_max_idx_.set = 0;
+    }
+    else
+    {
+        meas_max_idx_.set = 0;
+    }
+
+    if (e_limits.phase().present())
+    {
+        if ( e_limits.phase().get().maximum() > 0 )
+            meas_max_idx_.phase = e_limits.phase().get().maximum()-1;
+        else
+            meas_max_idx_.phase = 0;
+
+        if ( meas_max_idx_.phase < 0 ) meas_max_idx_.phase = 0;
+    }
+    else
+    {
+        meas_max_idx_.phase = 0;
+    }
+
+    if (e_limits.kspace_encoding_step_2().present())
+    {
+        meas_max_idx_.kspace_encode_step_2 = e_limits.kspace_encoding_step_2().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.kspace_encode_step_2 = 0;
+    }
+
+    if (e_limits.contrast().present())
+    {
+        if ( e_limits.contrast().get().maximum() > 0 )
+            meas_max_idx_.contrast = e_limits.contrast().get().maximum()-1;
+        else
+            meas_max_idx_.contrast = 0;
+
+        if ( meas_max_idx_.contrast < 0 ) meas_max_idx_.contrast = 0;
+    }
+    else
+    {
+        meas_max_idx_.contrast = 0;
+    }
+
+    if (e_limits.slice().present())
+    {
+        meas_max_idx_.slice = e_limits.slice().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.slice = 0;
+    }
+
+    if (e_limits.repetition().present())
+    {
+        meas_max_idx_.repetition = e_limits.repetition().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.repetition = 0;
+    }
+
+    if (e_limits.average().present())
+    {   // same guard as set/phase/contrast: a maximum of 0 must not become -1
+        meas_max_idx_.average = (e_limits.average().get().maximum() > 0) ? e_limits.average().get().maximum()-1 : 0;
+    }
+    else
+    {
+        meas_max_idx_.average = 0;
+    }
+
+    if (e_limits.segment().present())
+    {
+        // meas_max_idx_.segment = e_limits.segment().get().maximum()-1;
+        meas_max_idx_.segment = 0;
+    }
+    else
+    {
+        meas_max_idx_.segment = 0;
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusAccumulatorWorkOrderTriggerGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{   // Classifies one readout, stores it in the matching buffer(s), runs the trigger logic and releases m1; returns GADGET_OK or GADGET_FAIL.
+    // logic to control whether to store kspace and ref data
+    bool bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther, bIsNavigator, bIsRTFeedback, bIsHPFeedback, bIsDummyScan;
+    if ( !checkStatus(m1->getObjectPtr()->flags, m1->getObjectPtr()->number_of_samples, 
+            bIsKSpace, bIsRef, bIsNoise, bIsPhaseCorr, bIsReflect, bIsOther,
+            bIsNavigator, bIsRTFeedback, bIsHPFeedback, bIsDummyScan) )
+    {
+        GADGET_DEBUG1("Failed check readout status\n");
+        return GADGET_FAIL;
+    }
+
+    size_t scan_counter = m1->getObjectPtr()->scan_counter;
+
+    if ( scan_counter%1000 == 0 )
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "--> receive scan : " << scan_counter);
+    }
+
+    // combine the segments: every readout is stored under segment index 0
+    m1->getObjectPtr()->idx.segment = 0;
+    // navigator / feedback / dummy readouts that carry neither kspace nor ref data are dropped
+    if ( (bIsNavigator || bIsRTFeedback || bIsHPFeedback || bIsDummyScan) && !bIsKSpace && !bIsRef )
+    {
+        m1->release();
+        return GADGET_OK;
+    }
+    // one-time setup on the first kspace readout with a positive center_sample: readout range, kspace centers, partial fourier offsets
+    if ( !bIsRTFeedback && bIsKSpace && first_kspace_scan_ && m1->getObjectPtr()->center_sample>0 )
+    {
+        if ( (workOrder_.start_RO_<0) && (workOrder_.end_RO_<0) )
+        {
+            gtPlus_util_.findStartEndROAfterZeroFilling(m1->getObjectPtr()->center_sample, m1->getObjectPtr()->number_of_samples, workOrder_.start_RO_, workOrder_.end_RO_);
+
+            GADGET_CONDITION_MSG(verboseMode_, "start_RO : " << workOrder_.start_RO_);
+            GADGET_CONDITION_MSG(verboseMode_, "end_RO : " << workOrder_.end_RO_);
+
+            workOrder_.kSpaceCenterRO_ = m1->getObjectPtr()->center_sample;
+            workOrder_.kSpaceMaxRO_ = m1->getObjectPtr()->number_of_samples;
+        }
+        // the kspace centers along E1/E2 are taken from the user counters 5 and 6 of the readout header
+        workOrder_.kSpaceCenterEncode1_ = m1->getObjectPtr()->idx.user[5];
+        GADGET_CONDITION_MSG(verboseMode_, "kSpaceCenterEncode1_ is " << workOrder_.kSpaceCenterEncode1_);
+
+        workOrder_.kSpaceCenterEncode2_ = m1->getObjectPtr()->idx.user[6];
+        GADGET_CONDITION_MSG(verboseMode_, "kSpaceCenterEncode2_ is " << workOrder_.kSpaceCenterEncode2_);
+
+        // if partial fourier or asymmetric echo is used, correct the kSpaceCenter
+        if ( space_size_[1]-matrix_size_encoding_[1] > workOrder_.acceFactorE1_ )
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "Partial fourier along E1 ... ");
+
+            if ( 2*workOrder_.kSpaceCenterEncode1_ > (matrix_size_encoding_[1]+1) )
+            {
+                space_matrix_offset_E1_ = 0;
+                // E1 range (the original wrote start_E2_/end_E2_ here, clobbering the E2 range with E1 values)
+                workOrder_.start_E1_ = 0;
+                workOrder_.end_E1_ = matrix_size_encoding_[1];
+            }
+            else
+            {
+                space_matrix_offset_E1_ = space_size_[1] - matrix_size_encoding_[1];
+
+                workOrder_.start_E1_ = space_matrix_offset_E1_;
+                workOrder_.end_E1_ = workOrder_.kSpaceMaxEncode1_;
+            }
+        }
+        else
+        {
+            space_matrix_offset_E1_ = 0;
+        }
+
+        if ( space_size_[2]-matrix_size_encoding_[2] > workOrder_.acceFactorE2_ )
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "Partial fourier along E2 ... ");
+
+            if ( 2*workOrder_.kSpaceCenterEncode2_ > (matrix_size_encoding_[2]+1) )
+            {
+                space_matrix_offset_E2_ = 0;
+
+                workOrder_.start_E2_ = 0;
+                workOrder_.end_E2_ = matrix_size_encoding_[2];
+            }
+            else
+            {
+                space_matrix_offset_E2_ = space_size_[2] - matrix_size_encoding_[2];
+
+                workOrder_.start_E2_ = space_matrix_offset_E2_;
+                workOrder_.end_E2_ = workOrder_.kSpaceMaxEncode2_;
+            }
+        }
+        else
+        {
+            space_matrix_offset_E2_ = 0;
+        }
+
+        first_kspace_scan_ = false;
+    }
+
+    // hack for UCL data
+    //if ( bIsKSpace && bIsRef )
+    //{
+    //    if ( m1->getObjectPtr()->idx.kspace_encode_step_1%2 == 1 )
+    //    {
+    //        bIsKSpace = false;
+    //    }
+    //}
+
+    // store kspace read out
+    if ( bIsKSpace )
+    {
+        if ( !storeImageData(m1, m2, bIsReflect) )
+        {
+            GADGET_DEBUG1("Failed storeImageData(m1, m2, bIsReflect)\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // store ref read out
+    if ( bIsRef )
+    {
+        if ( !storeRefData(m1, m2, bIsReflect) )
+        {
+            GADGET_DEBUG1("Failed storeRefData(m1, m2, bIsReflect)\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // store phaseCorr read out
+    if ( bIsPhaseCorr )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+        // header and data are copied into the buffer item
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+        phaseCorrBuffer_.push_back(item);
+    }
+
+    // store noise read out
+    if ( bIsNoise )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+        noiseBuffer_.push_back(item);
+    }
+
+    // store other read out
+    if ( bIsOther )
+    {
+        ISMRMRD::AcquisitionHeader* pMDH = m1->getObjectPtr();
+        hoNDArray< ValueType >* pRefLine = m2->getObjectPtr();
+        // optionally re-index the readout along the matching dimension: previous header's value + 1
+        if ( other_kspace_matching_Dim_ != DIM_NONE )
+        {
+            if ( prev_acq_header_.measurement_uid != 0 )
+            {
+                size_t v = getDimValue(prev_acq_header_, other_kspace_matching_Dim_);
+                setDimValue(*pMDH, other_kspace_matching_Dim_, v+1);
+            }
+        }
+
+        ReadOutBuffer item;
+        item.acqHead_ = *pMDH;
+        item.data_ = *pRefLine;
+        item.isReflect_ = bIsReflect;
+        otherBuffer_.push_back(item);
+    }
+
+    // perform triggering
+    if ( !triggerWorkOrder(m1, false, bIsKSpace) )
+    {
+        GADGET_DEBUG1("Failed triggerWorkOrder(m1)\n");
+        return GADGET_FAIL;
+    }
+    // m1 is released here on success; the GADGET_FAIL paths above return without releasing it
+    m1->release();
+    return GADGET_OK;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::needTriggerWorkOrderAllInClose()
+{   // Decides whether close() still has to trigger everything, based on what process() already triggered and the current calibration mode.
+    // already triggered for last acquisition
+    if ( triggered_in_process_last_acq_ ) return false;
+    // (triggered_in_process_last_acq_ is known to be false from here on, so the second operand below is always true)
+    // if never triggered in process(...)
+    if ( !triggered_in_process_ && !triggered_in_process_last_acq_ ) return true;
+    // otherwise: true only when neither trigger dimension is configured for the current calibration mode
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        return ((interleaved_triggerDim1_==DIM_NONE)&&(interleaved_triggerDim2_==DIM_NONE));
+    }
+    else if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+    {
+        return ((embedded_triggerDim1_==DIM_NONE)&&(embedded_triggerDim2_==DIM_NONE));
+    }
+    else if ( (workOrder_.CalibMode_ == ISMRMRD_separate) 
+            || (workOrder_.CalibMode_ == ISMRMRD_external) )
+    {
+        return ((separate_triggerDim1_==DIM_NONE)&&(separate_triggerDim2_==DIM_NONE));
+    }
+    else if ( (workOrder_.CalibMode_ == ISMRMRD_noacceleration) )
+    {
+        return ((noacceleration_triggerDim1_==DIM_NONE)&&(noacceleration_triggerDim2_==DIM_NONE));
+    }
+    else
+    {
+        GADGET_ERROR_MSG("Unsupported calibration mode : " << workOrder_.CalibMode_);
+        return true;
+    }
+    // not reached: every branch above returns
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, bool inClose, bool isKSpace)
+{   // Dispatches on workOrder_.CalibMode_ and forwards the trigger settings of that mode; returns false for an unsupported mode.
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        if ( inClose )
+        {   // called from close(): m1 is not used on this path
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(interleaved_triggerDim1_, interleaved_triggerDim2_, interleaved_numOfKSpace_triggerDim1_));
+        }
+        else
+        {   // called per readout: only kspace readouts drive the trigger logic
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, interleaved_triggerDim1_, interleaved_triggerDim2_, interleaved_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+    {
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(embedded_triggerDim1_, embedded_triggerDim2_, embedded_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, embedded_triggerDim1_, embedded_triggerDim2_, embedded_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else if ( (workOrder_.CalibMode_ == ISMRMRD_separate) 
+            || (workOrder_.CalibMode_ == ISMRMRD_external) )
+    {   // separate and external calibration share the separate_* trigger settings
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(separate_triggerDim1_, separate_triggerDim2_, separate_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, separate_triggerDim1_, separate_triggerDim2_, separate_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else if ( workOrder_.CalibMode_ == ISMRMRD_noacceleration )
+    {
+        if ( inClose )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(noacceleration_triggerDim1_, noacceleration_triggerDim2_, noacceleration_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(triggerWorkOrder(m1, noacceleration_triggerDim1_, noacceleration_triggerDim2_, noacceleration_numOfKSpace_triggerDim1_));
+            }
+        }
+    }
+    else
+    {
+        GADGET_ERROR_MSG("Unsupported calibration mode : " << workOrder_.CalibMode_);
+        return false;
+    }
+    // reached when the selected call succeeded or was skipped (GADGET_CHECK_RETURN_FALSE presumably returns false on failure -- confirm)
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+resetTriggerStatus(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1)
+{ // Tells the trigger logic whether its counting state should be reset for readout m1; m1 is not inspected at present.
+    // return !gtPlusISMRMRDReconUtil<ValueType>().hasIdenticalGeometryISMRMRD(*(m1->getObjectPtr()), prev_acq_header_);
+    return false; // the geometry comparison above is commented out, so a reset is never requested
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, // header of the readout being processed
+            Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1_, // first trigger dimension; DIM_NONE means not used
+            Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2_, // second trigger dimension; DIM_NONE means not used
+            int numOfKSpace_triggerDim1_) // if > 0 : number of Dim1 changes to count before the first Dim1 trigger
+{ // Per-readout trigger check : compares the trigger-dimension values of m1 with the previously recorded ones and calls a triggerBy* routine when they changed. Returns false only when a GADGET_CHECK_RETURN_FALSE call fails or, on the last readout, the calibration mode is unsupported.
+    //bool is_first_acq_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_FIRST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+    //if ( !is_first_acq_in_slice ) return true;
+    // is_last_acq : flagged last-in-repetition or last-in-slice AND repetition/slice/set/contrast/phase all equal to meas_max_idx_
+    bool is_last_acq = ((ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_REPETITION).isSet(m1->getObjectPtr()->flags)) 
+                    || (ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags)) ) 
+                    && (m1->getObjectPtr()->idx.repetition==meas_max_idx_.repetition)
+                    && (m1->getObjectPtr()->idx.slice==meas_max_idx_.slice)
+                    && (m1->getObjectPtr()->idx.set==meas_max_idx_.set)
+                    && (m1->getObjectPtr()->idx.contrast==meas_max_idx_.contrast)
+                    && (m1->getObjectPtr()->idx.phase==meas_max_idx_.phase);
+    // values of the two trigger dimensions for this readout (stored in members, also read by triggerWorkOrderLastCountInClose)
+    curr_dim1_ = getDimValue(*(m1->getObjectPtr()), triggerDim1_);
+    curr_dim2_ = getDimValue(*(m1->getObjectPtr()), triggerDim2_);
+    // last readout of the measurement with at least one trigger dimension in use : trigger now through the in-close routine
+    if ( is_last_acq 
+            && ( (triggerDim1_!=DIM_NONE) || (triggerDim2_!=DIM_NONE) ) )
+    {
+        GADGET_CONDITION_MSG(true, "Last scan in measurement - " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_ << " - " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+        // both trigger values equal to 0 : log it and return without triggering
+        if ( curr_dim1_==0 && curr_dim2_== 0 )
+        {
+            GADGET_CONDITION_MSG(true, "Last scan in measurement - not trigger ... ");
+            return true;
+        }
+        // record that the final trigger is issued from here; the flag is set before the call below is made
+        triggered_in_process_last_acq_ = true;
+        GADGET_CONDITION_MSG(true, "Last scan in measurement - triggered_in_process_last_acq_ : " << triggered_in_process_last_acq_);
+        // the member trigger settings of the active calibration mode are passed, not the function arguments
+        if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(interleaved_triggerDim1_, interleaved_triggerDim2_, interleaved_numOfKSpace_triggerDim1_));
+        }
+        else if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(embedded_triggerDim1_, embedded_triggerDim2_, embedded_numOfKSpace_triggerDim1_));
+        }
+        else if ( (workOrder_.CalibMode_ == ISMRMRD_separate) 
+                || (workOrder_.CalibMode_ == ISMRMRD_external) )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(separate_triggerDim1_, separate_triggerDim2_, separate_numOfKSpace_triggerDim1_));
+        }
+        else if ( workOrder_.CalibMode_ == ISMRMRD_noacceleration )
+        {
+            GADGET_CHECK_RETURN_FALSE(triggerWorkOrderLastCountInClose(noacceleration_triggerDim1_, noacceleration_triggerDim2_, noacceleration_numOfKSpace_triggerDim1_));
+        }
+        else
+        {
+            triggered_in_process_last_acq_ = false; // nothing was triggered for an unsupported mode
+            GADGET_ERROR_MSG("Unsupported calibration mode : " << workOrder_.CalibMode_);
+            return false;
+        }
+
+        return true;
+    }
+    // -1 in prev_dim1_ / prev_dim2_ is treated as "no previous value recorded" : seed the history with the current values
+    if ( prev_dim1_ == -1 )
+    {
+        prev_dim1_ = curr_dim1_;
+        count_dim1_ = 0;
+        GADGET_CONDITION_MSG(verboseMode_, "Current Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_);
+    }
+
+    if ( prev_dim2_ == -1 )
+    {
+        prev_dim2_ = curr_dim2_;
+        count_dim1_ = 0; // NOTE(review): the Dim1 counter is reset here too; this function uses no separate Dim2 counter - confirm this is intended
+        GADGET_CONDITION_MSG(verboseMode_, "Current Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+    }
+    // measurement_uid == 0 is used as "no previous header stored yet"
+    if ( prev_acq_header_.measurement_uid == 0 ) prev_acq_header_ = *(m1->getObjectPtr());
+    // flags handed to the triggerBy* calls; they stay false unless a branch below overrides them
+    bool workFlow_BufferKernel_ = false;
+    bool workFlow_use_BufferedKernel_ = false;
+    // count_dim1_ counts how many times the Dim1 value changed between consecutive readouts
+    if ( prev_dim1_ != curr_dim1_ )
+    {
+        count_dim1_++;
+        GADGET_CONDITION_MSG(verboseMode_, "Current Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_);
+        GADGET_CONDITION_MSG(verboseMode_, "Current Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+        GADGET_CONDITION_MSG(verboseMode_, "count_dim1_ : " << count_dim1_);
+    }
+    // no trigger dimension in use : only update the history, never trigger here
+    if ( (triggerDim1_==DIM_NONE) && (triggerDim2_==DIM_NONE) )
+    {
+        prev_dim1_ = curr_dim1_;
+        prev_dim2_ = curr_dim2_;
+        prev_acq_header_ = *(m1->getObjectPtr());
+        return true;
+    }
+    // threshold for count_dim1_; in interleaved mode it is multiplied by both acceleration factors
+    int numOfAcquiredKSpaceForTriggerDim1 = numOfKSpace_triggerDim1_;
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        numOfAcquiredKSpaceForTriggerDim1 = numOfKSpace_triggerDim1_ * workOrder_.acceFactorE1_ * workOrder_.acceFactorE2_;
+    }
+
+    // only Dim2 in use : trigger whenever the Dim2 is changed
+    if (  triggerDim1_==DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        prev_dim1_ = curr_dim1_;
+        prev_acq_header_ = *(m1->getObjectPtr());
+        // keep the old Dim2 value locally, because the trigger is issued for the value that just ended
+        int prev_dim2_local_ = prev_dim2_;
+        prev_dim2_ = curr_dim2_;
+
+        if ( curr_dim2_!= prev_dim2_local_ )
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+            GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            triggered_in_process_ = true;
+        }
+    }
+
+    // only Dim1 in use : trigger whenever the Dim1 is changed
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_==DIM_NONE  )
+    {
+        prev_dim2_ = curr_dim2_;
+        // keep the old Dim1 value locally, because the trigger is issued for the value that just ended
+        int prev_dim1_local_ = prev_dim1_;
+        prev_dim1_ = curr_dim1_;
+
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( curr_dim1_!= prev_dim1_local_ )
+            {
+                if ( resetTriggerStatus(m1) ) // requested reset : restart counting and trigger the previous Dim1 value
+                {
+                    count_dim1_ = 0;
+                    GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+                // threshold just reached : trigger everything with Dim1 <= previous value, with workFlow_BufferKernel_ set
+                if ( count_dim1_ == numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+
+                    workFlow_BufferKernel_ = true;
+                    workFlow_use_BufferedKernel_ = false;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimLessEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+                else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 ) // past the threshold : trigger the previous Dim1 value only, with workFlow_use_BufferedKernel_ set
+                {
+                    GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+        else
+        {
+            if ( curr_dim1_!= prev_dim1_local_ ) // no counting requested : trigger on every Dim1 change, both workflow flags false
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                triggered_in_process_ = true;
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+    }
+    // both trigger dimensions in use
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        int prev_dim1_local_ = prev_dim1_;
+        int prev_dim2_local_ = prev_dim2_;
+        // update the history first; the triggers below are issued for the previous values
+        prev_dim1_ = curr_dim1_;
+        prev_dim2_ = curr_dim2_;
+
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( (curr_dim2_!=prev_dim2_local_) || resetTriggerStatus(m1) ) // Dim2 changed (or reset requested) : restart the Dim1 counting and trigger the previous (Dim1, Dim2) pair
+            {
+                count_dim1_ = 0;
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                    << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                workFlow_BufferKernel_ = false;
+                workFlow_use_BufferedKernel_ = true;
+
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+
+                triggered_in_process_ = true;
+            }
+            // Dim1 changed : same threshold logic as in the Dim1-only case, restricted to the previous Dim2 value
+            if (curr_dim1_!=prev_dim1_local_)
+            {
+                if ( count_dim1_ == numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                        << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                    workFlow_BufferKernel_ = true;
+                    workFlow_use_BufferedKernel_ = false;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDim1LessEqualDim2Equal(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+                else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 )
+                {
+                    GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_ 
+                        << "; Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+
+                    workFlow_BufferKernel_ = false;
+                    workFlow_use_BufferedKernel_ = true;
+                    GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                    triggered_in_process_ = true;
+                }
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+        else
+        {
+            // no counting requested : Dim1 changes are ignored, trigger whenever the Dim2 is changed
+            if ( curr_dim2_!= prev_dim2_local_ )
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+                triggered_in_process_ = true;
+            }
+
+            prev_acq_header_ = *(m1->getObjectPtr());
+        }
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerWorkOrderLastCountInClose(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1_, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2_, int numOfKSpace_triggerDim1_)
+{ // Final trigger for the given trigger dimensions, based on the member state curr_dim1_/curr_dim2_, prev_dim1_/prev_dim2_ and count_dim1_; unlike the per-readout check it triggers without waiting for a value change. Returns false only when a GADGET_CHECK_RETURN_FALSE call fails.
+    GADGET_CONDITION_MSG(verboseMode_, "Current Dim1 InClose : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << curr_dim1_);
+    GADGET_CONDITION_MSG(verboseMode_, "Current Dim2 InClose : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << curr_dim2_);
+    // account for a Dim1 change that has not been counted yet
+    if ( prev_dim1_ != curr_dim1_ )
+    {
+        count_dim1_++;
+    }
+    // flags handed to the triggerBy* calls; they stay false unless a branch below overrides them
+    bool workFlow_BufferKernel_ = false;
+    bool workFlow_use_BufferedKernel_ = false;
+    // threshold for count_dim1_; in interleaved mode it is multiplied by both acceleration factors
+    int numOfAcquiredKSpaceForTriggerDim1 = numOfKSpace_triggerDim1_;
+    if ( workOrder_.CalibMode_ == ISMRMRD_interleaved )
+    {
+        numOfAcquiredKSpaceForTriggerDim1 = numOfKSpace_triggerDim1_ * workOrder_.acceFactorE1_ * workOrder_.acceFactorE2_;
+    }
+    // the triggers below use the previous values; the history is then moved to the current ones
+    int prev_dim1_local_ = prev_dim1_;
+    int prev_dim2_local_ = prev_dim2_;
+
+    prev_dim1_ = curr_dim1_;
+    prev_dim2_ = curr_dim2_;
+
+    // only Dim2 in use : trigger the previous Dim2 value unconditionally
+    if (  triggerDim1_==DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+        GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+    }
+
+    // only Dim1 in use : trigger depending on whether the count threshold has been passed
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_==DIM_NONE  )
+    {
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( count_dim1_ <= numOfAcquiredKSpaceForTriggerDim1 ) // threshold not passed : trigger everything with Dim1 <= previous value, with workFlow_BufferKernel_ set
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " <= " << prev_dim1_local_);
+                workFlow_BufferKernel_ = true;
+                workFlow_use_BufferedKernel_ = false;
+                GADGET_CHECK_RETURN_FALSE(triggerByDimLessEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+            else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 ) // threshold passed : trigger the previous Dim1 value only, with workFlow_use_BufferedKernel_ set
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+                workFlow_BufferKernel_ = false;
+                workFlow_use_BufferedKernel_ = true;
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+        }
+        else
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+            GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+        }
+    }
+    // both trigger dimensions in use
+    if (  triggerDim1_!=DIM_NONE && triggerDim2_!=DIM_NONE  )
+    {
+        if ( numOfKSpace_triggerDim1_ > 0 )
+        {
+            if ( count_dim1_ <= numOfAcquiredKSpaceForTriggerDim1 ) // no more data will be available, so have to do the recon
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " <= " << prev_dim1_local_);
+                workFlow_BufferKernel_ = true;
+                workFlow_use_BufferedKernel_ = false;
+                GADGET_CHECK_RETURN_FALSE(triggerByDim1LessEqualDim2Equal(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+            else if ( count_dim1_ > numOfAcquiredKSpaceForTriggerDim1 ) // threshold passed : trigger the previous (Dim1, Dim2) pair only
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim1 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim1_ ) << " = " << prev_dim1_local_);
+                workFlow_BufferKernel_ = false;
+                workFlow_use_BufferedKernel_ = true;
+                GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim1_, prev_dim1_local_, triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+            }
+        }
+        else
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "Trigger Dim2 : " << gtPlusISMRMRDReconUtil<ValueType>().getISMRMRDDimName(triggerDim2_ ) << " = " << prev_dim2_local_);
+            GADGET_CHECK_RETURN_FALSE(triggerByDimEqual(triggerDim2_, prev_dim2_local_, workFlow_BufferKernel_, workFlow_use_BufferedKernel_));
+        }
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::checkStatus(uint64_t flag, int samples, 
+    bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther,
+    bool& bIsNavigator, bool& bIsRTFeedback, bool& bIsHPFeedback, bool& bIsDummyScan)
+{
+    // Classify one readout from its ISMRMRD acquisition flags and its number of samples.
+    //
+    // Inputs
+    //   flag    : acquisition flag bits of the readout
+    //   samples : number of samples of the readout; compared with meas_max_ro_
+    //
+    // Outputs taken directly from the flag bits
+    //   bIsNoise      : ACQ_IS_NOISE_MEASUREMENT
+    //   bIsReflect    : ACQ_IS_REVERSE
+    //   bIsPhaseCorr  : ACQ_IS_PHASECORR_DATA
+    //   bIsNavigator  : ACQ_IS_NAVIGATION_DATA
+    //   bIsRTFeedback : ACQ_IS_RTFEEDBACK_DATA
+    //   bIsHPFeedback : ACQ_IS_HPFEEDBACK_DATA
+    //   bIsDummyScan  : ACQ_IS_DUMMYSCAN_DATA
+    //
+    // Outputs derived from the flags, the calibration mode and the readout length
+    //   bIsKSpace : readout is imaging data
+    //   bIsRef    : readout is reference (calibration) data
+    //   bIsOther  : readout is neither of the two, e.g. AIF
+    //
+    // The function always returns true.
+
+    const bool calibOnly = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_PARALLEL_CALIBRATION).isSet(flag);
+    const bool calibAndImaging = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_PARALLEL_CALIBRATION_AND_IMAGING).isSet(flag);
+
+    bIsNoise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(flag);
+    bIsReflect = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_REVERSE).isSet(flag);
+    bIsPhaseCorr = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_PHASECORR_DATA).isSet(flag);
+    bIsNavigator = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NAVIGATION_DATA).isSet(flag);
+    bIsRTFeedback = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_RTFEEDBACK_DATA).isSet(flag);
+    bIsHPFeedback = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_HPFEEDBACK_DATA).isSet(flag);
+    bIsDummyScan = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_DUMMYSCAN_DATA).isSet(flag);
+
+    // nothing is classified yet
+    bIsKSpace = false;
+    bIsRef = false;
+    bIsOther = false;
+
+    // noise and dummy scans stay unclassified
+    if ( bIsNoise || bIsDummyScan )
+    {
+        return true;
+    }
+
+    // store other data, e.g. AIF
+    // (noted as "only for tpat", but the test itself does not depend on the calibration mode)
+    // a readout carrying no calibration flag whose length differs from meas_max_ro_ overrides
+    // whatever the mode would decide, so it is settled before looking at the mode
+    if ( !calibOnly && !calibAndImaging && (samples != meas_max_ro_) )
+    {
+        bIsOther = true;
+        return true;
+    }
+
+    if ( (workOrder_.CalibMode_==ISMRMRD_noacceleration) || (workOrder_.CalibMode_==ISMRMRD_interleaved) )
+    {
+        // without acceleration and in interleaved mode, only store the image data
+        bIsKSpace = true;
+    }
+    else if ( workOrder_.CalibMode_==ISMRMRD_embedded )
+    {
+        // in embedded, kspace stores only the undersampled lines
+        // ref stores all lines used for references
+        //
+        //   calibOnly  calibAndImaging  ->  bIsKSpace  bIsRef
+        //   yes        no                   false      true
+        //   no         yes                  true       true
+        //   yes        yes                  true       true
+        //   no         no                   true       false
+        if ( calibOnly && !calibAndImaging )
+        {
+            bIsRef = true;
+        }
+        else
+        {
+            bIsKSpace = true;
+            bIsRef = calibAndImaging;
+        }
+    }
+    else if ( (workOrder_.CalibMode_==ISMRMRD_separate) || (workOrder_.CalibMode_==ISMRMRD_external) )
+    {
+        // in separate mode, a readout is either reference or imaging data
+        bIsRef = calibOnly;
+        bIsKSpace = !calibOnly;
+    }
+    // any other calibration mode leaves bIsKSpace and bIsRef false
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect)
+{ // Copies one imaging readout (header m1, samples of all channels in m2) into workOrder_.data_, mirrored along the readout when isReflect is set; the buffers are allocated on the first call.
+    try // any exception raised below is reported as a failure (return false)
+    {
+        size_t ii;
+        size_t samples =  m1->getObjectPtr()->number_of_samples;
+        ISMRMRD::EncodingCounters idx = m1->getObjectPtr()->idx;
+
+        idx.segment = 0; // combine the segments
+        // empty image buffer : derive the buffer size and allocate everything that belongs to it
+        if ( workOrder_.data_.get_number_of_elements() <= 0 )
+        {
+            meas_max_channel_ = m1->getObjectPtr()->active_channels;
+            // E1/E2 : at least the encoding matrix size, and large enough for the maximal encoding indexes
+            int E1 = workOrder_.kSpaceMaxEncode1_+1;
+            int E2 = workOrder_.kSpaceMaxEncode2_+1;
+            if ( E2 == 0 ) E2 = 1;
+
+            if ( E1 < matrix_size_encoding_[1] ) E1 = matrix_size_encoding_[1];
+            if ( E2 < matrix_size_encoding_[2] ) E2 = matrix_size_encoding_[2];
+
+            // find the loop counter boundary and allocate the buffer
+            GADGET_CONDITION_MSG(verboseMode_, "[RO E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+                               << meas_max_ro_ 
+                               << " " << E1 
+                               << " " << meas_max_channel_ 
+                               << " " << meas_max_idx_.slice+1 
+                               << " " << E2 
+                               << " " << meas_max_idx_.contrast+1 
+                               << " " << meas_max_idx_.phase+1 
+                               << " " << meas_max_idx_.repetition+1 
+                               << " " << meas_max_idx_.set+1 
+                               << " " << meas_max_idx_.segment+1 << "]");
+            // buffer layout : [RO E1 Cha Slice E2 Con Phase Rep Set Seg]
+            dimensions_.clear();
+            dimensions_.push_back(meas_max_ro_);
+            dimensions_.push_back(E1);
+            dimensions_.push_back(meas_max_channel_);
+            dimensions_.push_back(meas_max_idx_.slice+1);
+            dimensions_.push_back(E2);
+            dimensions_.push_back(meas_max_idx_.contrast+1);
+            dimensions_.push_back(meas_max_idx_.phase+1);
+            dimensions_.push_back(meas_max_idx_.repetition+1);
+            dimensions_.push_back(meas_max_idx_.set+1);
+            dimensions_.push_back(meas_max_idx_.segment+1);
+
+            size_t N = dimensions_.size();
+            for ( ii=0; ii<N; ii++ )
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "dimensions_[" << ii << "] = " << dimensions_[ii]);
+            }
+
+            // allocate data buffer
+            try
+            {
+                workOrder_.data_.create(&dimensions_);
+                Gadgetron::clear(workOrder_.data_);
+                // the reflect buffer keeps one entry per readout line : same layout with RO and Cha reduced to 1
+                std::vector<size_t> reflect_dimensions_(dimensions_);
+                reflect_dimensions_[0] = 1;
+                reflect_dimensions_[2] = 1;
+                workOrder_.reflect_.create(&reflect_dimensions_);
+                Gadgetron::clear(workOrder_.reflect_);
+            }
+            catch(...)
+            {
+                GADGET_DEBUG1("Failed create buffer\n");
+                return false;
+            }
+
+            // allocate message buffer
+            int matrix_size[10];
+            for ( ii=0; ii<10; ii++ )
+            {
+                matrix_size[ii] = static_cast<int>(dimensions_[ii]); // GtPlusGadgetImageArray takes int sizes
+            }
+
+            if (!(messageImage_ = new GtPlusGadgetImageArray(matrix_size))) 
+            {
+                GADGET_DEBUG1("Failed create buffer\n");
+                return false;
+            }
+        }
+
+        // if necessary, shift the E1/E2 indexes
+        if ( workOrder_.start_E1_ > 0 )
+        {
+            idx.kspace_encode_step_1 += workOrder_.start_E1_;
+        }
+
+        if ( workOrder_.start_E2_ > 0 )
+        {
+            idx.kspace_encode_step_2 += workOrder_.start_E2_;
+        }
+        // b : start of the image buffer, d : start of the incoming samples (channel after channel)
+        std::complex<float>* b = workOrder_.data_.begin();
+        std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+        if ( samples != workOrder_.data_.get_size(0) ) // check against the buffer itself : dimensions_ is shared with storeRefData(...) and may hold the ref buffer size by now
+        {
+            GADGET_DEBUG1("Wrong number of samples received\n");
+            return false;
+        }
+
+        //Copy the data for all the channels
+        hoNDArray<std::complex<float> > reflectBuf;
+        if ( isReflect )
+        {
+            reflectBuf.create(samples);
+        }
+
+        std::vector<size_t> pos(10);
+        for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) 
+        {
+            pos[0] = 0;
+            pos[1] = idx.kspace_encode_step_1;
+            pos[2] = c;
+            pos[3] = idx.slice;
+            pos[4] = idx.kspace_encode_step_2;
+            pos[5] = idx.contrast;
+            pos[6] = idx.phase;
+            pos[7] = idx.repetition;
+            pos[8] = idx.set;
+            pos[9] = idx.segment;
+            long long offsetBuffer = workOrder_.data_.calculate_offset(pos);
+
+            if ( isReflect )
+            {
+                for ( size_t s=0; s<samples; s++ ) // mirror the readout of channel c into reflectBuf
+                {
+                    reflectBuf(samples-1-s) = d[c*samples+s];
+                }
+
+                memcpy(b+offsetBuffer, reflectBuf.begin(), sizeof(std::complex<float>)*samples);
+            }
+            else
+            {
+                memcpy(b+offsetBuffer, d+c*samples, sizeof(std::complex<float>)*samples);
+            }
+            // record the reflect status of this line; the reflect buffer has a single channel entry
+            pos[2] = 0;
+            offsetBuffer = workOrder_.reflect_.calculate_offset(pos);
+            workOrder_.reflect_.at(offsetBuffer) = isReflect;
+        }
+
+        if ( !fillImageInfo(m1, messageImage_, idx) )
+        {
+            GADGET_DEBUG1("Failed in fillImageInfo(m1, messageImage_, idx)\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::storeImageData(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+storeRefData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect)
+{ // Copies one reference readout (header m1, samples of all channels in m2) into workOrder_.ref_, mirrored along the readout when isReflect is set; the buffers are allocated on the first call.
+    try // any exception raised below is reported as a failure (return false)
+    {
+        size_t ii;
+        size_t samples =  m1->getObjectPtr()->number_of_samples;
+        ISMRMRD::EncodingCounters idx = m1->getObjectPtr()->idx;
+
+        idx.segment = 0; // combine the segments
+        // empty ref buffer : derive the buffer size and allocate everything that belongs to it
+        if ( workOrder_.ref_.get_number_of_elements() <= 0 )
+        {
+            meas_max_channel_ = m1->getObjectPtr()->active_channels;
+            // E1/E2 : at least the encoding matrix size, and large enough for the maximal encoding indexes
+            int E1 = workOrder_.kSpaceMaxEncode1_+1;
+            int E2 = workOrder_.kSpaceMaxEncode2_+1;
+            if ( E2 == 0 ) E2 = 1;
+
+            if ( E1 < matrix_size_encoding_[1] ) E1 = matrix_size_encoding_[1];
+            if ( E2 < matrix_size_encoding_[2] ) E2 = matrix_size_encoding_[2];
+
+            size_t RO = meas_max_ro_;
+            // in separate/external mode a shorter reference readout defines the RO size of the ref buffer
+            if ( (samples < meas_max_ro_) 
+                && (( workOrder_.CalibMode_==ISMRMRD_separate || workOrder_.CalibMode_==ISMRMRD_external )) )
+            {
+                RO = samples;
+            }
+
+            // find the loop counter boundary and allocate the buffer
+            GADGET_CONDITION_MSG(verboseMode_, "[RO E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+                               << RO 
+                               << " " << E1 
+                               << " " << meas_max_channel_ 
+                               << " " << meas_max_idx_.slice+1 
+                               << " " << E2 
+                               << " " << meas_max_idx_.contrast+1 
+                               << " " << meas_max_idx_.phase+1 
+                               << " " << meas_max_idx_.repetition+1 
+                               << " " << meas_max_idx_.set+1 
+                               << " " << meas_max_idx_.segment+1 << "]");
+            // buffer layout : [RO E1 Cha Slice E2 Con Phase Rep Set Seg]
+            dimensions_.clear();
+            dimensions_.push_back(RO);
+            dimensions_.push_back(E1);
+            dimensions_.push_back(meas_max_channel_);
+            dimensions_.push_back(meas_max_idx_.slice+1);
+            dimensions_.push_back(E2);
+            dimensions_.push_back(meas_max_idx_.contrast+1);
+            dimensions_.push_back(meas_max_idx_.phase+1);
+            dimensions_.push_back(meas_max_idx_.repetition+1);
+            dimensions_.push_back(meas_max_idx_.set+1);
+            dimensions_.push_back(meas_max_idx_.segment+1);
+
+            size_t N = dimensions_.size();
+            for ( ii=0; ii<N; ii++ )
+            {
+                GADGET_CONDITION_MSG(verboseMode_, "ref dimensions_[" << ii << "] = " << dimensions_[ii]);
+            }
+
+            // allocate data buffer
+            try
+            {
+                workOrder_.ref_.create(&dimensions_);
+                Gadgetron::clear(workOrder_.ref_);
+                // the reflect buffer keeps one entry per readout line : same layout with RO and Cha reduced to 1
+                std::vector<size_t> reflect_dimensions_(dimensions_);
+                reflect_dimensions_[0] = 1;
+                reflect_dimensions_[2] = 1;
+                workOrder_.reflect_ref_.create(&reflect_dimensions_);
+                Gadgetron::clear(workOrder_.reflect_ref_);
+            }
+            catch(...)
+            {
+                GADGET_DEBUG1("Failed create ref buffer\n");
+                return false;
+            }
+        }
+
+        // if necessary, shift the E1/E2 indexes
+        if ( workOrder_.CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder_.start_E1_ > 0 )
+            {
+                idx.kspace_encode_step_1 += workOrder_.start_E1_;
+            }
+
+            if ( workOrder_.start_E2_ > 0 )
+            {
+                idx.kspace_encode_step_2 += workOrder_.start_E2_;
+            }
+        }
+
+        // for the separate or external mode, store the maximal idx
+        if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+        {
+            if ( idx.kspace_encode_step_1 > meas_max_idx_ref_.kspace_encode_step_1 )    meas_max_idx_ref_.kspace_encode_step_1 = idx.kspace_encode_step_1;
+            if ( idx.kspace_encode_step_2 > meas_max_idx_ref_.kspace_encode_step_2 )    meas_max_idx_ref_.kspace_encode_step_2 = idx.kspace_encode_step_2;
+            if ( idx.average > meas_max_idx_ref_.average )                              meas_max_idx_ref_.average = idx.average;
+            if ( idx.slice > meas_max_idx_ref_.slice )                                  meas_max_idx_ref_.slice = idx.slice;
+            if ( idx.contrast > meas_max_idx_ref_.contrast )                            meas_max_idx_ref_.contrast = idx.contrast;
+            if ( idx.phase > meas_max_idx_ref_.phase )                                  meas_max_idx_ref_.phase = idx.phase;
+            if ( idx.repetition > meas_max_idx_ref_.repetition )                        meas_max_idx_ref_.repetition = idx.repetition;
+            if ( idx.set > meas_max_idx_ref_.set )                                      meas_max_idx_ref_.set = idx.set;
+            if ( idx.segment > meas_max_idx_ref_.segment )                              meas_max_idx_ref_.segment = idx.segment;
+
+            // same maximum tracking for the user counters (ii is the index declared at the top of the function)
+            for ( ii=0; ii<ISMRMRD_USER_INTS; ii++ )
+            {
+                if ( idx.user[ii] > meas_max_idx_ref_.user[ii] ) meas_max_idx_ref_.user[ii] = idx.user[ii];
+            }
+        }
+        // b : start of the ref buffer, d : start of the incoming samples (channel after channel)
+        std::complex<float>* b = workOrder_.ref_.begin();
+        std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+        if ( samples != workOrder_.ref_.get_size(0) ) // check against the buffer itself : dimensions_ is shared with storeImageData(...) and may hold the image buffer size by now
+        {
+            GADGET_DEBUG1("Wrong number of samples received\n");
+            return false;
+        }
+
+        //Copy the data for all the channels
+        hoNDArray<std::complex<float> > reflectBuf;
+        if ( isReflect )
+        {
+            reflectBuf.create(samples);
+        }
+
+        std::vector<size_t> pos(10);
+        for (int c = 0; c < m1->getObjectPtr()->active_channels; c++) 
+        {
+            pos[0] = 0;
+            pos[1] = idx.kspace_encode_step_1;
+            pos[2] = c;
+            pos[3] = idx.slice;
+            pos[4] = idx.kspace_encode_step_2;
+            pos[5] = idx.contrast;
+            pos[6] = idx.phase;
+            pos[7] = idx.repetition;
+            pos[8] = idx.set;
+            pos[9] = idx.segment;
+            long long offsetBuffer = workOrder_.ref_.calculate_offset(pos);
+
+            if ( isReflect )
+            {
+                for ( size_t s=0; s<samples; s++ ) // mirror the readout of channel c into reflectBuf
+                {
+                    reflectBuf(samples-1-s) = d[c*samples+s];
+                }
+
+                memcpy(b+offsetBuffer, reflectBuf.begin(), sizeof(std::complex<float>)*samples);
+            }
+            else
+            {
+                memcpy(b+offsetBuffer, d+c*samples, sizeof(std::complex<float>)*samples);
+            }
+            // record the reflect status of this line; the reflect buffer has a single channel entry
+            pos[2] = 0;
+            offsetBuffer = workOrder_.reflect_ref_.calculate_offset(pos);
+            workOrder_.reflect_ref_.at(offsetBuffer) = isReflect;
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::storeRefData(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+/// Pack a list of buffered readouts into one dense 10D array.
+///
+/// The output dimensions are [RO E1 Cha Slice E2 Contrast Phase Rep Set Seg]. RO and Cha use the
+/// largest number_of_samples / active_channels found over all readouts; every other extent is the
+/// largest encoding counter found plus one. buf is created and cleared, then every channel of every
+/// readout is copied to its position. reflectBuf is created with the same dimensions except that
+/// RO and Cha are 1, is cleared, and receives the isReflect_ flag of each readout.
+///
+/// @param readOutBuffer  buffered readouts; acqHead_, data_ and isReflect_ of every entry are read
+/// @param buf            output data array, (re)created here
+/// @param reflectBuf     output reflect flag array, (re)created here
+/// @return false if the buffers cannot be created or any exception is caught, true otherwise
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+fillBuffer(ReadOutBufferType& readOutBuffer, BufferType& buf, ReflectBufferType& reflectBuf)
+{
+    try
+    {
+        const size_t numOfReadOuts = readOutBuffer.size();
+
+        // ---- pass 1 : find the largest value of every counter over all buffered readouts ----
+        ISMRMRD::EncodingCounters max_idx;
+        max_idx.kspace_encode_step_1 = 0;
+        max_idx.kspace_encode_step_2 = 0;
+        max_idx.average = 0;
+        max_idx.slice = 0;
+        max_idx.contrast = 0;
+        max_idx.phase = 0;
+        max_idx.repetition = 0;
+        max_idx.set = 0;
+        max_idx.segment = 0;
+
+        int max_col = 0;
+        int max_channel = 0;
+
+        size_t r;
+        for ( r=0; r<numOfReadOuts; r++ )
+        {
+            const ISMRMRD::EncodingCounters& cur = readOutBuffer[r].acqHead_.idx;
+
+            if ( readOutBuffer[r].acqHead_.number_of_samples > max_col )    { max_col = readOutBuffer[r].acqHead_.number_of_samples; }
+            if ( readOutBuffer[r].acqHead_.active_channels > max_channel )  { max_channel = readOutBuffer[r].acqHead_.active_channels; }
+
+            if ( cur.kspace_encode_step_1 > max_idx.kspace_encode_step_1 )  { max_idx.kspace_encode_step_1 = cur.kspace_encode_step_1; }
+            if ( cur.kspace_encode_step_2 > max_idx.kspace_encode_step_2 )  { max_idx.kspace_encode_step_2 = cur.kspace_encode_step_2; }
+            if ( cur.slice > max_idx.slice )                                { max_idx.slice = cur.slice; }
+            if ( cur.contrast > max_idx.contrast )                          { max_idx.contrast = cur.contrast; }
+            if ( cur.phase > max_idx.phase )                                { max_idx.phase = cur.phase; }
+            if ( cur.repetition > max_idx.repetition )                      { max_idx.repetition = cur.repetition; }
+            if ( cur.set > max_idx.set )                                    { max_idx.set = cur.set; }
+            if ( cur.segment > max_idx.segment )                            { max_idx.segment = cur.segment; }
+        }
+
+        GADGET_CONDITION_MSG(verboseMode_, "[RO E1 Cha Slice E2 Contrast Phase Rep Set Seg] = [" 
+                               << max_col 
+                               << " " << max_idx.kspace_encode_step_1+1 
+                               << " " << max_channel 
+                               << " " << max_idx.slice+1 
+                               << " " << max_idx.kspace_encode_step_2+1 
+                               << " " << max_idx.contrast+1 
+                               << " " << max_idx.phase+1 
+                               << " " << max_idx.repetition+1 
+                               << " " << max_idx.set+1 
+                               << " " << max_idx.segment+1 << "]");
+
+        // ---- allocate and zero the output arrays ----
+        std::vector<size_t> dims(10);
+        dims[0] = max_col;
+        dims[1] = max_idx.kspace_encode_step_1+1;
+        dims[2] = max_channel;
+        dims[3] = max_idx.slice+1;
+        dims[4] = max_idx.kspace_encode_step_2+1;
+        dims[5] = max_idx.contrast+1;
+        dims[6] = max_idx.phase+1;
+        dims[7] = max_idx.repetition+1;
+        dims[8] = max_idx.set+1;
+        dims[9] = max_idx.segment+1;
+
+        try
+        {
+            buf.create(&dims);
+            Gadgetron::clear(buf);
+
+            // the reflect flags are kept once per readout line : RO and Cha collapse to 1
+            std::vector<size_t> reflect_dims(dims);
+            reflect_dims[0] = 1;
+            reflect_dims[2] = 1;
+            reflectBuf.create(&reflect_dims);
+            Gadgetron::clear(reflectBuf);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Failed create buffer\n");
+            return false;
+        }
+
+        // ---- pass 2 : copy every channel of every readout to its place ----
+        std::complex<float>* dst = buf.begin();
+        std::vector<size_t> pos(10);
+
+        for ( r=0; r<numOfReadOuts; r++ )
+        {
+            const ISMRMRD::EncodingCounters& cur = readOutBuffer[r].acqHead_.idx;
+            std::complex<float>* src = const_cast<std::complex<float>*>(readOutBuffer[r].data_.begin());
+
+            const int numOfSamples = readOutBuffer[r].acqHead_.number_of_samples;
+            const int numOfChannels = readOutBuffer[r].acqHead_.active_channels;
+
+            // everything but the channel index is fixed for one readout
+            pos[0] = 0;
+            pos[1] = cur.kspace_encode_step_1;
+            pos[3] = cur.slice;
+            pos[4] = cur.kspace_encode_step_2;
+            pos[5] = cur.contrast;
+            pos[6] = cur.phase;
+            pos[7] = cur.repetition;
+            pos[8] = cur.set;
+            pos[9] = cur.segment;
+
+            int cha;
+            for ( cha=0; cha<numOfChannels; cha++ )
+            {
+                pos[2] = cha;
+                long long offsetBuffer = buf.calculate_offset(pos);
+                memcpy(dst+offsetBuffer, src+cha*numOfSamples, sizeof(std::complex<float>)*numOfSamples);
+
+                // the reflect array has a single channel entry
+                pos[2] = 0;
+                offsetBuffer = reflectBuf.calculate_offset(pos);
+                reflectBuf.at(offsetBuffer) = readOutBuffer[r].isReflect_;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::fillBuffer(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+/// Record the header information of one acquisition in the image header array.
+///
+/// The image entry is located with messageImage->get_offset(...) from the slice, E2, contrast, phase,
+/// repetition, set and segment counters of idx. If that entry still looks untouched (version, flags and
+/// measurement_uid are all zero) the complete header is filled from the acquisition header held by m1
+/// and from the gadget members (dimensions_, field_of_view_recon_, image_counter_, image_series_).
+/// In every case the acquisition and first physiology time stamps are stored for line
+/// idx.kspace_encode_step_1.
+///
+/// @param m1            message holding the acquisition header to read from
+/// @param messageImage  image header array to update
+/// @param idx           encoding counters used to locate the image entry and the line
+/// @return false if an exception is caught, true otherwise
+bool GtPlusAccumulatorWorkOrderTriggerGadget::fillImageInfo(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GtPlusGadgetImageArray* messageImage, const ISMRMRD::EncodingCounters& idx)
+{
+    try
+    {
+        // locate the image header belonging to this acquisition
+        int offset = messageImage->get_offset(idx.slice, idx.kspace_encode_step_2, idx.contrast, idx.phase, idx.repetition, idx.set, idx.segment);
+
+        const ISMRMRD::AcquisitionHeader& acq = *(m1->getObjectPtr());
+
+        // computed but currently not part of the fill condition below
+        bool is_first_acq_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_FIRST_IN_SLICE).isSet(acq.flags);
+
+        // an entry with version, flags and measurement_uid all zero has not been filled yet
+        const bool headerNotFilled = ( messageImage->imageArray_[offset].version==0 )
+                                    && ( messageImage->imageArray_[offset].flags==0 )
+                                    && ( messageImage->imageArray_[offset].measurement_uid==0 );
+
+        if ( headerNotFilled )
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "--> buffer image header - offset = " << offset << " - [SLC E2 CON PHS REP SET] = [" 
+                                                                      << idx.slice << " " 
+                                                                      << idx.kspace_encode_step_2 << " " 
+                                                                      << idx.contrast << " " 
+                                                                      << idx.phase << " " 
+                                                                      << idx.repetition << " " 
+                                                                      << idx.set << "]");
+
+            messageImage->imageArray_[offset].version = acq.version;
+            messageImage->imageArray_[offset].flags = acq.flags;
+            messageImage->imageArray_[offset].measurement_uid = acq.measurement_uid;
+
+            messageImage->imageArray_[offset].set_matrix_size(0, dimensions_[0]);
+            messageImage->imageArray_[offset].set_matrix_size(1, dimensions_[1]);
+            messageImage->imageArray_[offset].set_matrix_size(2, dimensions_[2]);
+
+            messageImage->imageArray_[offset].channels = acq.active_channels;
+
+            // geometry : all of these are three-element vectors
+            size_t d;
+            for ( d=0; d<3; d++ )
+            {
+                messageImage->imageArray_[offset].field_of_view[d] = field_of_view_recon_[d];
+                messageImage->imageArray_[offset].position[d] = acq.position[d];
+                messageImage->imageArray_[offset].read_dir[d] = acq.read_dir[d];
+                messageImage->imageArray_[offset].phase_dir[d] = acq.phase_dir[d];
+                messageImage->imageArray_[offset].slice_dir[d] = acq.slice_dir[d];
+                messageImage->imageArray_[offset].patient_table_position[d] = acq.patient_table_position[d];
+            }
+
+            // encoding counters of the acquisition that filled this header
+            messageImage->imageArray_[offset].average = acq.idx.average;
+            messageImage->imageArray_[offset].slice = acq.idx.slice;
+            messageImage->imageArray_[offset].contrast = acq.idx.contrast;
+            messageImage->imageArray_[offset].phase = acq.idx.phase;
+            messageImage->imageArray_[offset].repetition = acq.idx.repetition;
+            messageImage->imageArray_[offset].set = acq.idx.set;
+
+            messageImage->imageArray_[offset].acquisition_time_stamp = acq.acquisition_time_stamp;
+
+            for ( d=0; d<3; d++ )
+            {
+                messageImage->imageArray_[offset].physiology_time_stamp[d] = acq.physiology_time_stamp[d];
+            }
+
+            messageImage->imageArray_[offset].image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+            messageImage->imageArray_[offset].image_type = ISMRMRD::TYPE_MAGNITUDE;
+
+            messageImage->imageArray_[offset].image_index = ++image_counter_;
+            messageImage->imageArray_[offset].image_series_index = image_series_;
+
+            // keep the free user parameters (eight integers and eight floats are copied)
+            memcpy(messageImage->imageArray_[offset].user_int, acq.user_int, sizeof(int32_t)*8);
+            memcpy(messageImage->imageArray_[offset].user_float, acq.user_float, sizeof(float)*8);
+        }
+
+        // the per-line time stamps are stored for every acquisition, filled header or not
+        messageImage->imageArray_[offset].time_stamps[idx.kspace_encode_step_1] = acq.acquisition_time_stamp;
+        messageImage->imageArray_[offset].pmu_time_stamps[idx.kspace_encode_step_1] = acq.physiology_time_stamp[0];
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::fillImageInfo(...) ... \n");
+        return false;
+    }
+
+    return true;
+}
+
+/// Compute the encoded size along E1 from the center line and the maximal line index.
+///
+/// The result is 2*centerE1 when (maxE1+1) is even and 2*centerE1+1 when it is odd.
+/// The computation is done in size_t, so the value is no longer narrowed through int
+/// before being returned.
+///
+/// @param centerE1  center line along E1
+/// @param maxE1     maximal line index along E1 (presumably zero-based, so maxE1+1 is the line count; verify against caller)
+/// @return the encoded size along E1
+size_t GtPlusAccumulatorWorkOrderTriggerGadget::
+computeEncodedSizeE1(size_t centerE1, size_t maxE1)
+{
+    const bool evenNumberOfLines = ( (maxE1+1)%2 == 0 );
+    const size_t E1 = evenNumberOfLines ? 2*centerE1 : 2*centerE1+1;
+
+    return E1;
+}
+
+/// Compute the encoded size along E2 from the center partition and the maximal partition index.
+///
+/// The result is 2*centerE2 when (maxE2+1) is even and 2*centerE2+1 when it is odd.
+/// The computation is done in size_t, so the value is no longer narrowed through int
+/// before being returned.
+///
+/// @param centerE2  center partition along E2
+/// @param maxE2     maximal partition index along E2 (presumably zero-based, so maxE2+1 is the partition count; verify against caller)
+/// @return the encoded size along E2
+size_t GtPlusAccumulatorWorkOrderTriggerGadget::
+computeEncodedSizeE2(size_t centerE2, size_t maxE2)
+{
+    const bool evenNumberOfPartitions = ( (maxE2+1)%2 == 0 );
+    const size_t E2 = evenNumberOfPartitions ? 2*centerE2 : 2*centerE2+1;
+
+    return E2;
+}
+
+/// Build a work order from the data buffered for triggerDim == value and pass it to the next gadget.
+///
+/// A message chain [GtPlusGadgetImageArray -> WorkOrderType] is allocated. The work order settings are
+/// duplicated from workOrder_, the data/reflect sub-arrays (and the ref/reflect_ref sub-arrays when a
+/// reference with matching dimensionality is buffered) are extracted for the trigger dimension, the
+/// image headers are extracted from messageImage_, and buffered phase correction, noise and "other"
+/// readouts are assembled with fillBuffer() and attached.
+///
+/// The message chain is released on every exit path that does not hand it to the next gadget
+/// (failed check, failed fillBuffer, failed putq, caught exception); previously only the fillBuffer
+/// failure paths released it.
+///
+/// @param triggerDim                     dimension used to select the data
+/// @param value                          index along triggerDim to extract
+/// @param workFlow_BufferKernel_         stored in the outgoing work order
+/// @param workFlow_use_BufferedKernel_   stored in the outgoing work order
+/// @return true if the work order was queued to the next gadget, false otherwise
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDimEqual(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDimEqual(triggerDim, value) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+
+        // Releases the message chain when the scope is left, unless dismiss() was called
+        // after the chain has been handed over to the next gadget.
+        struct MessageChainGuard
+        {
+            GadgetContainerMessage<GtPlusGadgetImageArray>* head_;
+            explicit MessageChainGuard(GadgetContainerMessage<GtPlusGadgetImageArray>* head) : head_(head) {}
+            ~MessageChainGuard() { if ( head_ ) head_->release(); }
+            void dismiss() { head_ = 0; }
+        } chainGuard(cm1);
+
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+        cm1->cont(cm2);
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        bool lessEqual = false;
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim, value, lessEqual));
+
+        // copy the ref, only if it has the same number of dimensions as the data
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim, value, lessEqual));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim, value, lessEqual));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArrayEqual(triggerDim, value, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            // the reflect flags of the noise readouts are not kept
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_) failed ... \n");
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_) failed ... \n");
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim, value, lessEqual));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget; on failure the chain is still ours and the guard releases it
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+
+        // the next gadget now owns the message chain
+        chainGuard.dismiss();
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDimEqual(triggerDim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Build a work order from the data buffered for triggerDim <= value and pass it to the next gadget.
+///
+/// A message chain [GtPlusGadgetImageArray -> WorkOrderType] is allocated. The work order settings are
+/// duplicated from workOrder_, the data/reflect sub-arrays (and the ref/reflect_ref sub-arrays when a
+/// reference with matching dimensionality is buffered) are extracted with lessEqual set, the image
+/// headers are extracted from messageImage_, and buffered phase correction, noise and "other"
+/// readouts are assembled with fillBuffer() and attached.
+///
+/// The message chain is released on every exit path that does not hand it to the next gadget
+/// (failed check, failed fillBuffer, failed putq, caught exception); previously only the fillBuffer
+/// failure paths released it. The entry log message now names this function instead of
+/// triggerByDimEqual.
+///
+/// @param triggerDim                     dimension used to select the data
+/// @param value                          upper index (inclusive) along triggerDim to extract
+/// @param workFlow_BufferKernel_         stored in the outgoing work order
+/// @param workFlow_use_BufferedKernel_   stored in the outgoing work order
+/// @return true if the work order was queued to the next gadget, false otherwise
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDimLessEqual(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDimLessEqual(triggerDim, value) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+
+        // Releases the message chain when the scope is left, unless dismiss() was called
+        // after the chain has been handed over to the next gadget.
+        struct MessageChainGuard
+        {
+            GadgetContainerMessage<GtPlusGadgetImageArray>* head_;
+            explicit MessageChainGuard(GadgetContainerMessage<GtPlusGadgetImageArray>* head) : head_(head) {}
+            ~MessageChainGuard() { if ( head_ ) head_->release(); }
+            void dismiss() { head_ = 0; }
+        } chainGuard(cm1);
+
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+        cm1->cont(cm2);
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        bool lessEqual = true;
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim, value, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim, value, lessEqual));
+
+        // copy the ref, only if it has the same number of dimensions as the data
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim, value, lessEqual));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim, value, lessEqual));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArrayLessEqual(triggerDim, value, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            // the reflect flags of the noise readouts are not kept
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_) failed ... \n");
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_) failed ... \n");
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim, value, lessEqual));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget; on failure the chain is still ours and the guard releases it
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+
+        // the next gadget now owns the message chain
+        chainGuard.dismiss();
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDimLessEqual(triggerDim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Build a work order from the data buffered for triggerDim1 == value1 and triggerDim2 == value2
+/// and pass it to the next gadget.
+///
+/// A message chain [GtPlusGadgetImageArray -> WorkOrderType] is allocated. The work order settings are
+/// duplicated from workOrder_, the data/reflect sub-arrays (and the ref/reflect_ref sub-arrays when a
+/// reference with matching dimensionality is buffered) are extracted for both trigger dimensions, the
+/// image headers are extracted from messageImage_, and buffered phase correction, noise and "other"
+/// readouts are assembled with fillBuffer() and attached.
+///
+/// The message chain is released on every exit path that does not hand it to the next gadget
+/// (failed check, failed fillBuffer, failed putq, caught exception); previously only the fillBuffer
+/// failure paths released it.
+///
+/// @param triggerDim1                    first dimension used to select the data
+/// @param value1                         index along triggerDim1 to extract
+/// @param triggerDim2                    second dimension used to select the data
+/// @param value2                         index along triggerDim2 to extract
+/// @param workFlow_BufferKernel_         stored in the outgoing work order
+/// @param workFlow_use_BufferedKernel_   stored in the outgoing work order
+/// @return true if the work order was queued to the next gadget, false otherwise
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDimEqual(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDimEqual(triggerDim1, value1, triggerDim2, value2) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+
+        // Releases the message chain when the scope is left, unless dismiss() was called
+        // after the chain has been handed over to the next gadget.
+        struct MessageChainGuard
+        {
+            GadgetContainerMessage<GtPlusGadgetImageArray>* head_;
+            explicit MessageChainGuard(GadgetContainerMessage<GtPlusGadgetImageArray>* head) : head_(head) {}
+            ~MessageChainGuard() { if ( head_ ) head_->release(); }
+            void dismiss() { head_ = 0; }
+        } chainGuard(cm1);
+
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+        cm1->cont(cm2);
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        bool lessEqual = false;
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+
+        // copy the ref, only if it has the same number of dimensions as the data
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim1, value1, triggerDim2, value2, lessEqual));
+
+            // for separate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArrayEqual(triggerDim1, value1, triggerDim2, value2, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            // the reflect flags of the noise readouts are not kept
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_) failed ... \n");
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_) failed ... \n");
+                return false;
+            }
+
+            // NOTE(review): unlike data_/ref_ above, other_ is extracted for triggerDim1 only
+            // (triggerDim2/value2 are not applied) - confirm that this is intended.
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim1, value1, false));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget; on failure the chain is still ours and the guard releases it
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+
+        // the next gadget now owns the message chain
+        chainGuard.dismiss();
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDimEqual(triggerDim1, value1, triggerDim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+
+bool GtPlusAccumulatorWorkOrderTriggerGadget::
+triggerByDim1LessEqualDim2Equal(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_)
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerByDim1LessEqualDim2Equal(triggerDim1, value1, triggerDim2, value2) ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = workFlow_BufferKernel_;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+
+        cm1->cont(cm2);
+
+        // copy the image content
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.data_, cm2->getObjectPtr()->data_, triggerDim1, value1, triggerDim2, value2));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.reflect_, cm2->getObjectPtr()->reflect_, triggerDim1, value1, triggerDim2, value2));
+
+        // copy the ref
+        if ( workOrder_.ref_.get_number_of_elements()>0 
+                && workOrder_.ref_.get_number_of_dimensions()==workOrder_.data_.get_number_of_dimensions() )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.ref_, cm2->getObjectPtr()->ref_, triggerDim1, value1, triggerDim2, value2));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForDim1LessEqualDim2Equal(workOrder_.reflect_ref_, cm2->getObjectPtr()->reflect_ref_, triggerDim1, value1, triggerDim2, value2));
+
+            // for seperate and external mode, further truncate the reference data
+            if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+            {
+                hoNDArray<ValueType> ref;
+                hoNDArray<unsigned short> reflect_ref;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+
+                cm2->getObjectPtr()->ref_ = ref;
+                cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+            }
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(messageImage_->extractGadgetImageArray_Dim1LessEqual_Dim2Equal(triggerDim1, value1, triggerDim2, value2, *(cm1->getObjectPtr()) ));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+
+            ReflectBufferType tmpBuf;
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForDim(workOrder_.other_, cm2->getObjectPtr()->other_, triggerDim1, value1, true));
+
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerByDim1LessEqualDim2Equal(triggerDim1, value1, triggerDim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Send all data buffered so far to the next gadget as one work order (called from close()).
+// Kernel buffering and the use of buffered kernels are both switched off for this work order.
+// Returns true on success and false on failure. Failures are reported with 'false':
+// this function returns bool, so the int status code GADGET_FAIL must not be returned here,
+// as its conversion to bool does not reliably read as a failure for the caller.
+bool GtPlusAccumulatorWorkOrderTriggerGadget::triggerWorkOrderAllInClose()
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - triggerWorkOrderAllInClose ... ");
+
+        GadgetContainerMessage<GtPlusGadgetImageArray>* cm1 = new GadgetContainerMessage<GtPlusGadgetImageArray>();
+        GadgetContainerMessage< WorkOrderType >* cm2 = new GadgetContainerMessage< WorkOrderType >();
+
+        workOrder_.duplicate(*cm2->getObjectPtr());
+        cm2->getObjectPtr()->workFlow_BufferKernel_ = false;
+        cm2->getObjectPtr()->workFlow_use_BufferedKernel_ = false;
+
+        cm1->cont(cm2);
+
+        // copy the image content
+        cm2->getObjectPtr()->data_ = workOrder_.data_;
+        cm2->getObjectPtr()->reflect_ = workOrder_.reflect_;
+
+        // copy the ref
+        cm2->getObjectPtr()->ref_ = workOrder_.ref_;
+        cm2->getObjectPtr()->reflect_ref_ = workOrder_.reflect_ref_;
+
+        // for separate and external mode, further truncate the reference data
+        if ( (workOrder_.CalibMode_ == ISMRMRD_separate) || (workOrder_.CalibMode_ == ISMRMRD_external) )
+        {
+            hoNDArray<ValueType> ref;
+            hoNDArray<unsigned short> reflect_ref;
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<ValueType>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->ref_, ref, meas_max_idx_ref_));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<unsigned short>().extractSubArrayForMaxEncodingCounters(cm2->getObjectPtr()->reflect_ref_, reflect_ref, meas_max_idx_ref_));
+            cm2->getObjectPtr()->ref_ = ref;
+            cm2->getObjectPtr()->reflect_ref_ = reflect_ref;
+        }
+
+        // copy the message image array
+        GADGET_CHECK_RETURN_FALSE(cm1->getObjectPtr()->copy(*messageImage_));
+
+        if (!phaseCorrBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorWorkOrderTriggerGadget - phase correction signal found : " << phaseCorrBuffer_.size());
+            if ( !fillBuffer(phaseCorrBuffer_, workOrder_.phaseCorr_, workOrder_.reflect_phaseCorr_) )
+            {
+                GADGET_DEBUG1("fillBuffer(phaseCorrBuffer_) failed ... \n");
+                cm1->release();
+                return false; // bool result: never the int code GADGET_FAIL
+            }
+
+            cm2->getObjectPtr()->phaseCorr_ = workOrder_.phaseCorr_;
+            cm2->getObjectPtr()->reflect_phaseCorr_ = workOrder_.reflect_phaseCorr_;
+        }
+
+        if (!noiseBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - noise signal found : " << noiseBuffer_.size());
+            ReflectBufferType tmpBuf; // the reflect flags of the noise readouts are not kept
+            if ( !fillBuffer(noiseBuffer_, workOrder_.noise_, tmpBuf) )
+            {
+                GADGET_DEBUG1("fillBuffer(noiseBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->noise_ = workOrder_.noise_;
+        }
+
+        if (!otherBuffer_.empty())
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "GtPlusAccumulatorGadget - other signal found : " << otherBuffer_.size());
+            if ( !fillBuffer(otherBuffer_, workOrder_.other_, workOrder_.reflect_other_) )
+            {
+                GADGET_DEBUG1("fillBuffer(otherBuffer_) failed ... \n");
+                cm1->release();
+                return false;
+            }
+
+            cm2->getObjectPtr()->other_ = workOrder_.other_;
+            cm2->getObjectPtr()->reflect_other_ = workOrder_.reflect_other_;
+        }
+
+        // send to next gadget
+        if (this->next()->putq(cm1) < 0) 
+        {
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusAccumulatorWorkOrderTriggerGadget::triggerWorkOrderAllInClose() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+size_t GtPlusAccumulatorWorkOrderTriggerGadget::
+getDimValue(const ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::gtPlus::ISMRMRDDIM& dim)
+{
+    // pick the counter of acqHeader that matches 'dim'; dimensions not listed here give 0
+    return ( dim == DIM_Encoding1 )  ? acqHeader.idx.kspace_encode_step_1
+         : ( dim == DIM_Slice )      ? acqHeader.idx.slice
+         : ( dim == DIM_Encoding2 )  ? acqHeader.idx.kspace_encode_step_2
+         : ( dim == DIM_Contrast )   ? acqHeader.idx.contrast
+         : ( dim == DIM_Phase )      ? acqHeader.idx.phase
+         : ( dim == DIM_Repetition ) ? acqHeader.idx.repetition
+         : ( dim == DIM_Set )        ? acqHeader.idx.set
+         : ( dim == DIM_Segment )    ? acqHeader.idx.segment
+         : ( dim == DIM_Average )    ? acqHeader.idx.average
+         : 0;
+}
+
+void GtPlusAccumulatorWorkOrderTriggerGadget::
+setDimValue(ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t value)
+{
+    // store 'value' in the counter of acqHeader that matches 'dim';
+    // the header is left untouched for any dimension not listed here
+    if ( dim == DIM_Encoding1 )       acqHeader.idx.kspace_encode_step_1 = value;
+    else if ( dim == DIM_Slice )      acqHeader.idx.slice = value;
+    else if ( dim == DIM_Encoding2 )  acqHeader.idx.kspace_encode_step_2 = value;
+    else if ( dim == DIM_Contrast )   acqHeader.idx.contrast = value;
+    else if ( dim == DIM_Phase )      acqHeader.idx.phase = value;
+    else if ( dim == DIM_Repetition ) acqHeader.idx.repetition = value;
+    else if ( dim == DIM_Set )        acqHeader.idx.set = value;
+    else if ( dim == DIM_Segment )    acqHeader.idx.segment = value;
+    else if ( dim == DIM_Average )    acqHeader.idx.average = value;
+}
+
+int GtPlusAccumulatorWorkOrderTriggerGadget::close(unsigned long flags)
+{
+    GADGET_CONDITION_MSG(true, "GtPlusAccumulatorWorkOrderTriggerGadget - close(flags) : " << flags);
+
+    // the base class is closed first; its failure is passed on
+    if ( BaseClass::close(flags) != GADGET_OK )
+    {
+        return GADGET_FAIL;
+    }
+
+    // the work below runs on the first call only
+    // (a stricter condition, flags!=0 && !triggered_in_close_, is currently not used)
+    if ( triggered_in_close_ )
+    {
+        return GADGET_OK;
+    }
+
+    triggered_in_close_ = true;
+    GADGET_CONDITION_MSG(true, "GtPlusAccumulatorWorkOrderTriggerGadget - trigger in close(flags) ... ");
+
+    if ( !needTriggerWorkOrderAllInClose() )
+    {
+        // triggering the last portion of kspace, i.e. triggerWorkOrder(NULL, true, true),
+        // is disabled at present, so there is nothing left to do
+        return GADGET_OK;
+    }
+
+    // never been triggered, so need to trigger with all data buffered
+    if ( !triggerWorkOrderAllInClose() )
+    {
+        GADGET_DEBUG1("triggerWorkOrderAllInClose() failed ... \n");
+        return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusAccumulatorWorkOrderTriggerGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.h b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.h
new file mode 100644
index 0000000..31bf47f
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusAccumulatorWorkOrderTriggerGadget.h
@@ -0,0 +1,266 @@
+/** \file   GtPlusAccumulatorWorkOrderTriggerGadget.h
+    \brief  The GtPlus reconstruction data accumulation and triggering gadget
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include "hoNDArray_utils.h"
+
+#include "GtPlusGadgetImageArray.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+
+namespace Gadgetron
+{
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+//   0  1  2   3    4   5    6     7  8   9
+
+struct ReadOutBuffer   // one buffered readout: its header, its samples and a reflect flag
+{
+    ISMRMRD::AcquisitionHeader acqHead_;      // acquisition header kept with the readout
+    hoNDArray< std::complex<float> > data_;   // complex samples of the readout
+    bool isReflect_;                          // presumably true for a readout acquired in reflected direction -- TODO confirm
+};
+
+class EXPORTGTPLUSGADGET GtPlusAccumulatorWorkOrderTriggerGadget : public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{   // Accumulates incoming readouts into a work order and triggers the next gadget according to the trigger rules below.
+public:
+    GADGET_DECLARE(GtPlusAccumulatorGadget); // NOTE(review): names GtPlusAccumulatorGadget rather than this class -- confirm this is intended
+
+    typedef std::complex<float> ValueType;
+
+    typedef Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< ValueType > > BaseClass;
+
+    typedef std::vector< ReadOutBuffer > ReadOutBufferType;
+    typedef hoNDArray< std::complex<float> > BufferType;
+    typedef hoNDArray< int > TimeStampBufferType;
+    typedef hoNDArray< unsigned short > ReflectBufferType;
+
+    // typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    GtPlusAccumulatorWorkOrderTriggerGadget();
+    ~GtPlusAccumulatorWorkOrderTriggerGadget();
+
+    virtual int close(unsigned long flags);
+
+    /// Parameters to control the triggering.
+
+    /// The rules below are written for the interleaved mode.
+    // If a trigger dimension is DIM_NONE, the trigger is performed in the close function.
+
+    // The triggering for interleaved mode is defined as:
+    // a) if triggerDim1_==DIM_NONE and triggerDim2_!=DIM_NONE,
+    // the trigger is performed whenever triggerDim2_ changes
+
+    // b) if triggerDim2_==DIM_NONE and triggerDim1_!=DIM_NONE:
+    // if numOfKSpace_triggerDim1_==0,
+    // the trigger is performed whenever triggerDim1_ changes
+
+    // if numOfKSpace_triggerDim1_>0,
+    // the trigger is first performed when numOfKSpace_triggerDim1_ kspaces along triggerDim1_ are buffered,
+    // then the trigger is performed whenever a new triggerDim1_ kspace arrives;
+    // the new triggerDim1_ kspace is reconstructed using the kernel estimated from the first numOfKSpace_triggerDim1_ kspaces
+
+    // when resetTriggerStatus(m1)==true, the status is reset: numOfKSpace_triggerDim1_ kspaces along triggerDim1_
+    // are buffered again and then trigger the recon
+
+    // c) if both triggerDim1_ and triggerDim2_ are DIM_NONE, the trigger is performed
+    // in the close(flags) function
+
+    // d) if both triggerDim1_ and triggerDim2_ are NOT DIM_NONE, the trigger is first performed
+    // when numOfKSpace_triggerDim1_ kspaces along triggerDim1_ are buffered, and then whenever a new
+    // triggerDim1_ kspace arrives;
+    // the new triggerDim1_ kspace is reconstructed using the kernel estimated from the first numOfKSpace_triggerDim1_ kspaces;
+    // when triggerDim2_ changes or resetTriggerStatus(m1)==true, the status is reset: numOfKSpace_triggerDim1_ kspaces
+    // along triggerDim1_ are buffered again and then trigger the recon
+
+    // e) if numOfKSpace_triggerDim1_==0 and both triggerDim1_ and triggerDim2_ are NOT DIM_NONE,
+    // the trigger is performed whenever triggerDim2_ changes
+
+    // trigger settings for the no-acceleration mode
+    Gadgetron::gtPlus::ISMRMRDDIM noacceleration_triggerDim1_;
+    Gadgetron::gtPlus::ISMRMRDDIM noacceleration_triggerDim2_;
+    int noacceleration_numOfKSpace_triggerDim1_;
+
+    // trigger settings for the interleaved mode
+    Gadgetron::gtPlus::ISMRMRDDIM interleaved_triggerDim1_;
+    Gadgetron::gtPlus::ISMRMRDDIM interleaved_triggerDim2_;
+    int interleaved_numOfKSpace_triggerDim1_;
+
+    // trigger settings for the embedded mode
+    Gadgetron::gtPlus::ISMRMRDDIM embedded_triggerDim1_;
+    Gadgetron::gtPlus::ISMRMRDDIM embedded_triggerDim2_;
+    int embedded_numOfKSpace_triggerDim1_;
+
+    // trigger settings for the separate mode
+    Gadgetron::gtPlus::ISMRMRDDIM separate_triggerDim1_;
+    Gadgetron::gtPlus::ISMRMRDDIM separate_triggerDim2_;
+    int separate_numOfKSpace_triggerDim1_;
+
+    // for other kspace data: if other_kspace_matching_Dim_ != DIM_NONE, the other data is made to match the image data
+    // along dimension other_kspace_matching_Dim_
+    Gadgetron::gtPlus::ISMRMRDDIM other_kspace_matching_Dim_;
+
+    // default behavior is to compare the readout geometry:
+    // if the imaging slice changes, the trigger status is reset
+    virtual bool resetTriggerStatus(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1);
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, GadgetContainerMessage< hoNDArray< ValueType > > * m2);
+
+    // check the status of the incoming readout
+    // bIsKSpace: whether this data is for the image
+    // bIsRef: whether this data is for the calibration signal
+    // bIsNoise: whether this data is a noise scan
+    // bIsPhaseCorr: whether this data is for phase correction
+    // bIsReflect: whether this data is acquired in reflected direction (for EPI and similar scans)
+    // bIsOther: other scans
+    virtual bool checkStatus(uint64_t flag, int samples, 
+                        bool& bIsKSpace, bool& bIsRef, bool& bIsNoise, bool& bIsPhaseCorr, bool& bIsReflect, bool& bIsOther,
+                        bool& bIsNavigator, bool& bIsRTFeedback, bool& bIsHPFeedback, bool& bIsDummyScan);
+
+    // store the image data
+    virtual bool storeImageData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect);
+
+    // store the ref data
+    virtual bool storeRefData(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2, bool isReflect);
+
+    // fill the dynamically buffered data into buf, with the matching reflect flags in reflectBuf
+    virtual bool fillBuffer(ReadOutBufferType& readOutBuffer, BufferType& buf, ReflectBufferType& reflectBuf);
+
+    // fill the per 2D image info
+    virtual bool fillImageInfo(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, GtPlusGadgetImageArray* messageImage, const ISMRMRD::EncodingCounters& idx);
+
+    // compute the encoded size
+    size_t computeEncodedSizeE1(size_t centerE1, size_t maxE1);
+    size_t computeEncodedSizeE2(size_t centerE2, size_t maxE2);
+
+    // perform the triggering
+    virtual bool triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, bool inClose, bool isKSpace);
+
+    // workFlow_BufferKernel_ and workFlow_use_BufferedKernel_ are the commands to the work flow:
+    // if workFlow_BufferKernel_ == true, the work flow will buffer the kernels computed for this work order
+    // if workFlow_use_BufferedKernel_ == true, the work flow will recon this work order using the buffered kernels
+    // if both triggerDim1_ and triggerDim2_ are NOT NONE and numOfKSpace_triggerDim1_ > 0,
+    // the first work order, with workFlow_BufferKernel_==true, is sent out when the value of triggerDim1_ equals numOfKSpace_triggerDim1_-1;
+    // the next work orders are sent out when triggerDim1_ changes, with workFlow_BufferKernel_==false and workFlow_use_BufferedKernel_==true;
+    // when triggerDim2_ changes, the status is reset
+    virtual bool triggerWorkOrder(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, 
+            Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1_, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2_, int numOfKSpace_triggerDim1_);
+
+    // trigger by extracting the array with triggerDim equal to value (or <= value for the LessEqual variant)
+    virtual bool triggerByDimEqual(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+    virtual bool triggerByDimLessEqual(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim, size_t value, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+    virtual bool triggerByDimEqual(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+
+    // trigger by extracting the array with triggerDim1 <= value1 and triggerDim2 == value2
+    virtual bool triggerByDim1LessEqualDim2Equal(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2, size_t value2, bool workFlow_BufferKernel_, bool workFlow_use_BufferedKernel_);
+
+    // whether all buffered data needs to be triggered in close()
+    bool needTriggerWorkOrderAllInClose();
+    // trigger with all buffered data
+    virtual bool triggerWorkOrderAllInClose();
+
+    // trigger the last count in the close function
+    virtual bool triggerWorkOrderLastCountInClose(Gadgetron::gtPlus::ISMRMRDDIM& triggerDim1_, Gadgetron::gtPlus::ISMRMRDDIM& triggerDim2_, int numOfKSpace_triggerDim1_);
+
+    size_t getDimValue(const ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::gtPlus::ISMRMRDDIM& dim);
+    void setDimValue(ISMRMRD::AcquisitionHeader& acqHeader, Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t value);
+
+    // buffer for per 2D image information
+    GtPlusGadgetImageArray* messageImage_;
+
+    // buffer for image kspace data
+    // if partial fourier is used, the kspace center is put at the center of the buffer;
+    // this means zeros will be added accordingly
+    BufferType kspaceBuffer_;
+    BufferType refBuffer_;
+
+    // dynamic buffers for other kspace data
+    ReadOutBufferType noiseBuffer_;
+    ReadOutBufferType phaseCorrBuffer_;
+    ReadOutBufferType otherBuffer_;
+
+    // dimensions for image kspace
+    std::vector<size_t> dimensions_;
+
+    // encoding matrix size (the real sampled size)
+    size_t matrix_size_encoding_[3];
+
+    // encoding space size (the logical kspace size)
+    size_t space_size_[3];
+
+    // offset along E1 and E2 directions for incoming readouts
+    size_t space_matrix_offset_E1_;
+    size_t space_matrix_offset_E2_;
+
+    // encoding field of view [mm]
+    float field_of_view_encoding_[3];
+
+    // recon matrix size (the final image size)
+    size_t matrix_size_recon_[3];
+
+    // recon field of view [mm]
+    float field_of_view_recon_[3];
+
+    int image_counter_;
+    int image_series_;
+
+    // mark the first kspace line
+    bool first_kspace_scan_;
+
+    // whether the next gadget has been triggered in close(...)
+    bool triggered_in_close_;
+
+    // whether the next gadget has been triggered in process(...)
+    bool triggered_in_process_;
+
+    // whether the next gadget has been triggered in process(...) for the last acquisition
+    // if so, extra triggering in close(...) is not needed
+    bool triggered_in_process_last_acq_;
+
+    int meas_max_ro_;
+    ISMRMRD::EncodingCounters meas_max_idx_;
+    int meas_max_channel_;
+
+    // maximal idx for reference data
+    ISMRMRD::EncodingCounters meas_max_idx_ref_;
+
+    // track the previous and current values of trigger dim1 and dim2
+    int prev_dim1_;
+    int curr_dim1_;
+
+    int prev_dim2_;
+    int curr_dim2_;
+
+    // store the previous acquisition header
+    ISMRMRD::AcquisitionHeader prev_acq_header_;
+
+    // for trigger dim1, the number of times it has occurred needs to be counted
+    int count_dim1_;
+
+    // a general workorder to store the buffered data
+    WorkOrderType workOrder_;
+
+    // util for gtplus
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<GT_Complex8> gtPlus_util_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetExport.h b/gadgets/gtPlus/GtPlusGadgetExport.h
new file mode 100644
index 0000000..b7369d4
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetExport.h
@@ -0,0 +1,16 @@
+/** \file   GtPlusGadgetExport.h
+    \brief  The windows export/import definition for the GtPlus reconstruction gadget
+    \author Hui Xue
+*/
+
+#pragma once
+
+// EXPORTGTPLUSGADGET is empty unless WIN32 is defined; with WIN32 it is dllexport when
+// __BUILD_GADGETS__ or gadgetronPlus_EXPORTS is defined and dllimport otherwise
+#if !defined (WIN32)
+    #define EXPORTGTPLUSGADGET
+#elif defined (__BUILD_GADGETS__) || defined (gadgetronPlus_EXPORTS)
+    #define EXPORTGTPLUSGADGET __declspec(dllexport)
+#else
+    #define EXPORTGTPLUSGADGET __declspec(dllimport)
+#endif
diff --git a/gadgets/gtPlus/GtPlusGadgetImageArray.cpp b/gadgets/gtPlus/GtPlusGadgetImageArray.cpp
new file mode 100644
index 0000000..ba884d9
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetImageArray.cpp
@@ -0,0 +1,664 @@
+
+#include "GtPlusGadgetImageArray.h"
+
+namespace Gadgetron
+{
+
+// Create an image header extension with every header field zeroed and empty time stamp buffers.
+GtPlusGadgetImageExt::GtPlusGadgetImageExt() : ISMRMRD::ImageHeader()
+{
+    version = 0;
+    flags = 0;
+    measurement_uid = 0;
+    matrix_size[0] = 0; matrix_size[1] = 0; matrix_size[2] = 0;
+    field_of_view[0] = 0; field_of_view[1] = 0; field_of_view[2] = 0;
+    channels = 0;
+
+    // direction vectors are sized by ISMRMRD_DIRECTION_LENGTH, in line with copy()
+    memset(position, 0, sizeof(float)*ISMRMRD_POSITION_LENGTH);
+    memset(read_dir, 0, sizeof(float)*ISMRMRD_DIRECTION_LENGTH);
+    memset(phase_dir, 0, sizeof(float)*ISMRMRD_DIRECTION_LENGTH);
+    memset(slice_dir, 0, sizeof(float)*ISMRMRD_DIRECTION_LENGTH);
+    memset(patient_table_position, 0, sizeof(float)*ISMRMRD_POSITION_LENGTH);
+
+    average = 0;
+    slice = 0;
+    contrast = 0;
+    phase = 0;
+    repetition = 0;
+    set = 0;
+    acquisition_time_stamp = 0;
+    memset(physiology_time_stamp, 0, sizeof(uint32_t)*ISMRMRD_PHYS_STAMPS);
+
+    image_data_type = 0;
+    image_type = 0;
+    image_index = 0;
+    image_series_index = 0;
+    memset(user_int, 0, sizeof(int32_t)*ISMRMRD_USER_INTS);
+    memset(user_float, 0, sizeof(float)*ISMRMRD_USER_FLOATS);
+    time_stamps.clear();
+    pmu_time_stamps.clear();
+}
+
+GtPlusGadgetImageExt::~GtPlusGadgetImageExt()
+{   // empty body: no explicit cleanup is performed here
+}
+
+void GtPlusGadgetImageExt::set_matrix_size(size_t index, ACE_UINT16 size)
+{
+    // matrix_size has three entries; any other index is silently ignored
+    if ( index >= 3 ) return;
+
+    matrix_size[index] = size;
+    // setting the second entry also re-creates both time stamp buffers
+    // with matrix_size[1] elements, each initialised to -1
+    if ( index != 1 ) return;
+
+    time_stamps.clear();
+    time_stamps.resize(matrix_size[1], -1);
+    pmu_time_stamps.clear();
+    pmu_time_stamps.resize(matrix_size[1], -1);
+}
+
+void GtPlusGadgetImageExt::copy(GtPlusGadgetImageExt& aMessageImage)
+{
+    // field-by-field duplicate of aMessageImage, including both time stamp buffers
+    const GtPlusGadgetImageExt& src = aMessageImage;
+    version = src.version;
+    flags = src.flags;
+    measurement_uid = src.measurement_uid;
+    channels = src.channels;
+
+    // the first three entries of the size and field-of-view arrays
+    for ( size_t d=0; d<3; d++ )
+    {
+        matrix_size[d] = src.matrix_size[d];
+        field_of_view[d] = src.field_of_view[d];
+    }
+
+    // geometry
+    memcpy(position, src.position, sizeof(float)*ISMRMRD_POSITION_LENGTH);
+    memcpy(read_dir, src.read_dir, sizeof(float)*ISMRMRD_DIRECTION_LENGTH);
+    memcpy(phase_dir, src.phase_dir, sizeof(float)*ISMRMRD_DIRECTION_LENGTH);
+    memcpy(slice_dir, src.slice_dir, sizeof(float)*ISMRMRD_DIRECTION_LENGTH);
+    memcpy(patient_table_position, src.patient_table_position, sizeof(float)*ISMRMRD_POSITION_LENGTH);
+
+    // counters and time stamps
+    average = src.average;
+    slice = src.slice;
+    contrast = src.contrast;
+    phase = src.phase;
+    repetition = src.repetition;
+    set = src.set;
+    acquisition_time_stamp = src.acquisition_time_stamp;
+    memcpy(physiology_time_stamp, src.physiology_time_stamp, sizeof(uint32_t)*ISMRMRD_PHYS_STAMPS);
+
+    // image description, user fields and the per-line time stamp buffers
+    image_data_type = src.image_data_type;
+    image_type = src.image_type;
+    image_index = src.image_index;
+    image_series_index = src.image_series_index;
+    memcpy(user_int, src.user_int, sizeof(int32_t)*ISMRMRD_USER_INTS);
+    memcpy(user_float, src.user_float, sizeof(float)*ISMRMRD_USER_FLOATS);
+
+    time_stamps = src.time_stamps;
+    pmu_time_stamps = src.pmu_time_stamps;
+}
+
+void GtPlusGadgetImageExt::recomputeHeader(const GtPlusGadgetImageExt& aMessageImage, double weight)
+{
+    // blend this header with aMessageImage: 'weight' applies to this header, 1-weight to the other one
+    const double otherWeight = 1.0-weight;
+    for ( size_t p=0; p<ISMRMRD_POSITION_LENGTH; p++ )
+    {
+        position[p] = (position[p]*weight) + otherWeight*aMessageImage.position[p];
+        patient_table_position[p] = (patient_table_position[p]*weight) + otherWeight*aMessageImage.patient_table_position[p];
+    }
+
+    acquisition_time_stamp = (uint32_t)((acquisition_time_stamp*weight) + otherWeight*aMessageImage.acquisition_time_stamp + 0.5); // 0.5 is added before the cast
+    for ( size_t s=0; s<ISMRMRD_PHYS_STAMPS; s++ )
+    {
+        physiology_time_stamp[s] = (uint32_t)((physiology_time_stamp[s]*weight) + otherWeight*aMessageImage.physiology_time_stamp[s] + 0.5);
+    }
+}
+
+void GtPlusGadgetImageExt::dump()
+{
+    // print the header fields to std::cout, one field (or one array) per line
+
+    std::cout << "GtPlusGadgetImageExt" << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    std::cout << "version            : " << version << std::endl;
+    std::cout << "flags              : " << flags << std::endl;
+    std::cout << "measurement_uid    : " << measurement_uid << std::endl;
+    std::cout << "matrix_size[3]     : " << matrix_size[0] << " " << matrix_size[1] << " " << matrix_size[2] << std::endl;
+    std::cout << "field_of_view[3]   : " << field_of_view[0] << " " << field_of_view[1] << " " << field_of_view[2] << std::endl;
+    std::cout << "channels           : " << channels << std::endl;
+
+    // the arrays below are printed element by element, each element followed by a blank
+
+    std::cout << "position[ISMRMRD_POSITION_LENGTH]      : ";
+    for ( size_t k=0; k<ISMRMRD_POSITION_LENGTH; k++ )
+    {
+        std::cout << position[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "read_dir[ISMRMRD_POSITION_LENGTH]      : ";
+    for ( size_t k=0; k<ISMRMRD_POSITION_LENGTH; k++ )
+    {
+        std::cout << read_dir[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "phase_dir[ISMRMRD_POSITION_LENGTH]      : ";
+    for ( size_t k=0; k<ISMRMRD_POSITION_LENGTH; k++ )
+    {
+        std::cout << phase_dir[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "slice_dir[ISMRMRD_POSITION_LENGTH]      : ";
+    for ( size_t k=0; k<ISMRMRD_POSITION_LENGTH; k++ )
+    {
+        std::cout << slice_dir[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "patient_table_position[ISMRMRD_POSITION_LENGTH]      : ";
+    for ( size_t k=0; k<ISMRMRD_POSITION_LENGTH; k++ )
+    {
+        std::cout << patient_table_position[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "average            : " << average << std::endl;
+    std::cout << "slice              : " << slice << std::endl;
+    std::cout << "contrast           : " << contrast << std::endl;
+    std::cout << "phase              : " << phase << std::endl;
+    std::cout << "repetition         : " << repetition << std::endl;
+    std::cout << "set                : " << set << std::endl;
+    std::cout << "acquisition_time_stamp : " << acquisition_time_stamp << std::endl;
+
+    std::cout << "physiology_time_stamp[ISMRMRD_PHYS_STAMPS] : ";
+    for ( size_t k=0; k<ISMRMRD_PHYS_STAMPS; k++ )
+    {
+        std::cout << physiology_time_stamp[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "image_data_type    : " << image_data_type << std::endl;
+    std::cout << "image_type         : " << image_type << std::endl;
+    std::cout << "image_index        : " << image_index << std::endl;
+    std::cout << "image_series_index : " << image_series_index << std::endl;
+
+    std::cout << "user_int[ISMRMRD_USER_INTS]        : ";
+    for ( size_t k=0; k<ISMRMRD_USER_INTS; k++ )
+    {
+        std::cout << user_int[k] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "user_float[ISMRMRD_USER_FLOATS]    : ";
+    for ( size_t k=0; k<ISMRMRD_USER_FLOATS; k++ )
+    {
+        std::cout << user_float[k] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+}
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+//   0  1  2   3     4  5    6     7   8   9
+// store a scan with 10 dimensions
+
+GtPlusGadgetImageArray::GtPlusGadgetImageArray()
+:   imageArray_(NULL)
+{
+    // an empty array: no image headers and all ten dimension sizes set to zero
+    for ( size_t d=0; d<10; d++ )
+    {
+        matrix_size[d] = 0;
+    }
+}
+
+GtPlusGadgetImageArray::GtPlusGadgetImageArray(const GtPlusGadgetImageArray& imArray) : imageArray_(NULL)
+{
+    copy(imArray); // imageArray_ starts as a null pointer, so copy() finds no old buffer to free
+}
+
+// Build an array with the sizes aSize[0..9]; one image header is allocated per element of dimensions 3..9.
+GtPlusGadgetImageArray::GtPlusGadgetImageArray(int aSize[10])
+:   imageArray_(0) // stays null if nothing gets allocated, so the destructor never deletes an indeterminate pointer
+{
+    try
+    {
+        for ( size_t ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = aSize[ii];
+        }
+        size_t len = 1;
+        for ( size_t ii=3; ii<10; ii++ )
+        {
+            len *= matrix_size[ii];
+        }
+
+        if ( len > 0 )
+        {
+            imageArray_ = new GtPlusGadgetImageExt[len];
+        }
+    }
+    catch(...)
+    {
+        std::cout << "Failed in allocate imageArray_" << std::endl;
+    }
+}
+
+// Free the image header buffer, if any.
+GtPlusGadgetImageArray::~GtPlusGadgetImageArray()
+{
+    // delete [] on a null pointer does nothing, so the pointer
+    // needs no test before it is handed over
+    delete [] imageArray_;
+}
+
+// Re-dimension the array to aSize[0..9]. The existing image headers are discarded and
+// new default-constructed headers are allocated; a failure is only reported on std::cout.
+void GtPlusGadgetImageArray::resize(int aSize[10])
+{
+    try
+    {
+        // drop the old headers first; nothing is carried over across a resize
+        if ( imageArray_ != NULL )
+        {
+            delete [] imageArray_;
+            imageArray_ = NULL;
+        }
+
+        // take over the new sizes;
+        // the number of headers is the product of dimensions 3..9
+        size_t numOfImages = 1;
+        for ( size_t d=0; d<10; d++ )
+        {
+            matrix_size[d] = aSize[d];
+            if ( d >= 3 ) numOfImages *= matrix_size[d];
+        }
+
+        if ( numOfImages > 0 )
+        {
+            imageArray_ = new GtPlusGadgetImageExt[numOfImages];
+        }
+    }
+    catch(...)
+    {
+        std::cout << "Failed in resize GtPlusGadgetImageArray " << std::endl;
+    }
+}
+
+// Make this array a deep copy of imageArray (all ten sizes and every image header).
+// Returns false if an exception is raised; copying an array onto itself changes nothing.
+bool GtPlusGadgetImageArray::copy(const GtPlusGadgetImageArray& imageArray)
+{
+    try
+    {
+        if ( this == &imageArray ) return true;
+        // the pointer is reset after freeing: it must not dangle when the source
+        // is empty or when the allocation below throws
+        delete [] imageArray_;
+        imageArray_ = NULL;
+
+        size_t len = 1;
+        for ( size_t ii=0; ii<10; ii++ )
+        {
+            matrix_size[ii] = imageArray.matrix_size[ii];
+            if ( ii >= 3 ) len *= matrix_size[ii];
+        }
+
+        // a source that has sizes but no buffer is not dereferenced
+        if ( len > 0 && imageArray.imageArray_ != NULL )
+        {
+            imageArray_ = new GtPlusGadgetImageExt[len];
+            for ( size_t i=0; i<len; i++ )
+            {
+                imageArray_[i] = imageArray.imageArray_[i];
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusGadgetImageArray::copy(const GtPlusGadgetImageArray& imageArray) ... ");
+        return false;
+    }
+    return true;
+}
+
+int GtPlusGadgetImageArray::get_offset(int slc, int e2, int con, int phs, int rep, int set, int seg)
+{
+    // Linear index into imageArray_: slice varies fastest, segment slowest.
+    // Nested (Horner) evaluation of the stride sum; matrix_size[9] is not needed.
+    int idx = seg*matrix_size[8] + set;
+    idx = idx*matrix_size[7] + rep;
+    idx = idx*matrix_size[6] + phs;
+    idx = idx*matrix_size[5] + con;
+    idx = idx*matrix_size[4] + e2;
+    return idx*matrix_size[3] + slc;
+}
+
+// Slice E2 Con Phase Rep Set Seg
+void GtPlusGadgetImageArray::findDimIndex(Gadgetron::gtPlus::ISMRMRDDIM& dim, int& ind)
+{   // Translates an ISMRMRD dimension enumerator into its position inside matrix_size and writes it to ind.
+    switch (dim)
+    {
+        case Gadgetron::gtPlus::DIM_Slice:
+            ind = 3; // positions 3..9 are the per-image dimensions that index imageArray_
+        break;
+
+        case Gadgetron::gtPlus::DIM_Encoding2:
+            ind = 4;
+        break;
+
+        case Gadgetron::gtPlus::DIM_Contrast:
+            ind = 5;
+        break;
+
+        case Gadgetron::gtPlus::DIM_Phase:
+            ind = 6;
+        break;
+
+        case Gadgetron::gtPlus::DIM_Repetition:
+            ind = 7;
+        break;
+
+        case Gadgetron::gtPlus::DIM_Set:
+            ind = 8;
+        break;
+
+        case Gadgetron::gtPlus::DIM_Segment:
+            ind = 9;
+        break;
+
+        default:
+            ind = 0; // every other enumerator falls back to position 0; callers get no separate "unsupported" signal
+    }
+
+    return;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArrayEqual(Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray)
+{   // Extracts into imageArray the headers whose index along dim equals value; every other dimension is kept in full.
+    try
+    {
+        int dimInd;
+        findDimIndex(dim, dimInd);
+        // GADGET_CHECK_RETURN_FALSE (used below) returns when its condition is false; the debug variant is assumed to follow the same convention, so state the valid case
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value < matrix_size[dimInd] );
+
+        size_t startInd[7];
+        size_t endInd[7];
+        // half-open [start, end) window per dimension; slot 0 is Slice and slot 6 is Segment
+        for ( int d=Gadgetron::gtPlus::DIM_Slice; d<=Gadgetron::gtPlus::DIM_Segment; d++ ) // NOTE(review): assumes DIM_Slice..DIM_Segment are consecutive and ordered like matrix_size[3..9] - confirm
+        {
+            if ( d == dim )
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = value;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = value+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = 0;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = matrix_size[d-Gadgetron::gtPlus::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusGadgetImageArray::extractGadgetImageArrayEqual(dim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArrayEqual(Gadgetron::gtPlus::ISMRMRDDIM& dim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray)
+{   // Extracts into imageArray the headers with index value1 along dim1 and value2 along dim2; other dimensions are kept in full.
+    try
+    {
+        int dimInd1;
+        findDimIndex(dim1, dimInd1);
+        int dimInd2;
+        findDimIndex(dim2, dimInd2);
+        // GADGET_CHECK_RETURN_FALSE (used below) returns when its condition is false; the debug variant
+        // is assumed to follow the same convention, so state the valid case for both indices
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value1 < matrix_size[dimInd1] );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value2 < matrix_size[dimInd2] );
+
+        size_t startInd[7];
+        size_t endInd[7];
+        // half-open [start, end) window per dimension; if dim1 and dim2 name the same dimension, dim1 wins
+        for ( int d=Gadgetron::gtPlus::DIM_Slice; d<=Gadgetron::gtPlus::DIM_Segment; d++ ) // NOTE(review): assumes DIM_Slice..DIM_Segment are consecutive and ordered like matrix_size[3..9] - confirm
+        {
+            if ( d == dim1 )
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = value1;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = value1+1;
+            }
+            else if ( d == dim2 )
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = value2;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = value2+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = 0;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = matrix_size[d-Gadgetron::gtPlus::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusGadgetImageArray::extractGadgetImageArrayEqual(dim1, value1, dim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArrayLessEqual(Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray)
+{   // Extracts into imageArray the headers whose index along dim is <= value; every other dimension is kept in full.
+    try
+    {
+        int dimInd;
+        findDimIndex(dim, dimInd);
+        // GADGET_CHECK_RETURN_FALSE (used below) returns when its condition is false; the debug variant is assumed to follow the same convention, so state the valid case
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value < matrix_size[dimInd] );
+        size_t startInd[7];
+        size_t endInd[7];
+        // half-open [start, end) window per dimension; the selected one runs from 0 up to and including value
+        for ( int d=Gadgetron::gtPlus::DIM_Slice; d<=Gadgetron::gtPlus::DIM_Segment; d++ ) // NOTE(review): assumes DIM_Slice..DIM_Segment are consecutive and ordered like matrix_size[3..9] - confirm
+        {
+            if ( d == dim )
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = 0;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = value+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = 0;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = matrix_size[d-Gadgetron::gtPlus::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusGadgetImageArray::extractGadgetImageArrayLessEqual(dim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+extractGadgetImageArray_Dim1LessEqual_Dim2Equal(Gadgetron::gtPlus::ISMRMRDDIM& dim1, size_t value1,
+        Gadgetron::gtPlus::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray)
+{   // Extracts into imageArray the headers with index <= value1 along dim1 and index == value2 along dim2.
+    try
+    {
+        int dimInd1;
+        findDimIndex(dim1, dimInd1);
+
+        int dimInd2;
+        findDimIndex(dim2, dimInd2);
+        // GADGET_CHECK_RETURN_FALSE (used below) returns when its condition is false; the debug variant is assumed to follow the same convention, so state the valid case
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value1 < matrix_size[dimInd1] );
+        GADGET_DEBUG_CHECK_RETURN_FALSE( value2 < matrix_size[dimInd2] );
+
+        size_t startInd[7];
+        size_t endInd[7];
+        // half-open [start, end) window per dimension; if dim1 and dim2 name the same dimension, dim1 wins
+        for ( int d=Gadgetron::gtPlus::DIM_Slice; d<=Gadgetron::gtPlus::DIM_Segment; d++ ) // NOTE(review): assumes DIM_Slice..DIM_Segment are consecutive and ordered like matrix_size[3..9] - confirm
+        {
+            if ( d == dim1 )
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = 0;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = value1+1;
+            }
+            else if ( d == dim2 )
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = value2;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = value2+1;
+            }
+            else
+            {
+                startInd[d-Gadgetron::gtPlus::DIM_Slice] = 0;
+                endInd[d-Gadgetron::gtPlus::DIM_Slice] = matrix_size[d-Gadgetron::gtPlus::DIM_Slice+3];
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(getSubImageArray(startInd, endInd, imageArray));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusGadgetImageArray::extractGadgetImageArray_Dim1LessEqual_Dim2Equal(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusGadgetImageArray::
+getSubImageArray(size_t* startInd, size_t* endInd, GtPlusGadgetImageArray& imageArray)
+{   // Copies the headers inside the half-open window [startInd, endInd) of dimensions 3..9 into imageArray, which is resized to fit; returns false on a bad window or failure.
+    try
+    {
+        int aSize[10];
+        aSize[0] = matrix_size[0];
+        aSize[1] = matrix_size[1];
+        aSize[2] = matrix_size[2];
+        // validate the window: an inverted range would wrap the unsigned subtraction, an oversized one would read past imageArray_
+        size_t ii, len = 1;
+        for ( ii=3; ii<10; ii++ )
+        {
+            GADGET_CHECK_RETURN_FALSE( startInd[ii-3]<=endInd[ii-3] && endInd[ii-3]<=matrix_size[ii] );
+            aSize[ii] = endInd[ii-3]-startInd[ii-3];
+            len *= aSize[ii];
+        }
+        imageArray.resize(aSize);
+        // resize() only prints on allocation failure, so verify the destination storage exists before writing to it
+        GADGET_CHECK_RETURN_FALSE( len == 0 || imageArray.imageArray_ != NULL );
+        size_t slc, e2, con, phs, rep, set, seg;
+        for ( seg=startInd[6]; seg<endInd[6]; seg++ )
+        {
+            for ( set=startInd[5]; set<endInd[5]; set++ )
+            {
+                for ( rep=startInd[4]; rep<endInd[4]; rep++ )
+                {
+                    for ( phs=startInd[3]; phs<endInd[3]; phs++ )
+                    {
+                        for ( con=startInd[2]; con<endInd[2]; con++ )
+                        {
+                            for ( e2=startInd[1]; e2<endInd[1]; e2++ )
+                            {
+                                for ( slc=startInd[0]; slc<endInd[0]; slc++ )
+                                {
+                                    int offset = this->get_offset(slc, e2, con, phs, rep, set, seg);
+                                    int offsetDst= imageArray.get_offset(slc-startInd[0], e2-startInd[1], con-startInd[2], phs-startInd[3], rep-startInd[4], set-startInd[5], seg-startInd[6]);
+                                    imageArray.imageArray_[offsetDst] = imageArray_[offset];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusGadgetImageArray::getSubImageArray(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+void GtPlusGadgetImageArray::dump()
+{   // Prints matrix_size and then every stored header to std::cout.
+    size_t ii;
+    std::cout << "GtPlusGadgetImageArray" << std::endl;
+    std::cout << "==========================================================" << std::endl;
+    std::cout << "matrix_size           : ";
+    for ( ii=0; ii<10; ii++ )
+    {
+        std::cout << matrix_size[ii] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------------------" << std::endl;
+    if ( imageArray_ ) // headers are only walked when storage has been allocated
+    {
+        int slc, e2, con, phs, rep, set, seg;
+        for ( seg=0; seg<matrix_size[9]; seg++ ) // segment outermost, slice innermost: the same order get_offset() lays the headers out in
+        {
+            for ( set=0; set<matrix_size[8]; set++ )
+            {
+                for ( rep=0; rep<matrix_size[7]; rep++ )
+                {
+                    for ( phs=0; phs<matrix_size[6]; phs++ )
+                    {
+                        for ( con=0; con<matrix_size[5]; con++ )
+                        {
+                            for ( e2=0; e2<matrix_size[4]; e2++ )
+                            {
+                                for ( slc=0; slc<matrix_size[3]; slc++ )
+                                {
+                                    int offset = get_offset(slc, e2, con, phs, rep, set, seg);
+                                    std::cout << "[Slice E2 Contrast Phase Rep Set Seg] = [" 
+                                                << " " << slc 
+                                                << " " << e2 
+                                                << " " << con 
+                                                << " " << phs 
+                                                << " " << rep 
+                                                << " " << set 
+                                                << " " << seg << "]" << std::endl;
+                                    // the position label above is followed by the header's own dump
+                                    imageArray_[offset].dump();
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+    std::cout << "==========================================================" << std::endl;
+}
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetImageArray.h b/gadgets/gtPlus/GtPlusGadgetImageArray.h
new file mode 100644
index 0000000..9513e28
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetImageArray.h
@@ -0,0 +1,72 @@
+/** \file   GtPlusGadgetImageArray.h
+    \brief  The GtPlusGadgetImageArray is used by the triggering gadget to store the ISMRMRD ImageHeader information
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+// GtPlusGadgetImageArray stores the ISMRMRD image header info for every
+// 2D kspace
+
+namespace Gadgetron
+{
+
+struct  EXPORTGTPLUSGADGET GtPlusGadgetImageExt : public ISMRMRD::ImageHeader
+{
+    // Per-line timing added on top of the ISMRMRD image header: the time_stamp and pmu_time_stamp of every
+    // incoming read-out line. If a line was not acquired, the corresponding entry is -1.
+    std::vector<int>     time_stamps;
+    std::vector<int>     pmu_time_stamps;
+
+    GtPlusGadgetImageExt();
+    ~GtPlusGadgetImageExt();
+
+    void copy(GtPlusGadgetImageExt& aMessageImage);
+    void set_matrix_size(size_t index, ACE_UINT16 size);
+
+    // Blends this header with aMessageImage by interpolation:
+    // this = weight * this + (1-weight)*aMessageImage
+    void recomputeHeader(const GtPlusGadgetImageExt& aMessageImage, double weight);
+    void dump();
+}; 
+
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+//  0  1  2   3     4  5   6     7   8   9
+// store a scan with 10 dimensions
+struct  EXPORTGTPLUSGADGET GtPlusGadgetImageArray
+{
+    // size of the image array, one entry per dimension [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    ACE_UINT16 matrix_size[10];
+
+    // header information for every 2D image; owned by this object (allocated with new[], released in the destructor)
+    GtPlusGadgetImageExt* imageArray_;
+
+    GtPlusGadgetImageArray();
+    GtPlusGadgetImageArray(const GtPlusGadgetImageArray& imArray);
+    GtPlusGadgetImageArray(int aSize[10]);
+    ~GtPlusGadgetImageArray();
+    // deep-copy assignment: the implicitly generated one would alias imageArray_ and delete it twice
+    GtPlusGadgetImageArray& operator=(const GtPlusGadgetImageArray& rhs) { if ( this != &rhs ) copy(rhs); return *this; }
+    void findDimIndex(Gadgetron::gtPlus::ISMRMRDDIM& dim, int& ind); // enum -> position in matrix_size
+    bool getSubImageArray(size_t* startInd, size_t* endInd, GtPlusGadgetImageArray& imageArray); // 7-entry half-open windows over dimensions 3..9
+    void resize(int aSize[10]); // discards the current content
+    bool copy(const GtPlusGadgetImageArray& imageArray); // deep copy, false on failure
+    int get_offset(int slc, int e2, int con, int phs, int rep, int set, int seg); // linear index into imageArray_
+    bool extractGadgetImageArrayEqual(Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray);
+    bool extractGadgetImageArrayEqual(Gadgetron::gtPlus::ISMRMRDDIM& dim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray);
+    bool extractGadgetImageArrayLessEqual(Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t value, GtPlusGadgetImageArray& imageArray);
+    bool extractGadgetImageArray_Dim1LessEqual_Dim2Equal(Gadgetron::gtPlus::ISMRMRDDIM& dim1, size_t value1, Gadgetron::gtPlus::ISMRMRDDIM& dim2, size_t value2, GtPlusGadgetImageArray& imageArray);
+    void dump(); // prints sizes and all headers to std::cout
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetOpenMP.cpp b/gadgets/gtPlus/GtPlusGadgetOpenMP.cpp
new file mode 100644
index 0000000..5c0a2f1
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetOpenMP.cpp
@@ -0,0 +1,103 @@
+
+#include "GtPlusGadgetOpenMP.h"
+
+namespace Gadgetron
+{
+
+#ifdef USE_OMP
+
+bool prepOpenMP()
+{   // Logs the OpenMP environment, caps the thread count at the processor count, enables nested parallelism and (WIN32 only) pins each thread to one core.
+    try
+    {
+        GADGET_MSG("--> OpenMP info <--");
+        GADGET_MSG("--------------------------------------------------------");
+
+        int numOpenMPProcs = omp_get_num_procs();
+        GADGET_MSG("GtPlusRecon, numOpenMPProcs : " << numOpenMPProcs);
+
+        #ifndef WIN32
+            #ifndef GCC_OLD_FLAG
+                int maxOpenMPLevels = omp_get_max_active_levels();
+                GADGET_MSG("GtPlusRecon, maxOpenMPLevels : " << maxOpenMPLevels);
+            #endif // GCC_OLD_FLAG
+        #endif // WIN32
+
+        int maxOpenMPThreads = omp_get_max_threads();
+        GADGET_MSG("GtPlusRecon, maxOpenMPThreads : " << maxOpenMPThreads);
+        // when the default team size differs from the processor count, force one thread per processor
+        if ( numOpenMPProcs != maxOpenMPThreads )
+        {
+            GADGET_MSG("GtPlusRecon, numOpenMPProcs != maxOpenMPThreads , hyperthreading must be disabled ... ");
+            omp_set_num_threads(numOpenMPProcs);
+        }
+
+        omp_set_nested(1);
+        int allowOpenMPNested = omp_get_nested();
+        GADGET_MSG("GtPlusRecon, allowOpenMPNested : " << allowOpenMPNested);
+
+        #ifdef WIN32
+            GADGET_MSG("----------------------------------");
+            GADGET_MSG("GtPlus, set thread affinity ... ");
+            // widen the 1 before shifting: an int shift overflows from thread id 31 on (ids >= the bit width of DWORD_PTR remain unsupported)
+            /// lock the threads
+            #pragma omp parallel default(shared)
+            {
+                int tid = omp_get_thread_num();
+                DWORD_PTR mask = (static_cast<DWORD_PTR>(1) << tid);
+                GADGET_MSG("thread id : " << tid << " - mask : " << mask);
+                SetThreadAffinityMask( GetCurrentThread(), mask );
+            }
+        #endif // WIN32
+
+        GADGET_MSG("--------------------------------------------------------");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlus prepOpenMP() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+#else
+
+bool prepOpenMP()
+{   // Built without USE_OMP: there is nothing to configure, so report success.
+    return true;
+}
+
+#endif // USE_OMP
+
+#ifdef USE_MKL
+
+bool prepMKL()
+{   // Logs an MKL banner and switches the VML mode to VML_EP; returns false only if an exception is caught.
+    try
+    {
+        GADGET_MSG("--> MKL info <--");
+        GADGET_MSG("--------------------------------------------------------");
+        vmlSetMode( VML_EP ); // the previous mode it returns is not needed here
+        GADGET_MSG("GtPlus, set MKL vml precision to EP ... ");
+        GADGET_MSG("--------------------------------------------------------");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlus prepMKL() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+#else
+
+bool prepMKL()
+{   // Built without USE_MKL: there is nothing to configure, so report success.
+    return true;
+}
+
+#endif // USE_MKL
+
+}
diff --git a/gadgets/gtPlus/GtPlusGadgetOpenMP.h b/gadgets/gtPlus/GtPlusGadgetOpenMP.h
new file mode 100644
index 0000000..b63d796
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusGadgetOpenMP.h
@@ -0,0 +1,27 @@
+/** \file   GtPlusGadgetOpenMP.h
+    \brief  Pack up the OpenMP and MKL support in the GtPlus
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+bool EXPORTGTPLUSGADGET prepOpenMP(); // logs and configures the OpenMP runtime; a stub returning true when USE_OMP is not defined
+bool EXPORTGTPLUSGADGET prepMKL(); // sets the MKL VML mode; a stub returning true when USE_MKL is not defined
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h b/gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h
new file mode 100644
index 0000000..012edf0
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTCloudPackage.h
@@ -0,0 +1,227 @@
+/** \file   GtPlusRecon2DTCloudPackage.h
+    \brief  To support the dual layer GtPlus cloud, this cloud job type is defined here
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+            Magnetic Resonance in Medicine on Dec 2013.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "SerializableObject.h"
+
+namespace Gadgetron
+{
+
+struct EXPORTGTPLUSGADGET GtPlusRecon2DTPara
+{   // Parameter bundle shipped inside GtPlusRecon2DTCloudPackage. serialize()/deserialize() memcpy it as raw bytes, so every member must stay safe to copy bytewise.
+    size_t reconSizeRO_;
+    size_t reconSizeE1_;
+    size_t reconSizeE2_;
+
+    float encodingFOV_RO_;
+    float encodingFOV_E1_;
+    float encodingFOV_E2_;
+
+    float reconFOV_RO_;
+    float reconFOV_E1_;
+    float reconFOV_E2_;
+    // NOTE(review): presumably the values GtPlusRecon2DTGadget::readParameters() parses from the "dim_4th", "dim_5th" and "workOrder_ShareDim" strings - confirm
+    Gadgetron::gtPlus::ISMRMRDDIM dim_4th_;
+    Gadgetron::gtPlus::ISMRMRDDIM dim_5th_;
+    Gadgetron::gtPlus::ISMRMRDDIM workOrder_ShareDim_;
+    // the four groups below share a naming scheme, one group per prefix (no_acceleration / interleaved / embedded / separate)
+    bool no_acceleration_averageall_ref_;
+    int no_acceleration_ref_numOfModes_;
+    bool no_acceleration_same_combinationcoeff_allS_;
+    int no_acceleration_whichS_combinationcoeff_;
+
+    bool interleaved_same_combinationcoeff_allS_;
+    int interleaved_whichS_combinationcoeff_;
+    int interleaved_ref_numOfModes_;
+
+    bool embedded_averageall_ref_;
+    int embedded_ref_numOfModes_;
+    bool embedded_fullres_coilmap_;
+    bool embedded_fullres_coilmap_useHighestSignal_;
+    bool embedded_same_combinationcoeff_allS_;
+    int embedded_whichS_combinationcoeff_;
+    bool embedded_ref_fillback_;
+
+    bool separate_averageall_ref_;
+    int separate_ref_numOfModes_;
+    bool separate_fullres_coilmap_;
+    bool separate_same_combinationcoeff_allS_;
+    int separate_whichS_combinationcoeff_;
+
+    bool same_coil_compression_coeff_allS_;
+
+    bool recon_kspace_needed_;
+    // nested work-order parameters; included in the same raw byte copy as the fields above
+    Gadgetron::gtPlus::gtPlusReconWorkOrderPara workOrderPara_;
+};
+
+template <typename T> 
+struct GtPlusRecon2DTCloudPackage : public SerializableObject
+{   // One cloud job: reconstruction parameters plus four data arrays, transferable through serialize()/deserialize().
+    GtPlusRecon2DTPara para;
+    // arrays are written to the buffer in declaration order, after para
+    hoNDArray<T> kspace;
+    hoNDArray<T> ref;
+
+    hoNDArray<T> complexIm;
+    hoNDArray<T> res;
+
+    GtPlusRecon2DTCloudPackage();
+    GtPlusRecon2DTCloudPackage(const GtPlusRecon2DTCloudPackage& pack);
+
+    ~GtPlusRecon2DTCloudPackage();
+
+    GtPlusRecon2DTCloudPackage<T>& operator=(const GtPlusRecon2DTCloudPackage<T>& pack);
+    // serialize() allocates buf with new[] and reports its size in len; deserialize() reads from buf and reports the bytes consumed in len
+    virtual bool serialize(char*& buf, size_t& len) const;
+    virtual bool deserialize(char* buf, size_t& len);
+};
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>::GtPlusRecon2DTCloudPackage()
+{   // Starts with all four data arrays cleared; para is not assigned here.
+    kspace.clear();
+    ref.clear();
+    complexIm.clear();
+    res.clear();
+}
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>::~GtPlusRecon2DTCloudPackage()
+{
+    // nothing to release explicitly: the body is empty and the members clean up through their own destructors
+}
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>::GtPlusRecon2DTCloudPackage(const GtPlusRecon2DTCloudPackage& pack)
+{   // Copy constructor: members are default-constructed first and then assigned one by one from pack.
+    para = pack.para;
+    kspace = pack.kspace;
+    ref = pack.ref;
+    complexIm = pack.complexIm;
+    res = pack.res;
+}
+
+template <typename T> 
+GtPlusRecon2DTCloudPackage<T>& GtPlusRecon2DTCloudPackage<T>::operator=(const GtPlusRecon2DTCloudPackage& pack)
+{   // Member-wise assignment from pack; returns *this.
+    if ( this == &pack ) return *this; // self-assignment: nothing to do
+
+    para = pack.para;
+    kspace = pack.kspace;
+    ref = pack.ref;
+    complexIm = pack.complexIm;
+    res = pack.res;
+
+    return *this;
+}
+
+template <typename T> 
+bool GtPlusRecon2DTCloudPackage<T>::serialize(char*& buf, size_t& len) const
+{   // Packs [para | kspace | ref | complexIm | res] into a freshly allocated buf and stores the byte count in len; returns false on failure.
+    char *bufKSpace(NULL), *bufRef(NULL), *bufComplexIm(NULL), *bufRes(NULL);
+    try
+    {
+        if ( buf != NULL ) { delete[] buf; buf = NULL; } // buf must be NULL or a new[] allocation; reset it so a later failure cannot hand back a dangling pointer
+
+        // find the total len
+        size_t lenKSpace, lenRef, lenComplexIm, lenRes;
+        // each array serializes into its own temporary buffer (released with delete [] below)
+        GADGET_CHECK_THROW(kspace.serialize(bufKSpace, lenKSpace));
+        GADGET_CHECK_THROW(ref.serialize(bufRef, lenRef));
+        GADGET_CHECK_THROW(complexIm.serialize(bufComplexIm, lenComplexIm));
+        GADGET_CHECK_THROW(res.serialize(bufRes, lenRes));
+
+        // total length
+        len = sizeof(GtPlusRecon2DTPara) + lenKSpace + lenRef + lenComplexIm + lenRes;
+
+        buf = new char[len];
+        GADGET_CHECK_THROW( buf != NULL ); // throw rather than return so the catch block frees the temporaries
+
+        size_t offset = 0, currLen=0;
+        // the parameter struct is copied as raw bytes, followed by the four array payloads in declaration order
+        currLen = sizeof(GtPlusRecon2DTPara);
+        memcpy(buf+offset, &para, currLen);
+        offset += currLen;
+
+        currLen = lenKSpace;
+        memcpy(buf+offset, bufKSpace, currLen);
+        offset += currLen;
+        delete [] bufKSpace;
+
+        currLen = lenRef;
+        memcpy(buf+offset, bufRef, currLen);
+        offset += currLen;
+        delete [] bufRef;
+
+        currLen = lenComplexIm;
+        memcpy(buf+offset, bufComplexIm, currLen);
+        offset += currLen;
+        delete [] bufComplexIm;
+
+        currLen = lenRes;
+        memcpy(buf+offset, bufRes, currLen);
+        offset += currLen;
+        delete [] bufRes;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors happened in GtPlusRecon2DTCloudPackage<T>::serialize(...) ... ");
+        // release whatever temporaries were produced before the failure
+        if ( bufKSpace != NULL ) delete [] bufKSpace;
+        if ( bufRef != NULL ) delete [] bufRef;
+        if ( bufComplexIm != NULL ) delete [] bufComplexIm;
+        if ( bufRes != NULL ) delete [] bufRes;
+
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool GtPlusRecon2DTCloudPackage<T>::deserialize(char* buf, size_t& len)
+{   // Rebuilds the package from buf in the layout serialize() writes; len is an output only and receives the number of bytes consumed.
+    try
+    {
+        memcpy(&para, buf, sizeof(GtPlusRecon2DTPara)); // raw byte copy, mirroring serialize()
+
+        size_t offset(sizeof(GtPlusRecon2DTPara)), currLen=0;
+        // NOTE(review): relies on each hoNDArray::deserialize() reporting the bytes it consumed through currLen - confirm
+        GADGET_CHECK_RETURN_FALSE(kspace.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(ref.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(complexIm.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(res.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        // total length
+        len = offset;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors happended in GtPlusRecon2DTCloudPackage<T>::deserialize(...) ...");
+        return false;
+    }
+
+    return true;
+}
+
+typedef GtPlusRecon2DTCloudPackage< std::complex<float> > GtPlusRecon2DTCloudPackageCPFL;
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadget.cpp b/gadgets/gtPlus/GtPlusRecon2DTGadget.cpp
new file mode 100644
index 0000000..0beaac9
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadget.cpp
@@ -0,0 +1,427 @@
+
+#include "GtPlusRecon2DTGadget.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusRecon2DTGadget::GtPlusRecon2DTGadget() : BaseClass()
+{
+    // nothing to set up here beyond constructing the base class
+}
+
+GtPlusRecon2DTGadget::~GtPlusRecon2DTGadget()
+{
+    // no explicit cleanup is performed in this destructor
+}
+
+bool GtPlusRecon2DTGadget::readParameters()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(BaseClass::readParameters());
+        // with the base-class parameters loaded, read the 2DT-specific gadget parameters into para_
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlusRecon2DTGadget parameters <------");
+        // dimension assignments, each given by ISMRMRD dimension name
+        boost::shared_ptr<std::string> str = this->get_string_value("dim_4th");
+        para_.dim_4th_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "dim_4th_ is " << *str);
+
+        str = this->get_string_value("dim_5th");
+        para_.dim_5th_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "dim_5th_ is " << *str);
+
+        str = this->get_string_value("workOrder_ShareDim");
+        para_.workOrder_ShareDim_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "workOrder_ShareDim_ is " << *str);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // 'no_acceleration_*' settings
+        para_.no_acceleration_averageall_ref_ = this->get_bool_value("no_acceleration_averageall_ref");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_averageall_ref_ is " << para_.no_acceleration_averageall_ref_);
+
+        para_.no_acceleration_ref_numOfModes_ = this->get_int_value("no_acceleration_ref_numOfModes");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_ref_numOfModes_ is " << para_.no_acceleration_ref_numOfModes_);
+
+        para_.no_acceleration_same_combinationcoeff_allS_ = this->get_bool_value("no_acceleration_same_combinationcoeff_allS");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_same_combinationcoeff_allS_ is " << para_.no_acceleration_same_combinationcoeff_allS_);
+
+        para_.no_acceleration_whichS_combinationcoeff_ = this->get_int_value("no_acceleration_whichS_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_whichS_combinationcoeff_ is " << para_.no_acceleration_whichS_combinationcoeff_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // 'interleaved_*' settings
+        para_.interleaved_same_combinationcoeff_allS_ = this->get_bool_value("interleaved_same_combinationcoeff_allS");
+        GADGET_CONDITION_MSG(verboseMode_, "interleaved_same_combinationcoeff_allS_ is " << para_.interleaved_same_combinationcoeff_allS_);
+
+        para_.interleaved_ref_numOfModes_ = this->get_int_value("interleaved_ref_numOfModes");
+        GADGET_CONDITION_MSG(verboseMode_, "interleaved_ref_numOfModes_ is " << para_.interleaved_ref_numOfModes_);
+
+        para_.interleaved_whichS_combinationcoeff_ = this->get_int_value("interleaved_whichS_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "interleaved_whichS_combinationcoeff_ is " << para_.interleaved_whichS_combinationcoeff_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // 'embedded_*' settings
+        para_.embedded_averageall_ref_ = this->get_bool_value("embedded_averageall_ref");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_averageall_ref_ is " << para_.embedded_averageall_ref_);
+
+        para_.embedded_ref_numOfModes_ = this->get_int_value("embedded_ref_numOfModes");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_ref_numOfModes_ is " << para_.embedded_ref_numOfModes_);
+
+        para_.embedded_fullres_coilmap_ = this->get_bool_value("embedded_fullres_coilmap");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_fullres_coilmap_ is " << para_.embedded_fullres_coilmap_);
+
+        para_.embedded_fullres_coilmap_useHighestSignal_ = this->get_bool_value("embedded_fullres_coilmap_useHighestSignal");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_fullres_coilmap_useHighestSignal_ is " << para_.embedded_fullres_coilmap_useHighestSignal_);
+
+        para_.embedded_same_combinationcoeff_allS_ = this->get_bool_value("embedded_same_combinationcoeff_allS");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_same_combinationcoeff_allS_ is " << para_.embedded_same_combinationcoeff_allS_);
+
+        para_.embedded_whichS_combinationcoeff_ = this->get_int_value("embedded_whichS_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_whichS_combinationcoeff_ is " << para_.embedded_whichS_combinationcoeff_);
+
+        para_.embedded_ref_fillback_ = this->get_bool_value("embedded_ref_fillback");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_ref_fillback_ is " << para_.embedded_ref_fillback_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // 'separate_*' settings
+        para_.separate_averageall_ref_ = this->get_bool_value("separate_averageall_ref");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_averageall_ref_ is " << para_.separate_averageall_ref_);
+
+        para_.separate_ref_numOfModes_ = this->get_int_value("separate_ref_numOfModes");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_ref_numOfModes_ is " << para_.separate_ref_numOfModes_);
+
+        para_.separate_fullres_coilmap_ = this->get_bool_value("separate_fullres_coilmap");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_fullres_coilmap_ is " << para_.separate_fullres_coilmap_);
+
+        para_.separate_same_combinationcoeff_allS_ = this->get_bool_value("separate_same_combinationcoeff_allS");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_same_combinationcoeff_allS_ is " << para_.separate_same_combinationcoeff_allS_);
+
+        para_.separate_whichS_combinationcoeff_ = this->get_int_value("separate_whichS_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_whichS_combinationcoeff_ is " << para_.separate_whichS_combinationcoeff_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // coil compression coefficient sharing
+        para_.same_coil_compression_coeff_allS_ = this->get_bool_value("same_coil_compression_coeff_allS");
+        GADGET_CONDITION_MSG(verboseMode_, "same_coil_compression_coeff_allS_ is " << para_.same_coil_compression_coeff_allS_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        // The base-class parameters were already read and checked at the top of this function, so
+        // the members copied below are current; the former second, unchecked call is not repeated.
+
+        para_.recon_kspace_needed_ = recon_kspace_needed_;
+        para_.workOrderPara_ = workOrderPara_;
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusRecon2DTGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusRecon2DTGadget::setWorkOrder2DTParameters(WorkOrder2DTType* workOrder)
+{
+    // copies the 2DT-specific settings held in para_ onto the given work order; always returns true
+    WorkOrder2DTType& wo = *workOrder;
+    const GtPlusRecon2DTPara& src = para_;
+
+    wo.recon_kspace_needed_ = src.recon_kspace_needed_;
+    wo.same_coil_compression_coeff_allS_ = src.same_coil_compression_coeff_allS_;
+    // coil compression is on when either a threshold or a number of kept modes is positive
+    wo.coil_compression_ = ( src.workOrderPara_.coil_compression_thres_>0
+                          || src.workOrderPara_.coil_compression_num_modesKept_>0 );
+
+    // embedded_* settings
+    wo.embedded_averageall_ref_ = src.embedded_averageall_ref_;
+    wo.embedded_ref_numOfModes_ = src.embedded_ref_numOfModes_;
+    wo.embedded_fullres_coilmap_ = src.embedded_fullres_coilmap_;
+    wo.embedded_fullres_coilmap_useHighestSignal_ = src.embedded_fullres_coilmap_useHighestSignal_;
+    wo.embedded_same_combinationcoeff_allS_ = src.embedded_same_combinationcoeff_allS_;
+    wo.embedded_whichS_combinationcoeff_ = src.embedded_whichS_combinationcoeff_;
+    wo.embedded_ref_fillback_ = src.embedded_ref_fillback_;
+
+    // separate_* settings
+    wo.separate_averageall_ref_ = src.separate_averageall_ref_;
+    wo.separate_ref_numOfModes_ = src.separate_ref_numOfModes_;
+    wo.separate_fullres_coilmap_ = src.separate_fullres_coilmap_;
+    wo.separate_same_combinationcoeff_allS_ = src.separate_same_combinationcoeff_allS_;
+    wo.separate_whichS_combinationcoeff_ = src.separate_whichS_combinationcoeff_;
+
+    // interleaved_* settings
+    wo.interleaved_ref_numOfModes_ = src.interleaved_ref_numOfModes_;
+    wo.interleaved_same_combinationcoeff_allS_ = src.interleaved_same_combinationcoeff_allS_;
+    wo.interleaved_whichS_combinationcoeff_ = src.interleaved_whichS_combinationcoeff_;
+
+    // no_acceleration_* settings
+    wo.no_acceleration_averageall_ref_ = src.no_acceleration_averageall_ref_;
+    wo.no_acceleration_ref_numOfModes_ = src.no_acceleration_ref_numOfModes_;
+    wo.no_acceleration_same_combinationcoeff_allS_ = src.no_acceleration_same_combinationcoeff_allS_;
+    wo.no_acceleration_whichS_combinationcoeff_ = src.no_acceleration_whichS_combinationcoeff_;
+    return true;
+}
+
+int GtPlusRecon2DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // dimension order used throughout: [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //                                     0  1  2   3    4   5    6     7  8   9
+    GADGET_CHECK_RETURN(BaseClass::process_config(mb)==GADGET_OK, GADGET_FAIL);
+
+    // grow the memory manager up front, sized from the encoding matrix and the channel count
+    size_t bytesToReserve = matrix_size_encoding_[0]*kSpaceMaxAcqE1No_*num_acq_channels_*num_acq_channels_*sizeof(ValueType);
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon2DTGadget::Pre allocate : " << bytesToReserve/1024.0/1024.0 << " Megabytes ... ");
+
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Pre-allocate memory ... ", performTiming_);
+    mem_manager_->increase(bytesToReserve);
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    worker_spirit_L1_ncg_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_.gtPlus_mem_manager_ = mem_manager_;
+    worker_noacceleration_.gtPlus_mem_manager_ = mem_manager_;
+    worker_grappa_.gtPlus_mem_manager_ = mem_manager_;
+
+    if ( CloudComputing_ )
+    {
+        // cloud mode survives only if the node file parses and lists at least one node
+        if ( !this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_) )
+        {
+            CloudComputing_ = false;
+        }
+        else
+        {
+            CloudSize_ = gt_cloud_.size();
+            if ( CloudSize_ == 0 ) CloudComputing_ = false;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusRecon2DTGadget::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon2DTGadget::process(...) starts ... ");
+    // Reconstructs the work order carried by m2 and sends the images on, using the image headers in m1.
+    processed_called_times_++;
+
+    GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+    WorkOrderType* workOrder = m2->getObjectPtr();
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    GADGET_CONDITION_MSG(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << "]");
+
+    dimensions_ = *dims;
+
+    // fill in more parameters: recon sizes and FOVs, then the settings carried by the incoming work order
+    para_.reconSizeRO_ = (*dims)[0];
+    para_.reconSizeE1_ = reconE1_;
+    para_.reconSizeE2_ = reconE2_;
+    para_.encodingFOV_RO_ = field_of_view_encoding_[0];
+    para_.encodingFOV_E1_ = field_of_view_encoding_[1];
+    para_.encodingFOV_E2_ = field_of_view_encoding_[2];
+    para_.reconFOV_RO_ = field_of_view_recon_[0];
+    para_.reconFOV_E1_ = field_of_view_recon_[1];
+    para_.reconFOV_E2_ = field_of_view_recon_[2];
+
+    para_.workOrderPara_.CalibMode_ = workOrder->CalibMode_;
+    para_.workOrderPara_.InterleaveDim_ = workOrder->InterleaveDim_;
+
+    para_.workOrderPara_.acceFactorE1_ = workOrder->acceFactorE1_;
+    para_.workOrderPara_.acceFactorE2_ = workOrder->acceFactorE2_;
+
+    para_.workOrderPara_.kSpaceCenterRO_ = workOrder->kSpaceCenterRO_;
+    para_.workOrderPara_.kSpaceCenterEncode1_ = workOrder->kSpaceCenterEncode1_;
+    para_.workOrderPara_.kSpaceCenterEncode2_ = workOrder->kSpaceCenterEncode2_;
+
+    para_.workOrderPara_.kSpaceMaxRO_ = workOrder->kSpaceMaxRO_;
+    para_.workOrderPara_.kSpaceMaxEncode1_ = workOrder->kSpaceMaxEncode1_;
+    para_.workOrderPara_.kSpaceMaxEncode2_ = workOrder->kSpaceMaxEncode2_;
+
+    para_.workOrderPara_.start_RO_ = workOrder->start_RO_;
+    para_.workOrderPara_.end_RO_ = workOrder->end_RO_;
+
+    para_.workOrderPara_.start_E1_ = workOrder->start_E1_;
+    para_.workOrderPara_.end_E1_ = workOrder->end_E1_;
+
+    para_.workOrderPara_.start_E2_ = workOrder->start_E2_;
+    para_.workOrderPara_.end_E2_ = workOrder->end_E2_;
+
+    para_.workOrderPara_.workFlow_BufferKernel_ = workOrder->workFlow_BufferKernel_;
+    para_.workOrderPara_.workFlow_use_BufferedKernel_ = workOrder->workFlow_use_BufferedKernel_;
+    para_.workOrderPara_.num_channels_res_ = workOrder->num_channels_res_;
+
+    // ---------------------------------------------------------
+    // set the work flow
+    // ---------------------------------------------------------
+    workflow_.reconSizeRO_ = para_.reconSizeRO_;
+    workflow_.reconSizeE1_ = para_.reconSizeE1_;
+    workflow_.reconSizeE2_ = para_.reconSizeE2_;
+    workflow_.encodingFOV_RO_ = para_.encodingFOV_RO_;
+    workflow_.encodingFOV_E1_ = para_.encodingFOV_E1_;
+    workflow_.encodingFOV_E2_ = para_.encodingFOV_E2_;
+    workflow_.reconFOV_RO_ = para_.reconFOV_RO_;
+    workflow_.reconFOV_E1_ = para_.reconFOV_E1_;
+    workflow_.reconFOV_E2_ = para_.reconFOV_E2_;
+
+    workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = para_.dim_4th_;
+    workflow_.dim5th_ = para_.dim_5th_;
+    workflow_.WorkOrderShareDim_ = para_.workOrder_ShareDim_;
+    workflow_.performTiming_ = performTiming_;
+
+    // ---------------------------------------------------------
+    // set work order
+    // ---------------------------------------------------------
+    workOrder->copyFromPara(para_.workOrderPara_);
+
+    workOrder->CloudComputing_ = CloudComputing_;
+    workOrder->CloudSize_ = CloudSize_;
+    workOrder->gt_cloud_ = gt_cloud_;
+
+    // ---------------------------------------------------------
+    // set the worker
+    // ---------------------------------------------------------
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+    // if 'other' data is coming in, reconstruct it first and send it out as series image_series_+1
+    if ( workOrder->other_.get_number_of_elements() > 0 )
+    {
+        workOrder->duplicate(workOrder_recon_other_);
+        setWorkOrder2DTParameters(&workOrder_recon_other_);
+        workflow_.workOrder_ = &workOrder_recon_other_;
+
+        // perform a simple FFT recon: acceleration factors of 1, start/end indexes reset to -1
+        workOrder_recon_other_.acceFactorE1_ = 1;
+        workOrder_recon_other_.acceFactorE2_ = 1;
+
+        workOrder_recon_other_.start_RO_ = -1;
+        workOrder_recon_other_.end_RO_ = -1;
+        workOrder_recon_other_.start_E1_ = -1;
+        workOrder_recon_other_.end_E1_ = -1;
+        workOrder_recon_other_.start_E2_ = -1;
+        workOrder_recon_other_.end_E2_ = -1;
+
+        workflow_.worker_ = &worker_noacceleration_;
+        workflow_.setDataArray(workOrder->other_);
+        GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+
+        //hoNDArray<ValueType> resResized;
+        //GADGET_CHECK_RETURN(gtPlus_util_complex_.zpadResize2D(workflow_.res_, workflow_.reconSizeRO_, workflow_.reconSizeE1_, resResized), GADGET_FAIL);
+        //GADGET_CHECK_RETURN(this->sendOutRecon(images, resResized, image_series_+1, workOrder->dataDimStartingIndexes_, "Other"), GADGET_FAIL);
+
+       GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_+1, workOrder->dataDimStartingIndexes_, "Other"), GADGET_FAIL);
+
+        workflow_.res_.clear();
+        workflow_.data_ = NULL;
+        workflow_.ref_ = NULL;
+        workflow_.workOrder_ = NULL;
+
+        workOrder_recon_other_.reset();
+    }
+
+    // perform the recon
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Recon 2DT workorder ... ", performTiming_);
+
+    GADGET_CHECK_RETURN(this->generateKSpaceFilter(*workOrder), GADGET_FAIL);
+
+    workOrder->duplicate(workOrder_recon_);
+    setWorkOrder2DTParameters(&workOrder_recon_);
+
+    workflow_.workOrder_ = &workOrder_recon_;
+    if ( verboseMode_ )
+    {
+        workflow_.workOrder_->print(std::cout);
+    }
+
+    workflow_.setDataArray(workOrder->data_);
+    // an explicit reference array wins; interleaved calibration without one reuses the data itself
+    if ( workOrder->ref_.get_number_of_elements() > 0 )
+    {
+        workflow_.setRefArray(workOrder->ref_);
+    }
+    else if ( CalibMode_==Gadgetron::gtPlus::ISMRMRD_interleaved )
+    {
+        workOrder->ref_ = workOrder->data_;
+        workflow_.setRefArray(workOrder->ref_);
+    }
+
+    // pick the worker: with E1 acceleration SPIRIT, L1-SPIRIT or (by default) GRAPPA; otherwise no-acceleration
+    if ( workOrder->acceFactorE1_ > 1 )
+    {
+        if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_;
+        }
+        else if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_L1SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_L1_ncg_;
+        }
+        else
+        {
+            workflow_.worker_ = &worker_grappa_;
+        }
+    }
+    else
+    {
+        workflow_.worker_ = &worker_noacceleration_;
+    }
+
+    GADGET_CHECK_RETURN(workflow_.preProcessing(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.postProcessing(), GADGET_FAIL);
+
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+    // optional debug export of the squeezed result, named after the call counter
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "Recon2DT_" << processed_called_times_;
+
+        hoNDArray<GT_Complex8> res = workflow_.res_;
+        res.squeeze();
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, res, ostr.str());
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_, workOrder->dataDimStartingIndexes_, "Image"), GADGET_FAIL);
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon2DTGadget::process(...) ends ... ");
+
+    // reset the status: clear the workflow's data/ref/noise/work-order pointers
+    workflow_.data_ = NULL;
+    workflow_.ref_ = NULL;
+    workflow_.noise_ = NULL;
+    workflow_.workOrder_ = NULL;
+    // Gadgetron::clear(&workflow_.res_);
+    // NOTE(review): m1 is released only on this success path - presumably the caller releases it when GADGET_FAIL is returned; confirm
+    m1->release();
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusRecon2DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadget.h b/gadgets/gtPlus/GtPlusRecon2DTGadget.h
new file mode 100644
index 0000000..4bc36c6
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadget.h
@@ -0,0 +1,63 @@
+/** \file   GtPlusRecon2DTGadget.h
+    \brief  This gadget encapsulates the reconstruction for 2DT cases.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusReconGadget.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+#include "GtPlusRecon2DTCloudPackage.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusRecon2DTGadget : public GtPlusReconGadget
+{
+public:
+    GADGET_DECLARE(GtPlusRecon2DTGadget);
+
+    typedef GtPlusReconGadget BaseClass;
+
+    typedef BaseClass::ValueType ValueType;
+
+    typedef BaseClass::WorkOrderType WorkOrderType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrder2DTType;
+
+    typedef BaseClass::DimensionRecordType DimensionRecordType;
+
+    GtPlusRecon2DTGadget();
+    ~GtPlusRecon2DTGadget();
+    // 2DT recon parameters: read by readParameters() and completed per work order in process()
+    GtPlusRecon2DTPara para_;
+
+protected:
+    // overrides that read the parameters, configure the gadget and reconstruct each work order
+    virtual bool readParameters();
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    // set 2DT specific work order parameters (copied from para_); always returns true
+    bool setWorkOrder2DTParameters(WorkOrder2DTType* workOrder);
+
+    // work flow on which process() calls preProcessing(), recon() and postProcessing()
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // workers: process() points workflow_.worker_ at one of these per work order
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // workOrder for recon (duplicate of the incoming work order, handed to workflow_)
+    WorkOrder2DTType workOrder_recon_;
+
+    // workOrder for recon 'other' data (reset again once that data has been sent out)
+    WorkOrder2DTType workOrder_recon_other_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp
new file mode 100644
index 0000000..bf1f7eb
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.cpp
@@ -0,0 +1,506 @@
+
+#include "GtPlusRecon2DTGadgetCloud.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusRecon2DTGadgetCloud::GtPlusRecon2DTGadgetCloud() : BaseClass(), curr_node_(0), num_of_jobs_(0)
+{
+    packages_sent_.resize(1024);                    // the three job tables are pre-sized to 1024 entries;
+    packages_received_.resize(1024);                // process() indexes each of them by num_of_jobs_
+    packages_passed_to_next_gadget_.resize(1024);   // NOTE(review): no later growth is visible in this part of the file - confirm the limit
+    gt_timer_2DT_cloud_.set_timing_in_destruction(false);
+}
+
+GtPlusRecon2DTGadgetCloud::~GtPlusRecon2DTGadgetCloud()
+{
+    // NOTE(review): the readers_/writers_ objects created with new in process_config() are not deleted here - presumably controller_ owns them; confirm
+}
+
+int GtPlusRecon2DTGadgetCloud::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+    GADGET_CHECK_RETURN(BaseClass::process_config(mb)==GADGET_OK, GADGET_FAIL);
+
+    if ( CloudComputing_ )
+    {
+        bool parseSuccess = this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_);
+        if ( parseSuccess )
+        {
+            CloudSize_ = gt_cloud_.size();
+        }
+        // an unreadable node file or an empty cloud disables cloud computing (same policy as GtPlusRecon2DTGadget::process_config)
+        if ( !parseSuccess || CloudSize_ == 0 ) CloudComputing_ = false;
+
+        if ( CloudComputing_ )
+        {
+            // set up the cloud; every failure below logs, closes the controller and falls back to non-cloud mode
+            if (controller_.open () == -1)
+            {
+                GADGET_ERROR_MSG("Cloud controller cannot open the cloud ...");
+                controller_.handle_close (ACE_INVALID_HANDLE, 0);
+                CloudComputing_ = false;
+            }
+            else
+            {
+                readers_.resize(CloudSize_, NULL);
+                writers_.resize(CloudSize_, NULL);
+
+                for ( unsigned int j=0; j<CloudSize_; j++ )
+                {
+                    readers_[j] = new GtPlus2DTGadgetCloudJobMessageReaderCPFL();
+                    writers_[j] = new GtPlus2DTGadgetCloudJobMessageWriterCPFL();
+                }
+
+                if ( controller_.createConnector(gt_cloud_, GADGET_MESSAGE_GADGETCLOUD_JOB, readers_, GADGET_MESSAGE_GADGETCLOUD_JOB, writers_) != 0 )
+                {
+                    GADGET_ERROR_MSG("Cloud controller_ creates connectors failed ...");
+                    controller_.handle_close (ACE_INVALID_HANDLE, 0);
+                    CloudComputing_ = false;
+                }
+                else if ( controller_.connectToCloud(gt_cloud_) != 0 )
+                {
+                    GADGET_ERROR_MSG("Cloud controller_ cannot connect to the cloud ...");
+                    controller_.handle_close (ACE_INVALID_HANDLE, 0);
+                    CloudComputing_ = false;
+                }
+            }
+        }
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusRecon2DTGadgetCloud::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon2DTGadgetCloud::process(...) starts ... ");
+    // In cloud mode each work order is packaged and handed to the next node in turn; otherwise, or if the hand-off fails, the base-class recon runs locally.
+    processed_called_times_++;
+
+    // start a gadget level timer
+    if ( processed_called_times_ == 1 )
+    {
+        GADGET_START_TIMING(gt_timer_2DT_cloud_, "GtPlusRecon2DTGadgetCloud::process(...) gadegt level timer ... ");
+    }
+
+    // send out the package to current node; once the pre-sized job tables are full, fall back to the local recon
+    if ( CloudComputing_ && num_of_jobs_ < packages_sent_.size() )
+    {
+        GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+        WorkOrderType* workOrder = m2->getObjectPtr();
+
+        boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+        GADGET_CONDITION_MSG(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+            << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+            << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << "]");
+
+        dimensions_ = *dims;
+
+        // fill in more parameters
+        para_.reconSizeRO_ = (*dims)[0];
+        para_.reconSizeE1_ = reconE1_;
+        para_.reconSizeE2_ = reconE2_;
+        para_.encodingFOV_RO_ = field_of_view_encoding_[0];
+        para_.encodingFOV_E1_ = field_of_view_encoding_[1];
+        para_.encodingFOV_E2_ = field_of_view_encoding_[2];
+        para_.reconFOV_RO_ = field_of_view_recon_[0];
+        para_.reconFOV_E1_ = field_of_view_recon_[1];
+        para_.reconFOV_E2_ = field_of_view_recon_[2];
+
+        para_.workOrderPara_.CalibMode_ = workOrder->CalibMode_;
+        para_.workOrderPara_.InterleaveDim_ = workOrder->InterleaveDim_;
+
+        para_.workOrderPara_.acceFactorE1_ = workOrder->acceFactorE1_;
+        para_.workOrderPara_.acceFactorE2_ = workOrder->acceFactorE2_;
+
+        para_.workOrderPara_.kSpaceCenterRO_ = workOrder->kSpaceCenterRO_;
+        para_.workOrderPara_.kSpaceCenterEncode1_ = workOrder->kSpaceCenterEncode1_;
+        para_.workOrderPara_.kSpaceCenterEncode2_ = workOrder->kSpaceCenterEncode2_;
+
+        para_.workOrderPara_.kSpaceMaxRO_ = workOrder->kSpaceMaxRO_;
+        para_.workOrderPara_.kSpaceMaxEncode1_ = workOrder->kSpaceMaxEncode1_;
+        para_.workOrderPara_.kSpaceMaxEncode2_ = workOrder->kSpaceMaxEncode2_;
+
+        para_.workOrderPara_.start_RO_ = workOrder->start_RO_;
+        para_.workOrderPara_.end_RO_ = workOrder->end_RO_;
+
+        para_.workOrderPara_.start_E1_ = workOrder->start_E1_;
+        para_.workOrderPara_.end_E1_ = workOrder->end_E1_;
+
+        para_.workOrderPara_.start_E2_ = workOrder->start_E2_;
+        para_.workOrderPara_.end_E2_ = workOrder->end_E2_;
+
+        para_.workOrderPara_.workFlow_BufferKernel_ = workOrder->workFlow_BufferKernel_;
+        para_.workOrderPara_.workFlow_use_BufferedKernel_ = workOrder->workFlow_use_BufferedKernel_;
+        para_.workOrderPara_.num_channels_res_ = workOrder->num_channels_res_;
+
+        // set up a cloud package
+        CloudPackageType package;
+        package.para = para_;
+
+        packages_sent_[num_of_jobs_] = package;
+        // (the k-space and ref arrays of the sent package are copied in once, further down)
+
+        packages_received_[num_of_jobs_] = package;
+
+        packages_passed_to_next_gadget_[num_of_jobs_].first = num_of_jobs_;
+        packages_passed_to_next_gadget_[num_of_jobs_].second = false;
+
+        // store image headers
+        GtPlusGadgetImageArray imArray;
+        image_headers_.push_back(imArray);
+        image_headers_[image_headers_.size()-1].copy(*images);
+
+        // send the package to current node
+        std::vector<CloudPackageType* > jobListCloud(1);
+        std::vector<CloudPackageType* > completedJobListCloud(1);
+        std::vector<int> node_ids(1, curr_node_);
+
+        jobListCloud[0] = &packages_sent_[num_of_jobs_];
+        completedJobListCloud[0] = &packages_received_[num_of_jobs_];
+
+        // set the data and ref arrays
+        jobListCloud[0]->kspace = workOrder->data_;
+        if ( workOrder->ref_.get_number_of_elements() > 0 )
+        {
+            jobListCloud[0]->ref = workOrder->ref_;
+        }
+        else if ( CalibMode_==Gadgetron::gtPlus::ISMRMRD_interleaved )
+        {
+            jobListCloud[0]->ref = workOrder->data_;
+        }
+
+        num_of_jobs_++;
+
+        if ( controller_.runJobsOnCloud(jobListCloud, completedJobListCloud, node_ids) != 0 )
+        {
+            GADGET_ERROR_MSG("Cloud controller runs jobs on the cloud failed ...");
+            controller_.handle_close (ACE_INVALID_HANDLE, 0);
+
+            // run locally, then flag this job (already counted by num_of_jobs_++ above) as passed on
+            int retval = BaseClass::process(m1, m2);
+            packages_passed_to_next_gadget_[num_of_jobs_-1].second = true;
+
+            return retval;
+        }
+
+        curr_node_++;
+        if ( curr_node_ >= CloudSize_ ) curr_node_ = 0;
+
+        m1->release();
+    }
+    else
+    {
+        return BaseClass::process(m1, m2);
+    }
+
+    return GADGET_OK;
+}
+
+bool GtPlusRecon2DTGadgetCloud::processJob(CloudPackageType& jobSent, CloudPackageType& jobReceived)
+{
+    try
+    {
+        GtPlusRecon2DTCloudPackageCPFL* job = &jobSent;
+
+        boost::shared_ptr< std::vector<size_t> > dims = job->kspace.get_dimensions();
+
+        GADGET_CONDITION_MSG(verboseMode_, "job array size : [Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+            << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+            << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << "]");
+
+        GtPlusRecon2DTPara& para = job->para;
+
+        // ---------------------------------------------------------
+        // set the work flow
+        // ---------------------------------------------------------
+        workflow_.reconSizeRO_ = para.reconSizeRO_;
+        workflow_.reconSizeE1_ = para.reconSizeE1_;
+        workflow_.reconSizeE2_ = para.reconSizeE2_;
+        workflow_.encodingFOV_RO_ = para.encodingFOV_RO_;
+        workflow_.encodingFOV_E1_ = para.encodingFOV_E1_;
+        workflow_.encodingFOV_E2_ = para.encodingFOV_E2_;
+        workflow_.reconFOV_RO_ = para.reconFOV_RO_;
+        workflow_.reconFOV_E1_ = para.reconFOV_E1_;
+        workflow_.reconFOV_E2_ = para.reconFOV_E2_;
+
+        // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+        workflow_.dim4th_ = para.dim_4th_;
+        workflow_.dim5th_ = para.dim_5th_;
+        workflow_.WorkOrderShareDim_ = para.workOrder_ShareDim_;
+        workflow_.performTiming_ = performTiming_;
+
+        // ---------------------------------------------------------
+        // set work order
+        // ---------------------------------------------------------
+        WorkOrder2DTType workOrder;
+
+        workOrder.copyFromPara(para.workOrderPara_);
+
+        workOrder.CloudComputing_ = CloudComputing_;
+        workOrder.CloudSize_ = CloudSize_;
+        workOrder.gt_cloud_ = gt_cloud_;
+
+        workOrder.data_ = job->kspace;
+        workOrder.ref_ = job->ref;
+
+        // ---------------------------------------------------------
+        // set the worker
+        // ---------------------------------------------------------
+        worker_grappa_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_noacceleration_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_spirit_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+        if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+        // set the worker
+        worker_grappa_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_noacceleration_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_spirit_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+        worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+        if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+        if ( verboseMode_ )
+        {
+            workOrder.print(std::cout);
+        }
+
+        // perform the recon
+        GADGET_START_TIMING_CONDITION(gt_timer1_, "Recon 2DT workorder on master node ... ", performTiming_);
+
+        GADGET_CHECK_RETURN(this->generateKSpaceFilter(workOrder), GADGET_FAIL);
+
+        workOrder.duplicate(workOrder_recon_);
+        this->setWorkOrder2DTParameters(&workOrder_recon_);
+
+        workflow_.workOrder_ = &workOrder_recon_;
+        if ( verboseMode_ )
+        {
+            workflow_.workOrder_->print(std::cout);
+        }
+
+        workflow_.setDataArray(workOrder.data_);
+
+        if ( workOrder.ref_.get_number_of_elements() > 0 )
+        {
+            workflow_.setRefArray(workOrder.ref_);
+        }
+        else if ( para.workOrderPara_.CalibMode_==Gadgetron::gtPlus::ISMRMRD_interleaved )
+        {
+            workOrder.ref_ = workOrder.data_;
+            workflow_.setRefArray(workOrder.ref_);
+        }
+
+        // set the work flow for worker and workOrder
+        if ( workOrder.acceFactorE1_ > 1 )
+        {
+            if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_SPIRIT )
+            {
+                workflow_.worker_ = &worker_spirit_;
+            }
+            else if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_L1SPIRIT )
+            {
+                workflow_.worker_ = &worker_spirit_L1_ncg_;
+            }
+            else
+            {
+                workflow_.worker_ = &worker_grappa_;
+            }
+        }
+        else
+        {
+            workflow_.worker_ = &worker_noacceleration_;
+        }
+
+        bool succeed = true;
+        succeed = workflow_.preProcessing();
+        if ( succeed )
+        {
+            succeed = workflow_.recon();
+            if ( succeed )
+            {
+                succeed = workflow_.postProcessing();
+            }
+        }
+
+        GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+        if ( !debugFolder_fullPath_.empty() )
+        {
+            std::ostringstream ostr;
+            ostr << "Recon2DT";
+
+            hoNDArray<GT_Complex8> res = workflow_.res_;
+            res.squeeze();
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, res, ostr.str());
+        }
+
+        if ( succeed )
+        {
+            jobReceived.complexIm = workflow_.res_;
+        }
+        else
+        {
+            jobReceived.complexIm.clear();
+            jobReceived.res.clear();
+        }
+
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon2DTGadgetCloud::process(...) ends ... ");
+
+        // reset the status
+        workflow_.data_ = NULL;
+        workflow_.ref_ = NULL;
+        workflow_.noise_ = NULL;
+        workflow_.workOrder_ = NULL;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in GtPlusRecon2DTGadgetCloud::processJob(CloudPackageType& jobSent, CloudPackageType& jobReceived) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusRecon2DTGadgetCloud::close(unsigned long flags)
+{
+    // Closes the base gadget; when flags!=0 and the cloud is in use, waits for the cloud jobs, reprocesses
+    // locally those that returned nothing and sends out every result not yet passed on. GADGET_FAIL on error.
+    GADGET_CONDITION_MSG(true, "GtPlusRecon2DTGadgetCloud - close(flags) : " << flags);
+    if ( BaseClass::close(flags) != GADGET_OK ) return GADGET_FAIL;
+    if ( flags!=0 )
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon2DTGadgetCloud number of total jobs : " << num_of_jobs_ << " ... ");
+
+        if ( CloudComputing_ )
+        {
+            controller_.closeCloudNode();
+
+            // register a job handler; it lives on this stack frame and is unregistered right after the wait
+            GtPlusRecon2DTGadgetCloudSender gadgetJobHandler;
+            gadgetJobHandler.gadget_ = this;
+            controller_.job_handler_ = &gadgetJobHandler;
+
+            controller_.waitForJobToComplete();
+            controller_.job_handler_ = NULL; // never leave the controller pointing at a dead local
+            // if some jobs are not completed successfully, reprocess them; otherwise, send out images
+            std::vector<DimensionRecordType> dataDimStartingIndexes;
+            const size_t N = image_headers_.size(); // size_t: no narrowing of the container size
+            size_t ii;
+            for ( ii=0; ii<N; ii++ )
+            {
+                bool jobIsOk = true;
+                if ( (packages_received_[ii].complexIm.get_number_of_elements() == 0) && (packages_received_[ii].res.get_number_of_elements() == 0) )
+                {
+                    // if the cloud goes wrong, do not try again
+                    CloudComputing_ = false;
+                    jobIsOk = this->processJob(packages_sent_[ii], packages_received_[ii]);
+                }
+
+                // forward every usable result that has not been passed on yet
+                if ( jobIsOk && !packages_passed_to_next_gadget_[ii].second )
+                {
+                    GADGET_CHECK_RETURN(this->sendOutRecon(&image_headers_[ii], packages_received_[ii].complexIm, image_series_, dataDimStartingIndexes, "Image"), GADGET_FAIL);
+                    // record the hand-off, as GtPlusRecon2DTGadgetCloudSender::processJob does
+                    packages_passed_to_next_gadget_[ii].second = true;
+                }
+
+                if ( !debugFolder2_fullPath_.empty() )
+                {
+                    std::ostringstream ostr;
+                    ostr << "GadgetCloud_Recon2DT_" << ii;
+
+                    hoNDArray<GT_Complex8> res = packages_received_[ii].complexIm;
+                    res.squeeze();
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, res, ostr.str());
+                }
+            }
+        }
+
+        GADGET_STOP_TIMING(gt_timer_2DT_cloud_);
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusRecon2DTGadgetCloud)
+
+// -------------------------------------------------------------------------------------------
+// GtPlusRecon2DTGadgetCloudSender
+// -------------------------------------------------------------------------------------------
+
+GtPlusRecon2DTGadgetCloudSender::GtPlusRecon2DTGadgetCloudSender() : gadget_(NULL)
+{ // gadget_ starts as NULL instead of an indeterminate value; the owner assigns it before registering the handler
+}
+// nothing is released here: gadget_ is not deleted by the handler
+GtPlusRecon2DTGadgetCloudSender::~GtPlusRecon2DTGadgetCloudSender()
+{
+}
+
+bool GtPlusRecon2DTGadgetCloudSender::processJob(int jobID, GtPlusRecon2DTCloudPackage< std::complex<float> >& ajob)
+{
+    // Sends the result stored for job jobID to the next gadget, at most once. ajob itself is not read here.
+    // Returns false for an unattached handler, an out-of-range jobID, a failed send or an exception.
+    try
+    {
+        if ( gadget_ == NULL || jobID < 0 ) return false;
+        const size_t job = static_cast<size_t>(jobID);
+        if ( job >= gadget_->packages_received_.size() || job >= gadget_->image_headers_.size()
+            || job >= gadget_->packages_passed_to_next_gadget_.size() ) return false;
+        // nothing came back for this job: send nothing and report success
+        if ( (gadget_->packages_received_[job].complexIm.get_number_of_elements() == 0)
+            && (gadget_->packages_received_[job].res.get_number_of_elements() == 0) )
+        {
+            return true;
+        }
+
+        if ( !gadget_->packages_passed_to_next_gadget_[job].second )
+        {
+            std::vector<DimensionRecordType> dataDimStartingIndexes;
+            gadget_->packages_passed_to_next_gadget_[job].second = true;
+            GADGET_CHECK_RETURN(gadget_->sendOutRecon(&gadget_->image_headers_[job],
+                gadget_->packages_received_[job].complexIm, gadget_->image_series_, dataDimStartingIndexes, "Image"), false);
+
+            if ( !gadget_->debugFolder2_fullPath_.empty() )
+            {
+                std::ostringstream ostr;
+                ostr << "Recon2DT_" << jobID;
+                hoNDArray<GT_Complex8> res = gadget_->packages_received_[job].complexIm;
+                res.squeeze();
+                GADGET_EXPORT_ARRAY_COMPLEX(gadget_->debugFolder2_fullPath_, gadget_->gt_exporter_, res, ostr.str());
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_DEBUG1("GtPlusRecon2DTGadgetCloudSender handling close...\n");
+        return false;
+    }
+
+    return true;
+}
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h
new file mode 100644
index 0000000..cbeb0f7
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon2DTGadgetCloud.h
@@ -0,0 +1,91 @@
+/** \file   GtPlusRecon2DTGadgetCloud.h
+    \brief  This is the gateway gadget for the dual layer GtPlus cloud.
+            For every incoming k-space data package, it is sent to a first layer gadget.
+            If a data package was not processed successfully and results were not returned to this gadget,
+            the reconstruction will be performed locally.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+            Magnetic Resonance in Medicine on Dec 2013.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusRecon2DTGadget.h"
+#include "GadgetCloudController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GtPlusRecon2DTCloudPackage.h"
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusRecon2DTGadgetCloud : public GtPlusRecon2DTGadget
+{
+public:
+    GADGET_DECLARE(GtPlusRecon2DTGadgetCloud);
+    // shorthand for the 2DT gadget this class extends
+    typedef GtPlusRecon2DTGadget BaseClass;
+    // types re-exported from the base class
+    typedef BaseClass::ValueType ValueType;
+    typedef BaseClass::WorkOrderType WorkOrderType;
+    typedef BaseClass::WorkOrder2DTType WorkOrder2DTType;
+    typedef BaseClass::DimensionRecordType DimensionRecordType;
+    // job package type; processJob() takes one as the sent job and one as the received job
+    typedef GtPlusRecon2DTCloudPackage<ValueType> CloudPackageType;
+    // type of controller_
+    typedef Gadgetron::GadgetCloudController< CloudPackageType > GTCloudControllerType;
+
+    GtPlusRecon2DTGadgetCloud();
+    ~GtPlusRecon2DTGadgetCloud();
+    // with flags != 0 and cloud computing on, waits for the cloud jobs and sends out the remaining results
+    virtual int close(unsigned long flags);
+    // per-job packages: close() passes packages_sent_[i] and packages_received_[i] to processJob() for job i
+    std::vector<CloudPackageType> packages_sent_;
+    std::vector<CloudPackageType> packages_received_;
+
+    // .second is true once the result of that package has been passed to the next gadget (NOTE(review): .first is not used in the code shown — confirm its meaning)
+    std::vector< std::pair<unsigned int, bool> >  packages_passed_to_next_gadget_;
+
+    // store the image headers for every incoming package; entry i is sent out together with the result of job i
+    std::vector<GtPlusGadgetImageArray> image_headers_;
+
+protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+    // reconstructs one job on this node; close() calls it for jobs whose received package is empty
+    virtual bool processJob(CloudPackageType& jobSent, CloudPackageType& jobReceived);
+    // cloud controller; close() uses it to close the cloud nodes and to wait for the jobs
+    GTCloudControllerType controller_;
+    // NOTE(review): presumably the index of the cloud node used for the next job — confirm in process()
+    unsigned int curr_node_;
+    // job counter; close() logs it as the number of total jobs
+    unsigned int num_of_jobs_;
+    // message readers/writers held as raw pointers; NOTE(review): ownership is not visible here — confirm who deletes them
+    std::vector<GadgetMessageReader*> readers_;
+    std::vector<GadgetMessageWriter*> writers_;
+
+    // clock for timing; stopped at the end of close() when flags != 0
+    Gadgetron::GadgetronTimer gt_timer_2DT_cloud_;
+};
+
+class GtPlusRecon2DTGadgetCloudSender : public GadgetCloudJobProcessHandler< GtPlusRecon2DTCloudPackage< std::complex<float> > >
+{
+public:
+    // job handler that GtPlusRecon2DTGadgetCloud::close() registers with its cloud controller
+    typedef std::pair<Gadgetron::gtPlus::ISMRMRDDIM, size_t> DimensionRecordType;
+
+    GtPlusRecon2DTGadgetCloudSender();
+    virtual ~GtPlusRecon2DTGadgetCloudSender();
+    // sends the result stored in the gadget for job jobID to the next gadget if it was not sent before
+    virtual bool processJob(int jobID, GtPlusRecon2DTCloudPackage< std::complex<float> >& ajob);
+
+    // pointer to the gadget whose packages and image headers are read; set by the gadget before use
+    GtPlusRecon2DTGadgetCloud* gadget_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon3DTGadget.cpp b/gadgets/gtPlus/GtPlusRecon3DTGadget.cpp
new file mode 100644
index 0000000..f0a0014
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon3DTGadget.cpp
@@ -0,0 +1,436 @@
+
+#include "GtPlusRecon3DTGadget.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusRecon3DTGadget::GtPlusRecon3DTGadget() : BaseClass()
+{
+    // no work beyond constructing the base class
+}
+
+GtPlusRecon3DTGadget::~GtPlusRecon3DTGadget()
+{
+    // empty: the destructor body releases nothing
+}
+
+bool GtPlusRecon3DTGadget::readParameters()
+{
+    // Reads the base class parameters once, then the 3DT gadget properties into para_; returns false on failure or exception.
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(BaseClass::readParameters());
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlusRecon3DTGadget parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("dim_5th");
+        para_.dim_5th_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "dim_5th_ is " << *str);
+
+        str = this->get_string_value("workOrder_ShareDim");
+        para_.workOrder_ShareDim_ = gtPlus_util_.getISMRMRDDimFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "workOrder_ShareDim_ is " << *str);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        para_.no_acceleration_averageall_ref_ = this->get_bool_value("no_acceleration_averageall_ref");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_averageall_ref_ is " << para_.no_acceleration_averageall_ref_);
+
+        para_.no_acceleration_same_combinationcoeff_allN_ = this->get_bool_value("no_acceleration_same_combinationcoeff_allN");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_same_combinationcoeff_allN_ is " << para_.no_acceleration_same_combinationcoeff_allN_);
+
+        para_.no_acceleration_whichN_combinationcoeff_ = this->get_int_value("no_acceleration_whichN_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "no_acceleration_whichN_combinationcoeff_ is " << para_.no_acceleration_whichN_combinationcoeff_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        para_.interleaved_same_combinationcoeff_allN_ = this->get_bool_value("interleaved_same_combinationcoeff_allN");
+        GADGET_CONDITION_MSG(verboseMode_, "interleaved_same_combinationcoeff_allN_ is " << para_.interleaved_same_combinationcoeff_allN_);
+
+        para_.interleaved_whichN_combinationcoeff_ = this->get_int_value("interleaved_whichN_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "interleaved_whichN_combinationcoeff_ is " << para_.interleaved_whichN_combinationcoeff_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        para_.embedded_averageall_ref_ = this->get_bool_value("embedded_averageall_ref");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_averageall_ref_ is " << para_.embedded_averageall_ref_);
+
+        para_.embedded_fullres_coilmap_ = this->get_bool_value("embedded_fullres_coilmap");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_fullres_coilmap_ is " << para_.embedded_fullres_coilmap_);
+
+        para_.embedded_same_combinationcoeff_allN_ = this->get_bool_value("embedded_same_combinationcoeff_allN");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_same_combinationcoeff_allN_ is " << para_.embedded_same_combinationcoeff_allN_);
+
+        para_.embedded_whichN_combinationcoeff_ = this->get_int_value("embedded_whichN_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_whichN_combinationcoeff_ is " << para_.embedded_whichN_combinationcoeff_);
+
+        para_.embedded_ref_fillback_ = this->get_bool_value("embedded_ref_fillback");
+        GADGET_CONDITION_MSG(verboseMode_, "embedded_ref_fillback_ is " << para_.embedded_ref_fillback_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        para_.separate_averageall_ref_ = this->get_bool_value("separate_averageall_ref");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_averageall_ref_ is " << para_.separate_averageall_ref_);
+
+        para_.separate_fullres_coilmap_ = this->get_bool_value("separate_fullres_coilmap");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_fullres_coilmap_ is " << para_.separate_fullres_coilmap_);
+
+        para_.separate_same_combinationcoeff_allN_ = this->get_bool_value("separate_same_combinationcoeff_allN");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_same_combinationcoeff_allN_ is " << para_.separate_same_combinationcoeff_allN_);
+
+        para_.separate_whichN_combinationcoeff_ = this->get_int_value("separate_whichN_combinationcoeff");
+        GADGET_CONDITION_MSG(verboseMode_, "separate_whichN_combinationcoeff_ is " << para_.separate_whichN_combinationcoeff_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        para_.same_coil_compression_coeff_allN_ = this->get_bool_value("same_coil_compression_coeff_allN");
+        GADGET_CONDITION_MSG(verboseMode_, "same_coil_compression_coeff_allN_ is " << para_.same_coil_compression_coeff_allN_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        // The base class members were filled by the checked BaseClass::readParameters() call at the top,
+        // so they are only copied into para_ here; the base class is not read a second time.
+
+        para_.recon_kspace_needed_ = recon_kspace_needed_;
+        para_.workOrderPara_ = workOrderPara_;
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusRecon3DTGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusRecon3DTGadget::setWorkOrder3DTParameters(WorkOrder3DTType* workOrder)
+{
+    // Copies the gadget-level reconstruction options (recon_kspace_needed_ and para_) into *workOrder.
+    // Always returns true.
+    WorkOrder3DTType& dst = *workOrder;
+
+    dst.recon_kspace_needed_ = recon_kspace_needed_;
+
+    // coil compression is switched on as soon as a threshold or a number of kept modes is given
+    const bool compressCoils = ( para_.workOrderPara_.coil_compression_thres_>0 || para_.workOrderPara_.coil_compression_num_modesKept_>0 );
+    dst.coil_compression_ = compressCoils;
+    dst.same_coil_compression_coeff_allN_ = para_.same_coil_compression_coeff_allN_;
+
+    // options for the case without acceleration
+    dst.no_acceleration_averageall_ref_ = para_.no_acceleration_averageall_ref_;
+    dst.no_acceleration_same_combinationcoeff_allN_ = para_.no_acceleration_same_combinationcoeff_allN_;
+    dst.no_acceleration_whichN_combinationcoeff_ = para_.no_acceleration_whichN_combinationcoeff_;
+
+    // options for the embedded reference
+    dst.embedded_averageall_ref_ = para_.embedded_averageall_ref_;
+    dst.embedded_fullres_coilmap_ = para_.embedded_fullres_coilmap_;
+    dst.embedded_same_combinationcoeff_allN_ = para_.embedded_same_combinationcoeff_allN_;
+    dst.embedded_whichN_combinationcoeff_ = para_.embedded_whichN_combinationcoeff_;
+    dst.embedded_ref_fillback_ = para_.embedded_ref_fillback_;
+
+    // options for the separate reference
+    dst.separate_averageall_ref_ = para_.separate_averageall_ref_;
+    dst.separate_fullres_coilmap_ = para_.separate_fullres_coilmap_;
+    dst.separate_same_combinationcoeff_allN_ = para_.separate_same_combinationcoeff_allN_;
+    dst.separate_whichN_combinationcoeff_ = para_.separate_whichN_combinationcoeff_;
+
+    // the interleaved_* options held in para_ are not forwarded to the work order
+
+    return true;
+}
+
+int GtPlusRecon3DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // Configures the base class, pre-allocates the reconstruction memory through mem_manager_, shares the
+    // memory manager with the four workers and, when cloud computing is on, loads the cloud node list.
+    // Returns GADGET_FAIL only if the base class configuration fails.
+    // data dimension order: [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //                         0  1  2   3    4   5    6     7  8   9
+    GADGET_CHECK_RETURN(BaseClass::process_config(mb)==GADGET_OK, GADGET_FAIL);
+
+    const bool keepModes = ( para_.workOrderPara_.coil_compression_num_modesKept_ > 0 );
+    const bool isSpirit = ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_SPIRIT
+                        || para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_L1SPIRIT );
+
+    // pre-allocate memory
+    size_t numOfBytes;
+    if ( !keepModes )
+    {
+        // channels x channels buffer, scaled by 0.8 for SPIRIT / L1-SPIRIT and by 0.6 otherwise
+        numOfBytes = (double)matrix_size_encoding_[0]*kSpaceMaxAcqE1No_*kSpaceMaxAcqE2No_*num_acq_channels_*num_acq_channels_*sizeof(ValueType)*( isSpirit ? 0.8 : 0.6 );
+    }
+    else
+    {
+        // NOTE(review): this size is the same whether or not num_acq_channels_ exceeds twice the kept modes
+        numOfBytes = (double)matrix_size_encoding_[0]*kSpaceMaxAcqE1No_*kSpaceMaxAcqE2No_*num_acq_channels_*para_.workOrderPara_.coil_compression_num_modesKept_*sizeof(ValueType);
+
+        // GRAPPA with job_num_of_N_ set: job_num_of_N_ takes the place of matrix_size_encoding_[0]
+        if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_GRAPPA && para_.workOrderPara_.job_num_of_N_>0 )
+        {
+            numOfBytes = (double)para_.workOrderPara_.job_num_of_N_*kSpaceMaxAcqE1No_*kSpaceMaxAcqE2No_*num_acq_channels_*para_.workOrderPara_.coil_compression_num_modesKept_*sizeof(ValueType)*1.5;
+        }
+    }
+
+    // at most 12 channels, or more than half of the channels kept as modes: request twice as much
+    const bool doubleRequest = (num_acq_channels_<=12) || (keepModes && 2*para_.workOrderPara_.coil_compression_num_modesKept_>num_acq_channels_);
+    if ( doubleRequest )
+    {
+        numOfBytes *= 2;
+    }
+
+    // a request above 128 GB is replaced by a 4 GB request
+    if ( numOfBytes > 1024*1024*1024*128.0 )
+    {
+        numOfBytes = 1024*1024*1024*4.0;
+    }
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon3DTGadget::Pre allocate : " << numOfBytes/1024.0/1024.0 << " Megabytes ... ");
+
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Pre-allocate memory ... ", performTiming_);
+    mem_manager_->increase(numOfBytes);
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    // all four workers get the same memory manager
+    worker_grappa_.gtPlus_mem_manager_ = mem_manager_;
+    worker_noacceleration_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_L1_ncg_.gtPlus_mem_manager_ = mem_manager_;
+
+    if ( CloudComputing_ )
+    {
+        // cloud computing is switched off when the node file cannot be parsed or lists no node
+        if ( !this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_) )
+        {
+            CloudComputing_ = false;
+        }
+        else
+        {
+            CloudSize_ = gt_cloud_.size();
+            if ( CloudSize_ == 0 ) CloudComputing_ = false;
+        }
+    }
+
+    return GADGET_OK;
+}
+
+int GtPlusRecon3DTGadget::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon3DTGadget::process(...) starts ... ");
+    // Reconstructs the 3DT work order in m2 and sends the result on with the image headers from m1; m1 is released on success, GADGET_FAIL is returned when a checked step fails.
+    processed_called_times_++;
+
+    GtPlusGadgetImageArray* images = m1->getObjectPtr();
+
+    WorkOrderType* workOrder = m2->getObjectPtr();
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    GADGET_CONDITION_MSG(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << "]");
+    // NOTE(review): the message above indexes ten dimensions — assumes data_ always has at least 10; confirm
+    dimensions_ = *dims;
+
+    // fill in more parameters: recon sizes and fields of view from the gadget members, the rest from the incoming work order
+    para_.reconSizeRO_ = matrix_size_recon_[0];
+    para_.reconSizeE1_ = reconE1_;
+    para_.reconSizeE2_ = reconE2_;
+    para_.encodingFOV_RO_ = field_of_view_encoding_[0];
+    para_.encodingFOV_E1_ = field_of_view_encoding_[1];
+    para_.encodingFOV_E2_ = field_of_view_encoding_[2];
+    para_.reconFOV_RO_ = field_of_view_recon_[0];
+    para_.reconFOV_E1_ = field_of_view_recon_[1];
+    para_.reconFOV_E2_ = field_of_view_recon_[2];
+
+    para_.workOrderPara_.CalibMode_ = workOrder->CalibMode_;
+    para_.workOrderPara_.InterleaveDim_ = workOrder->InterleaveDim_;
+
+    para_.workOrderPara_.acceFactorE1_ = workOrder->acceFactorE1_;
+    para_.workOrderPara_.acceFactorE2_ = workOrder->acceFactorE2_;
+
+    para_.workOrderPara_.kSpaceCenterRO_ = workOrder->kSpaceCenterRO_;
+    para_.workOrderPara_.kSpaceCenterEncode1_ = workOrder->kSpaceCenterEncode1_;
+    para_.workOrderPara_.kSpaceCenterEncode2_ = workOrder->kSpaceCenterEncode2_;
+
+    para_.workOrderPara_.kSpaceMaxRO_ = workOrder->kSpaceMaxRO_;
+    para_.workOrderPara_.kSpaceMaxEncode1_ = workOrder->kSpaceMaxEncode1_;
+    para_.workOrderPara_.kSpaceMaxEncode2_ = workOrder->kSpaceMaxEncode2_;
+
+    para_.workOrderPara_.start_RO_ = workOrder->start_RO_;
+    para_.workOrderPara_.end_RO_ = workOrder->end_RO_;
+
+    para_.workOrderPara_.start_E1_ = workOrder->start_E1_;
+    para_.workOrderPara_.end_E1_ = workOrder->end_E1_;
+
+    para_.workOrderPara_.start_E2_ = workOrder->start_E2_;
+    para_.workOrderPara_.end_E2_ = workOrder->end_E2_;
+
+    para_.workOrderPara_.workFlow_BufferKernel_ = workOrder->workFlow_BufferKernel_;
+    para_.workOrderPara_.workFlow_use_BufferedKernel_ = workOrder->workFlow_use_BufferedKernel_;
+    para_.workOrderPara_.num_channels_res_ = workOrder->num_channels_res_;
+
+    // ---------------------------------------------------------
+    // set the work flow
+    // ---------------------------------------------------------
+    workflow_.reconSizeRO_ = para_.reconSizeRO_;
+    workflow_.reconSizeE1_ = para_.reconSizeE1_;
+    workflow_.reconSizeE2_ = para_.reconSizeE2_;
+    workflow_.encodingFOV_RO_ = para_.encodingFOV_RO_;
+    workflow_.encodingFOV_E1_ = para_.encodingFOV_E1_;
+    workflow_.encodingFOV_E2_ = para_.encodingFOV_E2_;
+    workflow_.reconFOV_RO_ = para_.reconFOV_RO_;
+    workflow_.reconFOV_E1_ = para_.reconFOV_E1_;
+    workflow_.reconFOV_E2_ = para_.reconFOV_E2_;
+
+    workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim5th_ = para_.dim_5th_;
+    workflow_.WorkOrderShareDim_ = para_.workOrder_ShareDim_;
+    workflow_.performTiming_ = performTiming_;
+
+    // ---------------------------------------------------------
+    // set work order
+    // ---------------------------------------------------------
+    workOrder->copyFromPara(para_.workOrderPara_);
+
+    workOrder->CloudComputing_ = CloudComputing_;
+    workOrder->CloudSize_ = CloudSize_;
+    workOrder->gt_cloud_ = gt_cloud_;
+
+    // ---------------------------------------------------------
+    // set the worker
+    // ---------------------------------------------------------
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+    // if 'other' data is coming in: reconstruct it with the no-acceleration worker and send it out as series image_series_+1
+    if ( workOrder->other_.get_number_of_elements() > 0 )
+    {
+        workOrder->duplicate(workOrder_recon_other_);
+        setWorkOrder3DTParameters(&workOrder_recon_other_);
+        workflow_.workOrder_ = &workOrder_recon_other_;
+
+        // perform a simple FFT recon
+        workOrder_recon_other_.acceFactorE1_ = 1;
+        workOrder_recon_other_.acceFactorE2_ = 1;
+
+        workOrder_recon_other_.start_RO_ = -1;
+        workOrder_recon_other_.end_RO_ = -1;
+        workOrder_recon_other_.start_E1_ = -1;
+        workOrder_recon_other_.end_E1_ = -1;
+        workOrder_recon_other_.start_E2_ = -1;
+        workOrder_recon_other_.end_E2_ = -1;
+
+        workflow_.worker_ = &worker_noacceleration_;
+        workflow_.setDataArray(workOrder->other_);
+        GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+
+       GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_+1, workOrder->dataDimStartingIndexes_, "Other"), GADGET_FAIL);
+
+        workflow_.res_.clear();
+        workflow_.data_ = NULL;
+        workflow_.ref_ = NULL;
+        workflow_.workOrder_ = NULL;
+
+        workOrder_recon_other_.reset();
+    }
+
+    // perform the recon
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Recon 3DT workorder ... ", performTiming_);
+
+    GADGET_CHECK_RETURN(this->generateKSpaceFilter(*workOrder), GADGET_FAIL);
+
+    workOrder->duplicate(workOrder_recon_);
+    setWorkOrder3DTParameters(&workOrder_recon_);
+
+    workflow_.workOrder_ = &workOrder_recon_;
+    if ( verboseMode_ )
+    {
+        workflow_.workOrder_->print(std::cout);
+    }
+
+    workflow_.setDataArray(workOrder->data_);
+    // use the reference data if the work order carries any; in interleaved mode the data itself serves as reference
+    if ( workOrder->ref_.get_number_of_elements() > 0 )
+    {
+        workflow_.setRefArray(workOrder->ref_);
+    }
+    else if ( CalibMode_==Gadgetron::gtPlus::ISMRMRD_interleaved ) // NOTE(review): reads the member CalibMode_, not workOrder->CalibMode_ — confirm they agree
+    {
+        workOrder->ref_ = workOrder->data_;
+        workflow_.setRefArray(workOrder->ref_);
+    }
+
+    // set the work flow for worker and workOrder: SPIRIT, L1-SPIRIT or GRAPPA (any other algorithm) when accelerated along E1 or E2, the no-acceleration worker otherwise
+    if ( workOrder->acceFactorE1_>1 || workOrder->acceFactorE2_>1 )
+    {
+        if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_;
+        }
+        else if ( para_.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_L1SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_L1_ncg_;
+        }
+        else
+        {
+            workflow_.worker_ = &worker_grappa_;
+        }
+    }
+    else
+    {
+        workflow_.worker_ = &worker_noacceleration_;
+    }
+
+    GADGET_CHECK_RETURN(workflow_.preProcessing(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.recon(), GADGET_FAIL);
+    GADGET_CHECK_RETURN(workflow_.postProcessing(), GADGET_FAIL);
+
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "Recon3DT";
+
+        hoNDArray<GT_Complex8> res = workflow_.res_;
+        res.squeeze();
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, res, ostr.str());
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutRecon(images, workflow_.res_, image_series_, workOrder->dataDimStartingIndexes_, "Image"), GADGET_FAIL);
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusRecon3DTGadget::process(...) ends ... ");
+
+    // reset the status; workflow_.res_ is left as it is (its clear is commented out below)
+    workflow_.data_ = NULL;
+    workflow_.ref_ = NULL;
+    workflow_.noise_ = NULL;
+    workflow_.workOrder_ = NULL;
+    // Gadgetron::clear(&workflow_.res_);
+    // NOTE(review): only m1 is released here; presumably m2 is chained to m1 and released with it — confirm
+    m1->release();
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusRecon3DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusRecon3DTGadget.h b/gadgets/gtPlus/GtPlusRecon3DTGadget.h
new file mode 100644
index 0000000..1c2e340
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusRecon3DTGadget.h
@@ -0,0 +1,105 @@
+/** \file   GtPlusRecon3DTGadget.h
+    \brief  This gadget encapsulates the reconstruction for 3DT cases.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusReconGadget.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusISMRMRDReconWorker3DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker3DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h"
+#include "gtPlusMemoryManager.h"
+
+namespace Gadgetron
+{
+
+struct EXPORTGTPLUSGADGET GtPlusRecon3DTPara
+{   // Plain bundle of 3DT recon settings, held by GtPlusRecon3DTGadget as para_. No constructor or initialisers are declared here.
+    size_t reconSizeRO_;   // NOTE(review): reconSize*_ presumably give the recon matrix size along RO / E1 / E2 - confirm
+    size_t reconSizeE1_;
+    size_t reconSizeE2_;
+    // NOTE(review): presumably the encoded field of view along RO / E1 / E2; units are not visible here - confirm
+    float encodingFOV_RO_;
+    float encodingFOV_E1_;
+    float encodingFOV_E2_;
+    // NOTE(review): presumably the reconstructed field of view along RO / E1 / E2 - confirm
+    float reconFOV_RO_;
+    float reconFOV_E1_;
+    float reconFOV_E2_;
+    // dimension selectors; NOTE(review): exact role of the "5th" and "share" dimensions is not visible here - confirm
+    Gadgetron::gtPlus::ISMRMRDDIM dim_5th_;
+    Gadgetron::gtPlus::ISMRMRDDIM workOrder_ShareDim_;
+    // options prefixed no_acceleration_; NOTE(review): presumably apply when the data is not accelerated - confirm
+    bool no_acceleration_averageall_ref_;
+    bool no_acceleration_same_combinationcoeff_allN_;
+    int no_acceleration_whichN_combinationcoeff_;
+    // options prefixed interleaved_; NOTE(review): presumably apply to the interleaved calibration mode - confirm
+    bool interleaved_same_combinationcoeff_allN_;
+    int interleaved_whichN_combinationcoeff_;
+
+    bool embedded_averageall_ref_;   // options prefixed embedded_; NOTE(review): presumably the embedded calibration mode - confirm
+    bool embedded_fullres_coilmap_;
+    bool embedded_same_combinationcoeff_allN_;
+    int embedded_whichN_combinationcoeff_;
+    bool embedded_ref_fillback_;
+    // options prefixed separate_; NOTE(review): presumably apply to the separate calibration mode - confirm
+    bool separate_averageall_ref_;
+    bool separate_fullres_coilmap_;
+    bool separate_same_combinationcoeff_allN_;
+    int separate_whichN_combinationcoeff_;
+
+    bool same_coil_compression_coeff_allN_;
+
+    bool recon_kspace_needed_;
+    // generic work order parameters; GtPlusRecon3DTGadget::process() reads recon_algorithm_ from here to pick its worker
+    Gadgetron::gtPlus::gtPlusReconWorkOrderPara workOrderPara_;
+};
+
+class EXPORTGTPLUSGADGET GtPlusRecon3DTGadget : public GtPlusReconGadget
+{
+public:
+    GADGET_DECLARE(GtPlusRecon3DTGadget);
+    // shorthand for the base gadget, from which the remaining typedefs are taken
+    typedef GtPlusReconGadget BaseClass;
+    // element type used below to instantiate the work flow, the workers and the 3DT work order
+    typedef BaseClass::ValueType ValueType;
+    // work order type of the base gadget (carried by the second message of process()) and the 3DT work order type
+    typedef BaseClass::WorkOrderType WorkOrderType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder3DT<ValueType> WorkOrder3DTType;
+
+    typedef BaseClass::DimensionRecordType DimensionRecordType;
+
+    GtPlusRecon3DTGadget();
+    ~GtPlusRecon3DTGadget();
+    // 3DT recon settings; process() reads para_.workOrderPara_.recon_algorithm_ when choosing the worker
+    GtPlusRecon3DTPara para_;
+
+protected:
+    // gadget hooks; the end of process() runs workflow_ preProcessing/recon/postProcessing, sends the result out and releases m1
+    virtual bool readParameters();
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    // set 3DT specific work order parameters
+    bool setWorkOrder3DTParameters(WorkOrder3DTType* workOrder);
+
+    // work flow; process() points workflow_.worker_ at one of the workers below and resets its data pointers to NULL afterwards
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian3DT<ValueType> workflow_;
+
+    // workers: no-acceleration when acceFactorE1_ and acceFactorE2_ are both <= 1, otherwise SPIRIT, L1 SPIRIT or (default) GRAPPA
+    Gadgetron::gtPlus::gtPlusReconWorker3DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // workOrder for recon
+    WorkOrder3DTType workOrder_recon_;
+
+    // workOrder for recon 'other' data
+    WorkOrder3DTType workOrder_recon_other_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconGadget.cpp b/gadgets/gtPlus/GtPlusReconGadget.cpp
new file mode 100644
index 0000000..e0b1753
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconGadget.cpp
@@ -0,0 +1,1478 @@
+
+#include "GtPlusReconGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusReconGadget::GtPlusReconGadget() : mem_manager_(new Gadgetron::gtPlus::gtPlusMemoryManager(4, 640*1024*1024))
+{
+    // every k-space filter group below starts from the same sigma and width
+    const double defaultSigma = 1.5;
+    const double defaultWidth = 0.15;
+
+    // series number, intensity limits, scaling and time stamp resolution
+    image_series_ = 100;
+    min_intensity_value_ = 64;
+    max_intensity_value_ = 4095;
+    max_intensity_value_US_ = 2048;
+    scalingFactor_ = -1;
+    use_constant_scalingFactor_ = false;
+    timeStampResolution_ = 0.0025f;
+
+    // spacing: first two entries 2.0, third 6.0, remaining three 1.0
+    aSpacing_[0] = 2.0;
+    aSpacing_[1] = 2.0;
+    aSpacing_[2] = 6.0;
+    for ( size_t d=3; d<6; d++ )
+    {
+        aSpacing_[d] = 1.0;
+    }
+
+    // sizes and counters
+    reconE1_ = 1;
+    reconE2_ = 1;
+    kSpaceMaxAcqE2No_ = 0;
+    processed_called_times_ = 0;
+
+    // filterRO/E1/E2 : Gaussian
+    filterRO_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterE1_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterE2_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterRO_sigma_ = defaultSigma;
+    filterE1_sigma_ = defaultSigma;
+    filterE2_sigma_ = defaultSigma;
+    filterRO_width_ = defaultWidth;
+    filterE1_width_ = defaultWidth;
+    filterE2_width_ = defaultWidth;
+
+    // filterRefRO/E1/E2 : Hanning
+    filterRO_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterE1_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterE2_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterRO_ref_sigma_ = defaultSigma;
+    filterE1_ref_sigma_ = defaultSigma;
+    filterE2_ref_sigma_ = defaultSigma;
+    filterRO_ref_width_ = defaultWidth;
+    filterE1_ref_width_ = defaultWidth;
+    filterE2_ref_width_ = defaultWidth;
+
+    // filterPartialFourierRO/E1/E2 : Hanning, density compensation off
+    filterRO_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterE1_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterE2_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterRO_pf_sigma_ = defaultSigma;
+    filterE1_pf_sigma_ = defaultSigma;
+    filterE2_pf_sigma_ = defaultSigma;
+    filterRO_pf_width_ = defaultWidth;
+    filterE1_pf_width_ = defaultWidth;
+    filterE2_pf_width_ = defaultWidth;
+    filterRO_pf_densityComp_ = false;
+    filterE1_pf_densityComp_ = false;
+    filterE2_pf_densityComp_ = false;
+
+    // both debug folders start from the same name
+    debugFolder_ = "DebugOutput";
+    debugFolder2_ = debugFolder_;
+
+    // run-time switches
+    performTiming_ = true;
+    verboseMode_ = false;
+    CloudComputing_ = false;
+    CloudSize_ = 0;
+
+    // the three timers do not report on destruction
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    Gadgetron::prepOpenMP();
+    Gadgetron::prepMKL();
+}
+
+GtPlusReconGadget::~GtPlusReconGadget()
+{
+    // NOTE(review): nothing is released explicitly; mem_manager_ is created with new in the constructor - confirm it is an owning smart pointer
+}
+
+// Read the gadget properties (intensity scaling, debug folders, k-space filters, cloud nodes, coil compression,
+// coil map, GRAPPA/SPIRIT, job splitting, partial fourier) into members; returns false if anything throws.
+bool GtPlusReconGadget::readParameters()
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlusReconGadget parameters <------");
+
+        min_intensity_value_ = this->get_int_value("min_intensity_value");
+        GADGET_CONDITION_MSG(verboseMode_, "min_intensity_value_ is " << min_intensity_value_);
+
+        max_intensity_value_ = this->get_int_value("max_intensity_value");
+        GADGET_CONDITION_MSG(verboseMode_, "max_intensity_value_ is " << max_intensity_value_);
+
+        scalingFactor_ = this->get_double_value("scalingFactor");
+        GADGET_CONDITION_MSG(verboseMode_, "scalingFactor_ is " << scalingFactor_);
+
+        use_constant_scalingFactor_ = this->get_bool_value("use_constant_scalingFactor");
+        GADGET_CONDITION_MSG(verboseMode_, "use_constant_scalingFactor_ is " << use_constant_scalingFactor_);
+        // string properties come back as shared_ptr<std::string> and are dereferenced without a null check
+        boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+        debugFolder_ = *str;
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        boost::shared_ptr<std::string> str2 = this->get_string_value("debugFolder2");
+        debugFolder2_ = *str2;
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder2_ is " << debugFolder2_);
+
+        timeStampResolution_ = (float)this->get_double_value("timeStampResolution");
+        GADGET_CONDITION_MSG(verboseMode_, "timeStampResolution_ is " << timeStampResolution_);
+
+        performTiming_ = this->get_bool_value("performTiming");
+        GADGET_CONDITION_MSG(verboseMode_, "performTiming_ is " << performTiming_);
+
+        // kspace filter parameters
+        str = this->get_string_value("filterRO");
+        filterRO_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_sigma_ = this->get_double_value("filterRO_sigma");
+        filterRO_width_ = this->get_double_value("filterRO_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_sigma_ is " << filterRO_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_width_ is " << filterRO_width_);
+
+        str = this->get_string_value("filterE1");
+        filterE1_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_sigma_ = this->get_double_value("filterE1_sigma");
+        filterE1_width_ = this->get_double_value("filterE1_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_sigma_ is " << filterE1_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_width_ is " << filterE1_width_);
+
+        str = this->get_string_value("filterE2");
+        filterE2_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_sigma_ = this->get_double_value("filterE2_sigma");
+        filterE2_width_ = this->get_double_value("filterE2_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_sigma_ is " << filterE2_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_width_ is " << filterE2_width_);
+        // "filterRef*" properties
+        str = this->get_string_value("filterRefRO");
+        filterRO_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_ref_sigma_ = this->get_double_value("filterRefRO_sigma");
+        filterRO_ref_width_ = this->get_double_value("filterRefRO_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_ref_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_ref_sigma_ is " << filterRO_ref_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_ref_width_ is " << filterRO_ref_width_);
+
+        str = this->get_string_value("filterRefE1");
+        filterE1_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_ref_sigma_ = this->get_double_value("filterRefE1_sigma");
+        filterE1_ref_width_ = this->get_double_value("filterRefE1_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_ref_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_ref_sigma_ is " << filterE1_ref_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_ref_width_ is " << filterE1_ref_width_);
+
+        str = this->get_string_value("filterRefE2");
+        filterE2_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_ref_sigma_ = this->get_double_value("filterRefE2_sigma");
+        filterE2_ref_width_ = this->get_double_value("filterRefE2_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_ref_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_ref_sigma_ is " << filterE2_ref_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_ref_width_ is " << filterE2_ref_width_);
+        // "filterPartialFourier*" properties, each with an additional densityComp flag
+        str = this->get_string_value("filterPartialFourierRO");
+        filterRO_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_pf_sigma_ = this->get_double_value("filterPartialFourierRO_sigma");
+        filterRO_pf_width_ = this->get_double_value("filterPartialFourierRO_width");
+        filterRO_pf_densityComp_ = this->get_bool_value("filterPartialFourierRO_densityComp");
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_sigma_ is " << filterRO_pf_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_width_ is " << filterRO_pf_width_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_densityComp_ is " << filterRO_pf_densityComp_);
+
+        str = this->get_string_value("filterPartialFourierE1");
+        filterE1_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_pf_sigma_ = this->get_double_value("filterPartialFourierE1_sigma");
+        filterE1_pf_width_ = this->get_double_value("filterPartialFourierE1_width");
+        filterE1_pf_densityComp_ = this->get_bool_value("filterPartialFourierE1_densityComp");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_sigma_ is " << filterE1_pf_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_width_ is " << filterE1_pf_width_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_densityComp_ is " << filterE1_pf_densityComp_);
+
+        str = this->get_string_value("filterPartialFourierE2");
+        filterE2_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_pf_sigma_ = this->get_double_value("filterPartialFourierE2_sigma");
+        filterE2_pf_width_ = this->get_double_value("filterPartialFourierE2_width");
+        filterE2_pf_densityComp_ = this->get_bool_value("filterPartialFourierE2_densityComp");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_sigma_ is " << filterE2_pf_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_width_ is " << filterE2_pf_width_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_densityComp_ is " << filterE2_pf_densityComp_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        CloudComputing_ = this->get_bool_value("CloudComputing");
+        CloudSize_ = (unsigned int)(this->get_int_value("CloudSize"));
+
+        GADGET_CONDITION_MSG(verboseMode_, "CloudComputing_ is " << CloudComputing_);
+        GADGET_CONDITION_MSG(verboseMode_, "CloudSize_ is " << CloudSize_);
+
+        str = this->get_string_value("cloudNodeFile");
+        cloud_node_file_ = *str;
+        GADGET_CONDITION_MSG(verboseMode_, "cloud_node_file_ is " << cloud_node_file_);
+
+        // read in the cloud information for every node
+        gt_cloud_.resize(CloudSize_);
+        // node ii is described by CloudNode<ii>_IP, _Port, _XMLConfiguration and _ComputingPowerIndex
+        for ( unsigned int ii=0; ii<CloudSize_; ii++ )
+        {
+            std::ostringstream ostreamstr1;
+            ostreamstr1 << "CloudNode" << ii << "_IP";
+            boost::shared_ptr<std::string> IP = this->get_string_value(ostreamstr1.str().c_str());
+            gt_cloud_[ii].get<0>() = *IP;
+
+            std::ostringstream ostreamstr2;
+            ostreamstr2 << "CloudNode" << ii << "_Port";
+            boost::shared_ptr<std::string> Port = this->get_string_value(ostreamstr2.str().c_str());
+            gt_cloud_[ii].get<1>() = *Port;
+
+            std::ostringstream ostreamstr3;
+            ostreamstr3 << "CloudNode" << ii << "_XMLConfiguration";
+            boost::shared_ptr<std::string> xmlName = this->get_string_value(ostreamstr3.str().c_str());
+            gt_cloud_[ii].get<2>() = *xmlName;
+
+            std::ostringstream ostreamstr4;
+            ostreamstr4 << "CloudNode" << ii << "_ComputingPowerIndex";
+            unsigned int computingPowerIndex = this->get_int_value(ostreamstr4.str().c_str());
+            gt_cloud_[ii].get<3>() = computingPowerIndex;
+
+            GADGET_CONDITION_MSG(verboseMode_, "Cloud Node " << ii << " : " << gt_cloud_[ii]);
+        }
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        GADGET_CONDITION_MSG(verboseMode_, "==================================================================");
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlus recon parameters <------");
+
+        workOrderPara_.upstream_coil_compression_ = this->get_bool_value("upstream_coil_compression");
+        GADGET_CONDITION_MSG(verboseMode_, "upstream_coil_compression_ is " << workOrderPara_.upstream_coil_compression_);
+
+        workOrderPara_.upstream_coil_compression_thres_ = this->get_double_value("upstream_coil_compression_thres");
+        GADGET_CONDITION_MSG(verboseMode_, "upstream_coil_compression_thres_ is " << workOrderPara_.upstream_coil_compression_thres_);
+
+        workOrderPara_.upstream_coil_compression_num_modesKept_ = this->get_int_value("upstream_coil_compression_num_modesKept");
+        GADGET_CONDITION_MSG(verboseMode_, "upstream_coil_compression_num_modesKept_ is " << workOrderPara_.upstream_coil_compression_num_modesKept_);
+
+        workOrderPara_.downstream_coil_compression_ = this->get_bool_value("downstream_coil_compression");
+        GADGET_CONDITION_MSG(verboseMode_, "downstream_coil_compression_ is " << workOrderPara_.downstream_coil_compression_);
+
+        workOrderPara_.coil_compression_thres_ = this->get_double_value("coil_compression_thres");
+        GADGET_CONDITION_MSG(verboseMode_, "coil_compression_thres_ is " << workOrderPara_.coil_compression_thres_);
+
+        workOrderPara_.coil_compression_num_modesKept_ = this->get_int_value("coil_compression_num_modesKept");
+        GADGET_CONDITION_MSG(verboseMode_, "coil_compression_num_modesKept_ is " << workOrderPara_.coil_compression_num_modesKept_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        str = this->get_string_value("coil_map_algorithm");
+        workOrderPara_.coil_map_algorithm_ = gtPlus_util_.getISMRMRDCoilMapAlgoFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "coil_map_algorithm_ is " << *str);
+
+        workOrderPara_.csm_kSize_ = (size_t)(this->get_int_value("csm_kSize"));
+        GADGET_CONDITION_MSG(verboseMode_, "csm_kSize_ is " << workOrderPara_.csm_kSize_);
+
+        workOrderPara_.csm_powermethod_num_ = (size_t)(this->get_int_value("csm_powermethod_num"));
+        GADGET_CONDITION_MSG(verboseMode_, "csm_powermethod_num_ is " << workOrderPara_.csm_powermethod_num_);
+
+        workOrderPara_.csm_true_3D_ = this->get_bool_value("csm_true_3D");
+        GADGET_CONDITION_MSG(verboseMode_, "csm_true_3D_ is " << workOrderPara_.csm_true_3D_);
+
+        workOrderPara_.csm_iter_num_ = (size_t)(this->get_int_value("csm_iter_num"));
+        GADGET_CONDITION_MSG(verboseMode_, "csm_iter_num_ is " << workOrderPara_.csm_iter_num_);
+
+        workOrderPara_.csm_iter_thres_ = this->get_double_value("csm_iter_thres");
+        GADGET_CONDITION_MSG(verboseMode_, "csm_iter_thres_ is " << workOrderPara_.csm_iter_thres_);
+
+        workOrderPara_.csm_use_gpu_ = this->get_bool_value("csm_use_gpu");
+        GADGET_CONDITION_MSG(verboseMode_, "csm_use_gpu_ is " << workOrderPara_.csm_use_gpu_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        str = this->get_string_value("recon_algorithm");
+        workOrderPara_.recon_algorithm_ = gtPlus_util_.getISMRMRDReconAlgoFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "recon_algorithm_ is " << *str);
+
+        workOrderPara_.recon_auto_parameters_ = this->get_bool_value("recon_auto_parameters");
+        GADGET_CONDITION_MSG(verboseMode_, "recon_auto_parameters_ is " << workOrderPara_.recon_auto_parameters_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        workOrderPara_.grappa_kSize_RO_ = (size_t)(this->get_int_value("grappa_kSize_RO"));
+        workOrderPara_.grappa_kSize_E1_ = (size_t)(this->get_int_value("grappa_kSize_E1"));
+        workOrderPara_.grappa_kSize_E2_ = (size_t)(this->get_int_value("grappa_kSize_E2"));
+        workOrderPara_.grappa_reg_lamda_ = this->get_double_value("grappa_reg_lamda");
+        workOrderPara_.grappa_calib_over_determine_ratio_ = this->get_double_value("grappa_calib_over_determine_ratio");
+        workOrderPara_.grappa_use_gpu_ = this->get_bool_value("grappa_use_gpu");
+
+        GADGET_CONDITION_MSG(verboseMode_, "grappa_kSize_RO_ is " << workOrderPara_.grappa_kSize_RO_);
+        GADGET_CONDITION_MSG(verboseMode_, "grappa_kSize_E1_ is " << workOrderPara_.grappa_kSize_E1_);
+        GADGET_CONDITION_MSG(verboseMode_, "grappa_kSize_E2_ is " << workOrderPara_.grappa_kSize_E2_);
+        GADGET_CONDITION_MSG(verboseMode_, "grappa_reg_lamda_ is " << workOrderPara_.grappa_reg_lamda_);
+        GADGET_CONDITION_MSG(verboseMode_, "grappa_calib_over_determine_ratio_ is " << workOrderPara_.grappa_calib_over_determine_ratio_);
+        GADGET_CONDITION_MSG(verboseMode_, "grappa_use_gpu_ is " << workOrderPara_.grappa_use_gpu_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        workOrderPara_.spirit_kSize_RO_ = (size_t)(this->get_int_value("spirit_kSize_RO"));
+        workOrderPara_.spirit_kSize_E1_ = (size_t)(this->get_int_value("spirit_kSize_E1"));
+        workOrderPara_.spirit_kSize_E2_ = (size_t)(this->get_int_value("spirit_kSize_E2"));
+        workOrderPara_.spirit_reg_lamda_ = this->get_double_value("spirit_reg_lamda");
+        workOrderPara_.spirit_use_gpu_ = this->get_bool_value("spirit_use_gpu");
+        workOrderPara_.spirit_calib_over_determine_ratio_ = this->get_double_value("spirit_calib_over_determine_ratio");
+        workOrderPara_.spirit_solve_symmetric_ = this->get_bool_value("spirit_solve_symmetric");
+        workOrderPara_.spirit_iter_max_ = (size_t)(this->get_int_value("spirit_iter_max"));
+        workOrderPara_.spirit_iter_thres_ = this->get_double_value("spirit_iter_thres");
+        workOrderPara_.spirit_print_iter_ = this->get_bool_value("spirit_print_iter");
+
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_kSize_RO_ is " << workOrderPara_.spirit_kSize_RO_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_kSize_E1_ is " << workOrderPara_.spirit_kSize_E1_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_kSize_E2_ is " << workOrderPara_.spirit_kSize_E2_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_reg_lamda_ is " << workOrderPara_.spirit_reg_lamda_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_use_gpu_ is " << workOrderPara_.spirit_use_gpu_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_calib_over_determine_ratio_ is " << workOrderPara_.spirit_calib_over_determine_ratio_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_solve_symmetric_ is " << workOrderPara_.spirit_solve_symmetric_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_iter_max_ is " << workOrderPara_.spirit_iter_max_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_iter_thres_ is " << workOrderPara_.spirit_iter_thres_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_print_iter_ is " << workOrderPara_.spirit_print_iter_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        workOrderPara_.spirit_perform_linear_ = this->get_bool_value("spirit_perform_linear");
+        workOrderPara_.spirit_perform_nonlinear_ = this->get_bool_value("spirit_perform_nonlinear");
+        workOrderPara_.spirit_parallel_imaging_lamda_ = this->get_double_value("spirit_parallel_imaging_lamda");
+        workOrderPara_.spirit_image_reg_lamda_ = this->get_double_value("spirit_image_reg_lamda");
+        workOrderPara_.spirit_data_fidelity_lamda_ = this->get_double_value("spirit_data_fidelity_lamda");
+        workOrderPara_.spirit_ncg_iter_max_ = (size_t)(this->get_int_value("spirit_ncg_iter_max"));
+        workOrderPara_.spirit_ncg_iter_thres_ = this->get_double_value("spirit_ncg_iter_thres");
+        workOrderPara_.spirit_ncg_print_iter_ = this->get_bool_value("spirit_ncg_print_iter");
+        // spirit_ncg_scale_factor_ is computed from the data
+        workOrderPara_.spirit_use_coil_sen_map_ = this->get_bool_value("spirit_use_coil_sen_map");
+        workOrderPara_.spirit_use_moco_enhancement_ = this->get_bool_value("spirit_use_moco_enhancement");
+        workOrderPara_.spirit_recon_moco_images_ = this->get_bool_value("spirit_recon_moco_images");
+        workOrderPara_.spirit_RO_enhancement_ratio_ = this->get_double_value("spirit_RO_enhancement_ratio");
+        workOrderPara_.spirit_E1_enhancement_ratio_ = this->get_double_value("spirit_E1_enhancement_ratio");
+        workOrderPara_.spirit_E2_enhancement_ratio_ = this->get_double_value("spirit_E2_enhancement_ratio");
+        workOrderPara_.spirit_temporal_enhancement_ratio_ = this->get_double_value("spirit_temporal_enhancement_ratio");
+        workOrderPara_.spirit_2D_scale_per_chunk_ = this->get_bool_value("spirit_2D_scale_per_chunk");
+        workOrderPara_.spirit_3D_scale_per_chunk_ = this->get_bool_value("spirit_3D_scale_per_chunk");
+
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_perform_linear_ is " << workOrderPara_.spirit_perform_linear_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_perform_nonlinear_ is " << workOrderPara_.spirit_perform_nonlinear_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_parallel_imaging_lamda_ is " << workOrderPara_.spirit_parallel_imaging_lamda_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_image_reg_lamda_ is " << workOrderPara_.spirit_image_reg_lamda_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_data_fidelity_lamda_ is " << workOrderPara_.spirit_data_fidelity_lamda_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_ncg_iter_max_ is " << workOrderPara_.spirit_ncg_iter_max_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_ncg_iter_thres_ is " << workOrderPara_.spirit_ncg_iter_thres_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_ncg_print_iter_ is " << workOrderPara_.spirit_ncg_print_iter_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_use_coil_sen_map_ is " << workOrderPara_.spirit_use_coil_sen_map_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_use_moco_enhancement_ is " << workOrderPara_.spirit_use_moco_enhancement_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_recon_moco_images_ is " << workOrderPara_.spirit_recon_moco_images_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_RO_enhancement_ratio_ is " << workOrderPara_.spirit_RO_enhancement_ratio_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_E1_enhancement_ratio_ is " << workOrderPara_.spirit_E1_enhancement_ratio_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_E2_enhancement_ratio_ is " << workOrderPara_.spirit_E2_enhancement_ratio_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_temporal_enhancement_ratio_ is " << workOrderPara_.spirit_temporal_enhancement_ratio_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_2D_scale_per_chunk_ is " << workOrderPara_.spirit_2D_scale_per_chunk_);
+        GADGET_CONDITION_MSG(verboseMode_, "spirit_3D_scale_per_chunk_ is " << workOrderPara_.spirit_3D_scale_per_chunk_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        workOrderPara_.job_split_by_S_ = this->get_bool_value("job_split_by_S");
+        workOrderPara_.job_num_of_N_ = (size_t)(this->get_int_value("job_num_of_N"));
+        workOrderPara_.job_max_Megabytes_ = (size_t)(this->get_int_value("job_max_Megabytes"));
+        workOrderPara_.job_overlap_ = (size_t)(this->get_int_value("job_overlap"));
+        workOrderPara_.job_perform_on_control_node_ = this->get_bool_value("job_perform_on_control_node");
+
+        GADGET_CONDITION_MSG(verboseMode_, "job_split_by_S_ is " << workOrderPara_.job_split_by_S_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_num_of_N_ is " << workOrderPara_.job_num_of_N_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_max_Megabytes_ is " << workOrderPara_.job_max_Megabytes_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_overlap_ is " << workOrderPara_.job_overlap_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_perform_on_control_node_ is " << workOrderPara_.job_perform_on_control_node_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        str = this->get_string_value("partialFourier_algo");
+        workOrderPara_.partialFourier_algo_ = gtPlus_util_.getISMRMRDPartialFourierReconAlgoFromName(*str);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_algo_ is " << *str);
+
+        workOrderPara_.partialFourier_homodyne_iters_ = (size_t)(this->get_int_value("partialFourier_homodyne_iters"));
+        workOrderPara_.partialFourier_homodyne_thres_ = this->get_double_value("partialFourier_homodyne_thres");
+        workOrderPara_.partialFourier_homodyne_densityComp_ = this->get_bool_value("partialFourier_homodyne_densityComp");
+
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_homodyne_iters_ is " << workOrderPara_.partialFourier_homodyne_iters_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_homodyne_thres_ is " << workOrderPara_.partialFourier_homodyne_thres_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_homodyne_densityComp_ is " << workOrderPara_.partialFourier_homodyne_densityComp_);
+
+        workOrderPara_.partialFourier_POCS_iters_ = (size_t)(this->get_int_value("partialFourier_POCS_iters"));
+        workOrderPara_.partialFourier_POCS_thres_ = this->get_double_value("partialFourier_POCS_thres");
+        workOrderPara_.partialFourier_POCS_transitBand_ = (size_t)(this->get_int_value("partialFourier_POCS_transitBand"));
+        workOrderPara_.partialFourier_POCS_transitBand_E2_ = (size_t)(this->get_int_value("partialFourier_POCS_transitBand_E2"));
+        // the last message is labelled with the E2 member it prints (it used to repeat the transitBand_ label)
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_POCS_iters_ is " << workOrderPara_.partialFourier_POCS_iters_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_POCS_thres_ is " << workOrderPara_.partialFourier_POCS_thres_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_POCS_transitBand_ is " << workOrderPara_.partialFourier_POCS_transitBand_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_POCS_transitBand_E2_ is " << workOrderPara_.partialFourier_POCS_transitBand_E2_);
+
+        workOrderPara_.partialFourier_FengHuang_kSize_RO_ = (size_t)(this->get_int_value("partialFourier_FengHuang_kSize_RO"));
+        workOrderPara_.partialFourier_FengHuang_kSize_E1_ = (size_t)(this->get_int_value("partialFourier_FengHuang_kSize_E1"));
+        workOrderPara_.partialFourier_FengHuang_kSize_E2_ = (size_t)(this->get_int_value("partialFourier_FengHuang_kSize_E2"));
+        workOrderPara_.partialFourier_FengHuang_thresReg_ = this->get_double_value("partialFourier_FengHuang_thresReg");
+        workOrderPara_.partialFourier_FengHuang_sameKernel_allN_ = this->get_bool_value("partialFourier_FengHuang_sameKernel_allN");
+        workOrderPara_.partialFourier_FengHuang_transitBand_ = (size_t)(this->get_int_value("partialFourier_FengHuang_transitBand"));
+        workOrderPara_.partialFourier_FengHuang_transitBand_E2_ = (size_t)(this->get_int_value("partialFourier_FengHuang_transitBand_E2"));
+
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_kSize_RO_ is " << workOrderPara_.partialFourier_FengHuang_kSize_RO_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_kSize_E1_ is " << workOrderPara_.partialFourier_FengHuang_kSize_E1_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_kSize_E2_ is " << workOrderPara_.partialFourier_FengHuang_kSize_E2_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_thresReg_ is " << workOrderPara_.partialFourier_FengHuang_thresReg_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_sameKernel_allN_ is " << workOrderPara_.partialFourier_FengHuang_sameKernel_allN_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_transitBand_ is " << workOrderPara_.partialFourier_FengHuang_transitBand_);
+        GADGET_CONDITION_MSG(verboseMode_, "partialFourier_FengHuang_transitBand_E2_ is " << workOrderPara_.partialFourier_FengHuang_transitBand_E2_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        recon_kspace_needed_ = this->get_bool_value("recon_kspace_needed");
+        GADGET_CONDITION_MSG(verboseMode_, "recon_kspace_needed_ is " << recon_kspace_needed_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconGadget::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Read the gtCloud node list from $GADGETRON_HOME/config/gtCloud/<filename> (control node host and port, node count,
+// then host, port, xml name and computing power index per node). On any failure gtCloud is untouched and false returned.
+bool GtPlusReconGadget::parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud)
+{
+    // getenv yields NULL for an unset variable and std::string must not be built from NULL
+    const char* gadgetronHome = ACE_OS::getenv("GADGETRON_HOME");
+    if ( gadgetronHome == NULL )
+    {
+        GADGET_WARN_MSG("GADGETRON_HOME is not set; use the local setting instead ... ");
+        return false;
+    }
+    std::string nodeFileName(gadgetronHome);
+    nodeFileName.append("/config/gtCloud/");
+    nodeFileName.append(filename);
+    GADGET_CONDITION_MSG(verboseMode_, "Cloud node file name is " << nodeFileName);
+
+    std::ifstream fs(nodeFileName.c_str(), std::ios::in);
+    if (!fs.is_open())
+    {
+        GADGET_WARN_MSG("Cannot open GT CloudNodeFile; use the local setting instead ... ");
+        return false;
+    }
+    // control node hostname and port are consumed but not stored; num is the number of GadgetLevel nodes
+    std::string controlNode, portControlNode;
+    unsigned int num = 0;
+    fs >> controlNode >> portControlNode >> num;
+    if ( !fs )
+    {
+        GADGET_WARN_MSG("Cannot parse GT CloudNodeFile; use the local setting instead ... ");
+        return false;
+    }
+
+    // parse into a scratch list so that a truncated file cannot leave gtCloud half filled
+    CloudType parsedCloud;
+    parsedCloud.resize(num);
+    for ( unsigned int n=0; n<num; n++ )
+    {
+        std::string gadgetNode, portGadgetNode, xmlGadgetNode;
+        unsigned int computingPowerIndex = 0;
+        fs >> gadgetNode >> portGadgetNode >> xmlGadgetNode >> computingPowerIndex;
+        if ( !fs )
+        {
+            GADGET_WARN_MSG("Cannot parse GT CloudNodeFile; use the local setting instead ... ");
+            return false;
+        }
+        parsedCloud[n].get<0>() = gadgetNode;
+        parsedCloud[n].get<1>() = portGadgetNode;
+        parsedCloud[n].get<2>() = xmlGadgetNode;
+        parsedCloud[n].get<3>() = computingPowerIndex;
+        GADGET_CONDITION_MSG(verboseMode_, "Gadget Node " << n << " : " << parsedCloud[n]);
+    }
+    gtCloud = parsedCloud;
+    return true;
+}
+
+/// Configure the gadget from its XML parameters and from the ISMRMRD XML header.
+///
+/// Reads the verbose mode, the image series number and the remaining gadget parameters, then
+/// parses the ISMRMRD header text carried by mb to fill: number of acquisition channels,
+/// encoding/recon matrix sizes and fields of view, pixel spacing, the maximal index of every
+/// encoding dimension (meas_max_idx_), acceleration factors, calibration mode, interleaving
+/// dimension and the debug folder paths.
+///
+/// @param mb  message block whose read pointer addresses the ISMRMRD XML header text
+/// @return GADGET_OK on success; GADGET_FAIL if a parameter cannot be read, if a header section
+///         dereferenced here is missing, if more than one encoding space is present, or if a
+///         debug folder path cannot be generated
+int GtPlusReconGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read parameters from xml
+    image_series_ = this->get_int_value("image_series");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = Gadgetron::parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+    if ( cfg.get() == NULL )
+    {
+        GADGET_DEBUG1("Failed to parse the ISMRMRD XML header\n");
+        return GADGET_FAIL;
+    }
+
+    // optional header sections must be checked before they are dereferenced
+    ISMRMRD::ismrmrdHeader::acquisitionSystemInformation_optional e_acq = cfg->acquisitionSystemInformation();
+    if ( !e_acq.present() || !e_acq->receiverChannels().present() )
+    {
+        GADGET_DEBUG1("acquisitionSystemInformation/receiverChannels not found in the ISMRMRD header\n");
+        return GADGET_FAIL;
+    }
+
+    num_acq_channels_ = e_acq->receiverChannels().get();
+    GADGET_CONDITION_MSG(verboseMode_, "Number of acquisition channels : " << num_acq_channels_);
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1)
+    {
+        GADGET_DEBUG2("Number of encoding spaces: %d\n", (int)e_seq.size());
+        GADGET_DEBUG1("This simple GtPlusReconGadget only supports one encoding space\n");
+        return GADGET_FAIL;
+    }
+
+    // find out the encoding space 
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    matrix_size_encoding_[0] = e_space.matrixSize().x();
+    matrix_size_encoding_[1] = e_space.matrixSize().y();
+    matrix_size_encoding_[2] = e_space.matrixSize().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Encoding matrix size: " << matrix_size_encoding_[0] << " " << matrix_size_encoding_[1] << " " << matrix_size_encoding_[2]);
+
+    field_of_view_encoding_[0] = e_space.fieldOfView_mm().x();
+    field_of_view_encoding_[1] = e_space.fieldOfView_mm().y();
+    field_of_view_encoding_[2] = e_space.fieldOfView_mm().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Encoding field_of_view : " << field_of_view_encoding_[0] << " " << field_of_view_encoding_[1] << " " << field_of_view_encoding_[2]);
+
+    // find the recon space
+    matrix_size_recon_[0] = r_space.matrixSize().x();
+    matrix_size_recon_[1] = r_space.matrixSize().y();
+    matrix_size_recon_[2] = r_space.matrixSize().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Recon matrix size : " << matrix_size_recon_[0] << " " << matrix_size_recon_[1] << " " << matrix_size_recon_[2]);
+
+    field_of_view_recon_[0] = r_space.fieldOfView_mm().x();
+    field_of_view_recon_[1] = r_space.fieldOfView_mm().y();
+    field_of_view_recon_[2] = r_space.fieldOfView_mm().z();
+    GADGET_CONDITION_MSG(verboseMode_, "Recon field_of_view :  " << field_of_view_recon_[0] << " " << field_of_view_recon_[1] << " " << field_of_view_recon_[2]);
+
+    reconE1_ = matrix_size_recon_[1];
+    GADGET_CONDITION_MSG(verboseMode_, "reconE1_ is " << reconE1_);
+
+    reconE2_ = matrix_size_recon_[2];
+    GADGET_CONDITION_MSG(verboseMode_, "reconE2_ is " << reconE2_);
+
+    // the E1 limit is mandatory; test for it before the first get()
+    if ( !e_limits.kspace_encoding_step_1().present() )
+    {
+        meas_max_idx_.kspace_encode_step_1 = 0;
+        std::cout << "Setting number of kspace_encode_step_1 to 0" << std::endl;
+        return GADGET_FAIL;
+    }
+
+    kSpaceMaxAcqE1No_ = e_limits.kspace_encoding_step_1().get().maximum();
+    GADGET_CONDITION_MSG(verboseMode_, "kSpaceMaxAcqE1No_ is " << kSpaceMaxAcqE1No_);
+
+    // the E2 limit is optional; fall back to 0, as done for meas_max_idx_ below
+    if ( e_limits.kspace_encoding_step_2().present() )
+    {
+        kSpaceMaxAcqE2No_ = e_limits.kspace_encoding_step_2().get().maximum();
+    }
+    else
+    {
+        kSpaceMaxAcqE2No_ = 0;
+    }
+    GADGET_CONDITION_MSG(verboseMode_, "kSpaceMaxAcqE2No_ is " << kSpaceMaxAcqE2No_);
+
+    aSpacing_[0] = field_of_view_recon_[0]/matrix_size_recon_[0];
+    aSpacing_[1] = field_of_view_recon_[1]/reconE1_;
+    aSpacing_[2] = field_of_view_recon_[2]/reconE2_;
+
+    gt_exporter_.setPixelSize(aSpacing_[0], aSpacing_[1], aSpacing_[2], aSpacing_[3], aSpacing_[4], aSpacing_[5], aSpacing_[6]);
+
+    // find the maximal encoding size
+    meas_max_idx_.kspace_encode_step_1 = e_limits.kspace_encoding_step_1().get().maximum();
+
+    // For the limits stored as maximum-1, the subtraction and the clamp are done on a signed int,
+    // so that the clamp to 0 also works if the meas_max_idx_ fields are of an unsigned type.
+    if (e_limits.set().present())
+    {
+        int maxSet = (int)(e_limits.set().get().maximum()) - 1;
+        if ( maxSet < 0 ) maxSet = 0;
+        meas_max_idx_.set = maxSet;
+    }
+    else
+    {
+        meas_max_idx_.set = 0;
+    }
+
+    if (e_limits.phase().present())
+    {
+        int maxPhase = (int)(e_limits.phase().get().maximum()) - 1;
+        if ( maxPhase < 0 ) maxPhase = 0;
+        meas_max_idx_.phase = maxPhase;
+    }
+    else
+    {
+        meas_max_idx_.phase = 0;
+    }
+
+    if (e_limits.kspace_encoding_step_2().present())
+    {
+        meas_max_idx_.kspace_encode_step_2 = e_limits.kspace_encoding_step_2().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.kspace_encode_step_2 = 0;
+    }
+
+    if (e_limits.contrast().present())
+    {
+        int maxContrast = (int)(e_limits.contrast().get().maximum()) - 1;
+        if ( maxContrast < 0 ) maxContrast = 0;
+        meas_max_idx_.contrast = maxContrast;
+    }
+    else
+    {
+        meas_max_idx_.contrast = 0;
+    }
+
+    if (e_limits.slice().present())
+    {
+        meas_max_idx_.slice = e_limits.slice().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.slice = 0;
+    }
+
+    if (e_limits.repetition().present())
+    {
+        meas_max_idx_.repetition = e_limits.repetition().get().maximum();
+    }
+    else
+    {
+        meas_max_idx_.repetition = 0;
+    }
+
+    if (e_limits.average().present())
+    {
+        // clamped like set/phase/contrast above
+        int maxAverage = (int)(e_limits.average().get().maximum()) - 1;
+        if ( maxAverage < 0 ) maxAverage = 0;
+        meas_max_idx_.average = maxAverage;
+    }
+    else
+    {
+        meas_max_idx_.average = 0;
+    }
+
+    if (e_limits.segment().present())
+    {
+        // meas_max_idx_.segment = e_limits.segment().get().maximum()-1;
+        meas_max_idx_.segment = 0;
+    }
+    else
+    {
+        meas_max_idx_.segment = 0;
+    }
+
+    // find out the PAT mode
+    ISMRMRD::ismrmrdHeader::parallelImaging_optional p_imaging_type = cfg->parallelImaging();
+    if ( !p_imaging_type.present() )
+    {
+        GADGET_DEBUG1("parallelImaging section not found in the ISMRMRD header\n");
+        return GADGET_FAIL;
+    }
+
+    ISMRMRD::parallelImagingType p_imaging = *p_imaging_type;
+
+    acceFactorE1_ = (long)(p_imaging.accelerationFactor().kspace_encoding_step_1());
+    acceFactorE2_ = (long)(p_imaging.accelerationFactor().kspace_encoding_step_2());
+    GADGET_CONDITION_MSG(verboseMode_, "acceFactorE1 is " << acceFactorE1_);
+    GADGET_CONDITION_MSG(verboseMode_, "acceFactorE2 is " << acceFactorE2_);
+
+    if ( !p_imaging.calibrationMode().present() )
+    {
+        GADGET_DEBUG1("parallelImaging/calibrationMode not found in the ISMRMRD header\n");
+        return GADGET_FAIL;
+    }
+
+    ISMRMRD::calibrationModeType::value calib = *(p_imaging.calibrationMode());
+
+    bool separate_ = (calib == ISMRMRD::calibrationModeType::separate);
+    bool embedded_ = (calib == ISMRMRD::calibrationModeType::embedded);
+    bool interleaved_ = (calib == ISMRMRD::calibrationModeType::interleaved);
+    bool other_ = (calib == ISMRMRD::calibrationModeType::other);
+
+    if ( separate_ ) { GADGET_CONDITION_MSG(verboseMode_, "Colibration mode is separate"); }
+    if ( embedded_ ) { GADGET_CONDITION_MSG(verboseMode_, "Colibration mode is embedded"); }
+    if ( interleaved_ ) { GADGET_CONDITION_MSG(verboseMode_, "Colibration mode is interleaved"); }
+    if ( other_ ) { GADGET_CONDITION_MSG(verboseMode_, "Colibration mode is other"); }
+
+    // NOTE(review): CalibMode_ set here is overwritten with ISMRMRD_other by the
+    // "calib == other" test further down; only the acceFactorE1_ change survives - confirm the intent.
+    if ( other_ && acceFactorE1_==1 && acceFactorE2_==1 )
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "Colibration mode is changed to ISMRMRD_interleaved");
+        CalibMode_ = Gadgetron::gtPlus::ISMRMRD_interleaved;
+        acceFactorE1_ = 2;
+    }
+
+    if ( interleaved_ )
+    {
+        CalibMode_ = Gadgetron::gtPlus::ISMRMRD_interleaved;
+
+        if ( p_imaging.interleavingDimension().present() )
+        {
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::phase )
+            {
+                InterleaveDim_ = Gadgetron::gtPlus::DIM_Phase;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::repetition )
+            {
+                InterleaveDim_ = Gadgetron::gtPlus::DIM_Repetition;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::average )
+            {
+                InterleaveDim_ = Gadgetron::gtPlus::DIM_Average;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::contrast )
+            {
+                InterleaveDim_ = Gadgetron::gtPlus::DIM_Contrast;
+            }
+
+            if ( *(p_imaging.interleavingDimension()) == ISMRMRD::interleavingDimensionType::other )
+            {
+                InterleaveDim_ = Gadgetron::gtPlus::DIM_other1;
+            }
+
+            GADGET_CONDITION_MSG(verboseMode_, "InterleaveDim is " << gtPlus_util_.getISMRMRDDimName(InterleaveDim_));
+        }
+    }
+
+    if ( embedded_ )
+    {
+        CalibMode_ = Gadgetron::gtPlus::ISMRMRD_embedded;
+    }
+
+    if ( separate_ )
+    {
+        CalibMode_ = Gadgetron::gtPlus::ISMRMRD_separate;
+    }
+
+    if ( calib == ISMRMRD::calibrationModeType::external )
+    {
+        CalibMode_ = Gadgetron::gtPlus::ISMRMRD_external;
+    }
+
+    if ( calib == ISMRMRD::calibrationModeType::other )
+    {
+        CalibMode_ = Gadgetron::gtPlus::ISMRMRD_other;
+    }
+
+    // generate the destination folder
+    // this function returns int: a failure must be reported as GADGET_FAIL, not as "false"
+    if ( !debugFolder_.empty() )
+    {
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder_, debugFolder_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    if ( !debugFolder2_.empty() )
+    {
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder2_, debugFolder2_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder2 is not set ...");
+    }
+
+    return GADGET_OK;
+}
+
+/// Build the full debug folder path as "$GADGETRON_HOME/<debugFolder>/".
+///
+/// @param debugFolder      folder name, relative to GADGETRON_HOME
+/// @param debugFolderPath  receives the full path with a trailing '/'; cleared on failure
+/// @return true on success; false if the environment variable GADGETRON_HOME is not set
+bool GtPlusReconGadget::
+generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath)
+{
+    // getenv returns NULL for an unset variable; assigning NULL to a std::string is undefined
+    const char* gtHome = ACE_OS::getenv("GADGETRON_HOME");
+    if ( gtHome == NULL )
+    {
+        debugFolderPath.clear();
+        GADGET_ERROR_MSG("GADGETRON_HOME is not set, cannot generate the debug folder path ... ");
+        return false;
+    }
+
+    debugFolderPath = gtHome;
+    debugFolderPath.append("/");
+    debugFolderPath.append(debugFolder);
+    debugFolderPath.append("/");
+    GADGET_CONDITION_MSG(verboseMode_, "Debug folder is " << debugFolderPath);
+    return true;
+}
+
+/// Format the current local time as "_<weekday>_<day>_<month>_<year>_<hour>_<min>_<sec>"
+/// (strftime pattern "_%a_%d_%b_%Y_%H_%M_%S").
+///
+/// @param procTime  receives the formatted time stamp; set to an empty string if the local
+///                  time cannot be obtained or formatted
+void GtPlusReconGadget::
+getCurrentMoment(std::string& procTime)
+{
+    procTime.clear();
+
+    time_t mytime = time(NULL);
+
+    // localtime may return NULL; NOTE: it hands back shared static storage, so the result
+    // is consumed immediately
+    struct tm* mytm = localtime(&mytime);
+    if ( mytm == NULL )
+    {
+        return;
+    }
+
+    // strftime returns 0 when nothing usable was written; the buffer must not be read then
+    char timestamp[100];
+    if ( strftime(timestamp, sizeof(timestamp),"_%a_%d_%b_%Y_%H_%M_%S",mytm) > 0 )
+    {
+        procTime = timestamp;
+    }
+}
+
+/// Base-class processing step: counts the call, records the dimensions of the work order data
+/// in dimensions_ and releases the incoming message m1.
+///
+/// @param m1  image array message; released before returning
+/// @param m2  work order message; only the dimensions of its data_ array are read here
+/// @return GADGET_OK
+int GtPlusReconGadget::process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2)
+{
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconGadget::process(...) starts ... ");
+
+    processed_called_times_++;
+
+    boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->data_.get_dimensions();
+
+    // the log line below indexes ten dimensions; do not read past the end of a shorter vector
+    if ( dims->size() >= 10 )
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+            << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+            << " " << (*dims)[5] << " " << (*dims)[6] << " " << (*dims)[7] << " " << (*dims)[8] << " " << (*dims)[9] << "]");
+    }
+    else
+    {
+        GADGET_WARN_MSG("GtPlusReconGadget::process(...), work order data has fewer than 10 dimensions ... ");
+    }
+
+    dimensions_ = *dims;
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconGadget::process(...) ends ... ");
+
+    m1->release();
+    return GADGET_OK;
+}
+
+/// Compute the 1-based image number of an image within its series.
+///
+/// The number is the linear index of (repetition, set, phase, contrast, slice, e2, cha), with
+/// repetition varying slowest and cha fastest, plus one. The sizes of the set, phase, contrast
+/// and slice dimensions are taken from meas_max_idx_ (+1 each).
+///
+/// @param imheader  image header providing repetition, set, phase, contrast and slice
+/// @param nCHA      number of channels
+/// @param cha       channel index
+/// @param nE2       number of E2 partitions; 0 is treated as 1
+/// @param e2        E2 partition index
+/// @return the image number, starting at 1
+int GtPlusReconGadget::computeSeriesImageNumber (ISMRMRD::ImageHeader& imheader, size_t nCHA, size_t cha, size_t nE2, size_t e2)
+{
+    const size_t setCount      = static_cast<size_t>(meas_max_idx_.set+1);
+    const size_t phaseCount    = static_cast<size_t>(meas_max_idx_.phase+1);
+    const size_t sliceCount    = static_cast<size_t>(meas_max_idx_.slice+1);
+    const size_t contrastCount = static_cast<size_t>(meas_max_idx_.contrast+1);
+    const size_t e2Count       = (nE2 == 0) ? 1 : nE2;
+
+    // nested (Horner) evaluation of the same mixed-radix index
+    size_t linearIndex = static_cast<size_t>(imheader.repetition);
+    linearIndex = linearIndex*setCount      + static_cast<size_t>(imheader.set);
+    linearIndex = linearIndex*phaseCount    + static_cast<size_t>(imheader.phase);
+    linearIndex = linearIndex*contrastCount + static_cast<size_t>(imheader.contrast);
+    linearIndex = linearIndex*sliceCount    + static_cast<size_t>(imheader.slice);
+    linearIndex = linearIndex*e2Count       + e2;
+    linearIndex = linearIndex*nCHA          + cha;
+
+    return static_cast<int>(linearIndex + 1);
+}
+
+/// Decide on which side zeros are needed, from the centre index and the number of samples.
+///
+/// @param centreNo      centre sample index
+/// @param sampleNo      number of samples
+/// @param PrePostZeros  output: 0 - no zeros (sampleNo <= 1 or 2*centreNo == sampleNo),
+///                      1 - pre zeros (2*centreNo < sampleNo),
+///                      2 - post zeros (2*centreNo > sampleNo)
+/// @return always true
+bool GtPlusReconGadget::
+addPrePostZeros(int centreNo, int sampleNo, int& PrePostZeros)
+{
+    PrePostZeros = 0;
+
+    if ( sampleNo > 1 )
+    {
+        const int twiceCentre = 2*centreNo;
+
+        if ( twiceCentre < sampleNo )
+        {
+            PrePostZeros = 1;
+        }
+        else if ( twiceCentre > sampleNo )
+        {
+            PrePostZeros = 2;
+        }
+    }
+
+    return true;
+}
+
+/// Scale a magnitude array in place by scalingFactor_.
+///
+/// When scalingFactor_ is negative and no constant factor is in use, the factor is first
+/// derived from the maximal absolute intensity of the data (of the whole array for up to 24
+/// 2D frames, otherwise of the 24 frames around the middle). Otherwise the factor already
+/// stored in scalingFactor_ is applied as it is.
+///
+/// @param mag  magnitude data, first two dimensions are RO and E1; scaled in place
+/// @return true on success, false if the maximum search or the scaling fails
+bool GtPlusReconGadget::
+scalingMagnitude(hoNDArray<float>& mag)
+{
+    const bool deriveFromData = ( scalingFactor_ < 0 && !use_constant_scalingFactor_ );
+
+    if ( !deriveFromData )
+    {
+        // a factor is already available, apply it directly
+        GADGET_CONDITION_MSG(verboseMode_, "Using the fixed intensity scaling factor - scaling factor has been preset to be : " << scalingFactor_ << " ... ");
+        GADGET_CHECK_RETURN_FALSE(scal((float)scalingFactor_, mag));
+        return true;
+    }
+
+    // find the maximal absolute intensity used to pick the factor
+    float peak;
+    size_t peakInd;
+
+    const size_t nRO = mag.get_size(0);
+    const size_t nE1 = mag.get_size(1);
+    const size_t frames = mag.get_number_of_elements()/(nRO*nE1);
+
+    if ( frames > 24 )
+    {
+        // only look at the 24 frames around the middle of the array
+        hoNDArray<float> magPartial(nRO, nE1, 24, mag.get_data_ptr()+(frames/2 - 12)*nRO*nE1);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::maxAbsolute(magPartial, peak, peakInd));
+    }
+    else
+    {
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::maxAbsolute(mag, peak, peakInd));
+    }
+
+    if ( peak < FLT_EPSILON )
+    {
+        peak = 1.0f;
+    }
+
+    const bool outsideRange = (peak<min_intensity_value_) || (peak>max_intensity_value_);
+
+    if ( !outsideRange )
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "Using the fixed intensity scaling factor - must have noise prewhitening performed ... ");
+        scalingFactor_ = SNR_NOISEFLOOR_SCALEFACTOR;
+
+        // halve the factor while the scaled maximum overflows and the factor is still >= 2
+        while ( (peak*scalingFactor_ > max_intensity_value_) && (scalingFactor_>=2) )
+        {
+            scalingFactor_ /= 2;
+        }
+
+        if (peak*scalingFactor_ > max_intensity_value_)
+        {
+            GADGET_CONDITION_MSG(verboseMode_, "The fixed intensity scaling factor leads to dynamic range overflow - switch to dyanmic intensity scaling ... ");
+            scalingFactor_ = (float)(max_intensity_value_)/peak;
+        }
+
+        use_constant_scalingFactor_ = true;
+    }
+    else
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "Using the dynamic intensity scaling factor - may not have noise prewhitening performed ... ");
+        scalingFactor_ = (float)(max_intensity_value_US_)/peak;
+    }
+
+    GADGET_CONDITION_MSG(verboseMode_, "scalingFactor_ : " << scalingFactor_);
+    GADGET_CHECK_RETURN_FALSE(scal((float)scalingFactor_, mag));
+
+    return true;
+}
+
+/// Generate the k-space filters of a work order.
+///
+/// Three groups of filters are created and stored in workOrder:
+///   - image data filters along RO/E1/E2 (filterRO_, filterE1_, filterE2_), for every dimension
+///     longer than 1 whose filter type is not ISMRMRD_FILTER_NONE;
+///   - reference data filters (filterRO_ref_, filterE1_ref_, filterE2_ref_), only if ref_ is not
+///     empty; for separate/external calibration the E1/E2 filters span the detected sampled
+///     region, otherwise they span the full length and are placed on the sampled lines;
+///   - partial fourier filters (filterXX_partialfourier_), for every dimension longer than 1
+///     with start >= 0 and end > 0.
+/// Every generated filter is also handed to GADGET_EXPORT_ARRAY_COMPLEX with debugFolder_fullPath_.
+///
+/// @param workOrder  work order providing data_, ref_, the calibration mode and the sampled
+///                   ranges; receives the filters
+/// @return true on success; false if a filter cannot be generated or an exception is caught
+bool GtPlusReconGadget::
+generateKSpaceFilter(WorkOrderType& workOrder)
+{
+    try
+    {
+        size_t RO = workOrder.data_.get_size(0);
+        size_t E1 = workOrder.data_.get_size(1);
+        size_t E2 = workOrder.data_.get_size(4);
+
+        size_t RO_ref = workOrder.ref_.get_size(0);
+        size_t E1_ref = workOrder.ref_.get_size(1);
+        size_t E2_ref = workOrder.ref_.get_size(4);
+
+        // for the interleaved mode the reference sizes follow the image data sizes
+        if ( workOrder.CalibMode_ == Gadgetron::gtPlus::ISMRMRD_interleaved )
+        {
+            RO_ref = RO;
+            E1_ref = E1;
+            E2_ref = E2;
+        }
+
+        // image data filter
+        if ( RO>1 && filterRO_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterRO_.create(RO);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(RO, workOrder.filterRO_, filterRO_type_, filterRO_sigma_, std::ceil(filterRO_width_*RO)));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterRO_, "filterRO");
+        }
+
+        if ( E1>1 && filterE1_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterE1_.create(E1);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E1, workOrder.filterE1_, filterE1_type_, filterE1_sigma_, std::ceil(filterE1_width_*E1)));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_, "filterE1");
+        }
+
+        if ( E2>1 && filterE2_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterE2_.create(E2);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E2, workOrder.filterE2_, filterE2_type_, filterE2_sigma_, std::ceil(filterE2_width_*E2)));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_, "filterE2");
+        }
+
+        // ref data filter
+        if ( workOrder.ref_.get_number_of_elements() > 0 )
+        {
+            size_t startRO(0), endRO(0), startE1(0), endE1(0), startE2(0), endE2(0);
+            if ( E2_ref == 1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion2D(workOrder.ref_, startRO, endRO, startE1, endE1));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion3D(workOrder.ref_, startRO, endRO, startE1, endE1, startE2, endE2));
+            }
+
+            if ( (workOrder.CalibMode_ == ISMRMRD_interleaved) || (workOrder.CalibMode_ == ISMRMRD_embedded) )
+            {
+                // use the image data sample range
+                // the sign is tested on the work order value itself: once stored in the unsigned
+                // startRO/endRO a negative value can no longer be detected with "< 0"
+                startRO = ( workOrder.start_RO_ < 0 ) ? 0 : (size_t)(workOrder.start_RO_);
+                endRO = ( workOrder.end_RO_ < 0 ) ? (RO_ref-1) : (size_t)(workOrder.end_RO_);
+            }
+
+            if ( RO_ref > 1 && filterRO_ref_type_ != ISMRMRD_FILTER_NONE )
+            {
+                workOrder.filterRO_ref_.create(RO_ref);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO_ref, startRO, endRO, workOrder.filterRO_ref_, filterRO_ref_type_, filterRO_ref_sigma_, std::ceil(filterRO_ref_width_*RO_ref)));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterRO_ref_, "filterRO_ref");
+            }
+
+            if ( (workOrder.CalibMode_ == ISMRMRD_separate) || (workOrder.CalibMode_ == ISMRMRD_external) )
+            {
+                // the filter covers only the detected sampled region
+                if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = endE1-startE1+1;
+                    workOrder.filterE1_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, std::ceil(filterE1_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_ref_, "filterE1_ref");
+                }
+
+                if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = endE2-startE2+1;
+                    workOrder.filterE2_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, std::ceil(filterE2_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_ref_, "filterE2_ref");
+                }
+            }
+            else
+            {
+                // this makes sure for interleaved and embedded, the kspace filter is applied at correct lines
+                if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = E1_ref;
+                    workOrder.filterE1_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE1, endE1, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, std::ceil(filterE1_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_ref_, "filterE1_ref");
+                }
+
+                if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = E2_ref;
+                    workOrder.filterE2_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE2, endE2, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, std::ceil(filterE2_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_ref_, "filterE2_ref");
+                }
+            }
+        }
+
+        // partial fourier handling filter
+        if ( RO>1 && workOrder.start_RO_>=0 && workOrder.end_RO_>0 )
+        {
+            workOrder.filterRO_partialfourier_.create(RO);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder.start_RO_, workOrder.end_RO_, workOrder.filterRO_partialfourier_, filterRO_pf_type_, std::ceil(filterRO_pf_width_*RO), filterRO_pf_densityComp_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterRO_partialfourier_, "filterRO_partialfourier");
+        }
+
+        if ( E1>1 && workOrder.start_E1_>=0 && workOrder.end_E1_>0 )
+        {
+            workOrder.filterE1_partialfourier_.create(E1);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder.start_E1_, workOrder.end_E1_, workOrder.filterE1_partialfourier_, filterE1_pf_type_, std::ceil(filterE1_pf_width_*E1), filterE1_pf_densityComp_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_partialfourier_, "filterE1_partialfourier");
+        }
+
+        if ( E2>1 && workOrder.start_E2_>=0 && workOrder.end_E2_>0 )
+        {
+            workOrder.filterE2_partialfourier_.create(E2);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E2, workOrder.start_E2_, workOrder.end_E2_, workOrder.filterE2_partialfourier_, filterE2_pf_type_, std::ceil(filterE2_pf_width_*E2), filterE2_pf_densityComp_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_partialfourier_, "filterE2_partialfourier");
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconGadget::generateKSpaceFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Fetch the stored image header for the given indexes and, for 3D data (E2 > 1), recompute
+/// the image position of the requested partition.
+///
+/// For E2 > 1 a header with a non-zero slice direction is looked up (starting with the header
+/// of the requested partition, then the partitions from E2/2 upwards). Its position and slice
+/// direction are used to compute the position of partition e2 with the slice spacing
+/// aSpacing_[2]; field_of_view[2] is set to aSpacing_[2] and user_int[0] to e2.
+///
+/// @param images       stored image headers; matrix_size[4] is taken as the number of partitions
+/// @param imageHeader  receives the header
+/// @param slc,e2,con,phs,rep,set  indexes of the image; an e2 outside [0, E2) is replaced by E2/2
+/// @param seg          not used by the active code
+/// @param maxE2        not used by the active code
+/// @return always true
+bool GtPlusReconGadget::
+recomputeImageGeometry(GtPlusGadgetImageArray* images, GtPlusGadgetImageExt& imageHeader, int slc, int e2, int con, int phs, int rep, int set, int seg, int maxE2)
+{
+    size_t E2 = images->matrix_size[4];
+
+    // if FOV are the same, return the stored image header, take care of E2 resizing
+    //if ( GT_ABS(field_of_view_recon_[2]-field_of_view_encoding_[2])<0.1 )
+    //{
+    //    if ( maxE2 == E2 ) // no E2 resizing
+    //    {
+    //        int offset = images->get_offset(slc, e2, con, phs, rep, set, 0);
+    //        imageHeader = images->imageArray_[offset];
+    //    }
+    //    else
+    //    {
+    //        double e2_sampled = e2*E2/(double)maxE2;
+
+    //        size_t e2_lower = std::floor(e2_sampled);
+    //        if ( e2_lower >= E2 ) e2_lower = E2-1;
+
+    //        size_t e2_higher = std::ceil(e2_sampled);
+    //        if ( e2_higher >= E2 ) e2_higher = E2-1;
+
+    //        GtPlusGadgetImageExt imageHeader_lower, imageHeader_higher;
+
+    //        if ( e2_lower == e2_higher )
+    //        {
+    //            int offset = images->get_offset(slc, e2_lower, con, phs, rep, set, 0);
+    //            imageHeader.copy(images->imageArray_[offset]);
+    //        }
+    //        else
+    //        {
+    //            int offset_lower = images->get_offset(slc, e2_lower, con, phs, rep, set, 0);
+    //            imageHeader_lower.copy(images->imageArray_[offset_lower]);
+
+    //            int offset_higher = images->get_offset(slc, e2_higher, con, phs, rep, set, 0);
+    //            imageHeader_higher.copy(images->imageArray_[offset_higher]);
+
+    //            imageHeader = imageHeader_lower;
+    //            imageHeader.recomputeHeader(imageHeader_higher, e2_higher-e2_sampled);
+    //        }
+    //    }
+    //}
+    //else
+    //{
+        // need to recompute image geometry
+        // no need to consider RO and E1, because image position vector points to the image center
+
+        // a negative or too large partition index falls back to the middle partition
+        // (the former mixed signed/unsigned comparison did the same, only implicitly)
+        if ( e2 < 0 || (size_t)e2 >= E2 ) e2 = (int)(E2/2);
+
+        int offsetCurr = images->get_offset(slc, e2, con, phs, rep, set, 0);
+        imageHeader = images->imageArray_[offsetCurr];
+
+        // find the center partition
+        if ( E2 > 1 )
+        {
+            // look for a header with a non-zero slice direction, from the middle partition on;
+            // the search stops at the last partition instead of running past the stored headers
+            int midE2 = (int)(E2/2);
+
+            while ( midE2 < (int)E2 
+                && GT_ABS(imageHeader.slice_dir[0])<1e-6 && GT_ABS(imageHeader.slice_dir[1])<1e-6 && GT_ABS(imageHeader.slice_dir[2])<1e-6 )
+            {
+                int offset = images->get_offset(slc, midE2, con, phs, rep, set, 0);
+                imageHeader = images->imageArray_[offset];
+                midE2++;
+            }
+
+            if ( GT_ABS(imageHeader.slice_dir[0])<1e-6 && GT_ABS(imageHeader.slice_dir[1])<1e-6 && GT_ABS(imageHeader.slice_dir[2])<1e-6 )
+            {
+                GADGET_WARN_MSG("recomputeImageGeometry, cannot find a partition with non-zero slice direction ... ");
+            }
+
+            // position vector for the center partition
+            float posVec[3];
+            posVec[0] = imageHeader.position[0];
+            posVec[1] = imageHeader.position[1];
+            posVec[2] = imageHeader.position[2];
+
+            // slice direction
+            float sliceVec[3];
+            sliceVec[0] = imageHeader.slice_dir[0];
+            sliceVec[1] = imageHeader.slice_dir[1];
+            sliceVec[2] = imageHeader.slice_dir[2];
+
+            midE2 = E2/2;
+
+            // compute slice position vector for this partition
+            float posVecCurr[3];
+            posVecCurr[0] = posVec[0] + aSpacing_[2]*sliceVec[0]*(e2-midE2+0.5);
+            posVecCurr[1] = posVec[1] + aSpacing_[2]*sliceVec[1]*(e2-midE2+0.5);
+            posVecCurr[2] = posVec[2] + aSpacing_[2]*sliceVec[2]*(e2-midE2+0.5);
+
+            imageHeader.position[0] = posVecCurr[0];
+            imageHeader.position[1] = posVecCurr[1];
+            imageHeader.position[2] = posVecCurr[2];
+
+            GADGET_CONDITION_MSG(verboseMode_, "--> image position : [" << imageHeader.position[0] << " , " << imageHeader.position[1] << " , " << imageHeader.position[2] << "]");
+
+            imageHeader.field_of_view[2] = aSpacing_[2];
+
+            imageHeader.user_int[0] = e2;
+        }
+
+        if ( imageHeader.measurement_uid == 0 )
+        {
+            GADGET_WARN_MSG("imageHeader.measurement_uid == 0");
+        }
+
+        /*double e2_sampled = 0;
+        double coord_in_encoding_space = field_of_view_recon_[2]*e2/maxE2 + (field_of_view_encoding_[2]/2-field_of_view_recon_[2]/2);
+        e2_sampled = E2 * coord_in_encoding_space/field_of_view_encoding_[2];
+
+        if ( e2_sampled < 0 )
+        {
+            int offset = images->get_offset(slc, 0, con, phs, rep, set, 0);
+            imageHeader.copy(images->imageArray_[offset]);
+        }
+        else if ( e2_sampled > E2-1 )
+        {
+            int offset = images->get_offset(slc, E2-1, con, phs, rep, set, 0);
+            imageHeader.copy(images->imageArray_[offset]);
+        }
+        else
+        {
+            size_t e2_lower = std::floor(e2_sampled);
+            if ( e2_lower >= E2 ) e2_lower = E2-1;
+
+            size_t e2_higher = std::ceil(e2_sampled);
+            if ( e2_higher >= E2 ) e2_higher = E2-1;
+
+            GtPlusGadgetImageExt imageHeader_lower, imageHeader_higher;
+
+            if ( e2_lower == e2_higher )
+            {
+                int offset = images->get_offset(slc, e2_lower, con, phs, rep, set, 0);
+                imageHeader.copy(images->imageArray_[offset]);
+            }
+            else
+            {
+                int offset_lower = images->get_offset(slc, e2_lower, con, phs, rep, set, 0);
+                imageHeader_lower.copy(images->imageArray_[offset_lower]);
+
+                int offset_higher = images->get_offset(slc, e2_higher, con, phs, rep, set, 0);
+                imageHeader_higher.copy(images->imageArray_[offset_higher]);
+
+                imageHeader.copy(imageHeader_lower);
+                imageHeader.recomputeHeader(imageHeader_higher, e2_higher-e2_sampled);
+            }
+        }*/
+    //}
+
+    return true;
+}
+
+bool GtPlusReconGadget::
+sendOutReconMag(GtPlusGadgetImageArray* images, const hoNDArray<float>& res, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = res.get_dimensions();
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t CHA = (*dims)[2];
+        size_t SLC = (*dims)[3];
+        size_t E2 = (*dims)[4];
+        size_t CON = (*dims)[5];
+        size_t PHS = (*dims)[6];
+        size_t REP = (*dims)[7];
+        size_t SET = (*dims)[8];
+
+        GADGET_CONDITION_MSG(true, "sending out images, acquisition boundary [RO E1 CHA SLC E2 CON PHS REP SET] = [" 
+                                                                      << RO << " " << E1 << " " << CHA << " " 
+                                                                      << SLC << " " << E2 << " " << CON << " " 
+                                                                      << PHS << " " << REP << " " << SET << "] " );
+
+        size_t set(0), rep(0), phs(0), con(0), e2(0), slc(0), cha(0), seg(0);
+        // size_t set_sInd(0), rep_sInd(0), phs_sInd(0), con_sInd(0), e2_sInd(0), slc_sInd(0);
+
+        for ( set=0; set<SET; set++ )
+        {
+            // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findDimIndex(dimStartingIndexes, DIM_Set, set_sInd));
+
+            for ( rep=0; rep<REP; rep++ )
+            {
+                // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findDimIndex(dimStartingIndexes, DIM_Repetition, rep_sInd));
+
+                for ( phs=0; phs<PHS; phs++ )
+                {
+                    // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findDimIndex(dimStartingIndexes, DIM_Phase, phs_sInd));
+
+                    for ( con=0; con<CON; con++ )
+                    {
+                        // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findDimIndex(dimStartingIndexes, DIM_Contrast, con_sInd));
+
+                        for ( e2=0; e2<E2; e2++ )
+                        {
+                            // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findDimIndex(dimStartingIndexes, DIM_Encoding2, e2_sInd));
+
+                            for ( slc=0; slc<SLC; slc++ )
+                            {
+                                // GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findDimIndex(dimStartingIndexes, DIM_Slice, slc_sInd));
+
+                                GtPlusGadgetImageExt imageHeaderSent;
+                                GADGET_CHECK_RETURN_FALSE(recomputeImageGeometry(images, imageHeaderSent, slc, e2, con, phs, rep, set, 0, E2));
+
+                                //int offset = images->get_offset(slc, e2, con, phs, rep, set, 0);
+                                //imageHeaderSent = images->imageArray_[offset];
+
+                                if ( imageHeaderSent.measurement_uid == 0 )
+                                {
+                                    continue;
+                                }
+
+                                for ( cha=0; cha<CHA; cha++ )
+                                {
+                                    Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = new Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>();
+                                    *cm1->getObjectPtr() = imageHeaderSent;
+
+                                    cm1->getObjectPtr()->flags = 0;
+                                    cm1->getObjectPtr()->image_data_type = ISMRMRD::DATA_FLOAT;
+                                    cm1->getObjectPtr()->image_type = ISMRMRD::TYPE_MAGNITUDE;
+
+                                    // image number and image series
+                                    cm1->getObjectPtr()->image_index = computeSeriesImageNumber (*cm1->getObjectPtr(), CHA, cha, E2, e2);
+                                    cm1->getObjectPtr()->image_series_index = seriesNum;
+                                    // GADGET_CONDITION_MSG(verboseMode_, "image number " << cm1->getObjectPtr()->image_index << "    image series " << cm1->getObjectPtr()->image_series_index << " ... ");
+
+                                    // set the time stamp
+                                    // the time stamp of the first readout line in this 2D kspace is used
+
+                                    Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<float> >* cm2 = new Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<float> >();
+                                    cm1->cont(cm2);
+
+                                    std::vector<size_t> img_dims(2);
+                                    img_dims[0] = RO;
+                                    img_dims[1] = E1;
+
+                                    //Fixing array dimensions (MSH)
+                                    cm1->getObjectPtr()->matrix_size[0] = RO;
+                                    cm1->getObjectPtr()->matrix_size[1] = E1;
+                                    cm1->getObjectPtr()->matrix_size[2] = 1;
+                                    cm1->getObjectPtr()->channels = 1;
+
+                                    try
+                                    {
+                                        cm2->getObjectPtr()->create(&img_dims);
+                                        Gadgetron::clear(cm2->getObjectPtr());
+                                    }
+                                    catch(...)
+                                    {
+                                        GADGET_DEBUG1("Unable to allocate new image\n");
+                                        cm1->release();
+                                        return false;
+                                    }
+
+                                    std::vector<size_t> ind(9, 0);
+                                    ind[2] = cha;
+                                    ind[3] = slc;
+                                    ind[4] = e2;
+                                    ind[5] = con;
+                                    ind[6] = phs;
+                                    ind[7] = rep;
+                                    ind[8] = set;
+
+                                    memcpy(cm2->getObjectPtr()->begin(), res.begin()+res.calculate_offset(ind), sizeof(float)*RO*E1);
+
+                                    if ( !debugFolder2_fullPath_.empty() )
+                                    {
+                                        std::ostringstream ostr;
+                                        ostr << prefix << "_" << cm1->getObjectPtr()->image_index;
+                                        GADGET_EXPORT_ARRAY(debugFolder2_fullPath_, gt_exporter_, *cm2->getObjectPtr(), ostr.str());
+
+                                        //hoNDArray<unsigned short> imageUS2D;
+                                        //imageUS2D.copyFrom(*cm2->getObjectPtr());
+                                        //std::ostringstream ostr2;
+                                        //ostr2 << prefix << "_US_" << cm1->getObjectPtr()->image_index;
+                                        //GADGET_EXPORT_ARRAY(debugFolder2_fullPath_, gt_exporter_, imageUS2D, ostr2.str());
+                                    }
+
+                                    GADGET_CONDITION_MSG(true, "sending out 2D image [CHA SLC E2 CON PHS REP SET] = [" 
+                                                                      << cha << " " 
+                                                                      << cm1->getObjectPtr()->slice << " " 
+                                                                      << e2 << " " 
+                                                                      << cm1->getObjectPtr()->contrast << " " 
+                                                                      << cm1->getObjectPtr()->phase << " " 
+                                                                      << cm1->getObjectPtr()->repetition << " " 
+                                                                      << cm1->getObjectPtr()->set << "] \t" 
+                                                                      << " -- Image number -- " << cm1->getObjectPtr()->image_index);
+
+                                    // send out the images
+                                    if (this->next()->putq(cm1) < 0) 
+                                    {
+                                        return false;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconGadget::sendOutReconMag(float) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Send out a complex-valued reconstruction result as magnitude images.
+//
+// The magnitude of res is computed with Gadgetron::absolute(), rescaled with
+// scalingMagnitude() and then passed to sendOutReconMag() together with the
+// unchanged images, seriesNum, dimStartingIndexes and prefix arguments.
+//
+// Returns true when all three steps complete; false when an exception is caught
+// (presumably also when one of the GADGET_CHECK_RETURN_FALSE checks fails -- the
+// macro is defined outside this file, TODO confirm).
+bool GtPlusReconGadget::
+sendOutRecon(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix)
+{
+    try
+    {
+        // magnitude buffer with the same dimensions as the complex input
+        hoNDArray<float> mag(res.get_dimensions());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(res, mag));
+        GADGET_CHECK_RETURN_FALSE(scalingMagnitude(mag));
+        GADGET_CHECK_RETURN_FALSE(this->sendOutReconMag(images, mag, seriesNum, dimStartingIndexes, prefix));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconGadget::sendOutRecon(ValueType) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Complex-valued overload of sendOutRecon2D.
+//
+// Computes the magnitude of res (Gadgetron::absolute), rescales it with
+// scalingMagnitude() and forwards the result to the float overload of
+// sendOutRecon2D with the same images, seriesNum and imageNum.
+//
+// Returns true when all steps complete; false when an exception is caught
+// (presumably also when a GADGET_CHECK_RETURN_FALSE check fails -- TODO confirm
+// against the macro definition).
+bool GtPlusReconGadget::sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, int imageNum)
+{
+    try
+    {
+        // extract the magnitude
+        hoNDArray<float> mag(res.get_dimensions());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(res, mag));
+        GADGET_CHECK_RETURN_FALSE(scalingMagnitude(mag));
+        // mag is hoNDArray<float>, so this resolves to the float overload
+        GADGET_CHECK_RETURN_FALSE(sendOutRecon2D(images, mag, seriesNum, imageNum));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Exceptions happened in GtPlusReconGadget::sendOutRecon2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Send a single 2D float image to the next gadget as a magnitude image.
+//
+// The image header is copied from images->imageArray_[0] and then overwritten
+// with the magnitude/float type, the given image and series numbers and a
+// [size(0) x size(1) x 1], single-channel matrix size. The first
+// size(0)*size(1) floats of res are copied into the outgoing data message.
+//
+// images    : source of the header template; must not be null
+// res       : image data; only the first 2D plane is sent
+// seriesNum : value written to image_series_index
+// imageNum  : value written to image_index
+//
+// Returns true when the message was queued on the next gadget. On every
+// failure path the message chain that is still owned here is released before
+// returning false.
+bool GtPlusReconGadget::sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<float>& res, int seriesNum, int imageNum)
+{
+    // Head of the outgoing message chain. It is declared outside the try block and
+    // reset to 0 whenever ownership ends, so that the catch handler can release a
+    // chain that would otherwise leak.
+    Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 = 0;
+
+    try
+    {
+        if ( !images )
+        {
+            GADGET_ERROR_MSG("GtPlusReconGadget::sendOutRecon2D(float) : images is NULL ... ");
+            return false;
+        }
+
+        cm1 = new Gadgetron::GadgetContainerMessage<ISMRMRD::ImageHeader>();
+        *cm1->getObjectPtr() = images->imageArray_[0];
+
+        cm1->getObjectPtr()->flags = 0;
+        cm1->getObjectPtr()->image_data_type = ISMRMRD::DATA_FLOAT;
+        cm1->getObjectPtr()->image_type = ISMRMRD::TYPE_MAGNITUDE;
+
+        // image number and image series
+        cm1->getObjectPtr()->image_index = imageNum;
+        cm1->getObjectPtr()->image_series_index = seriesNum;
+
+        // the data message is chained to the header, so releasing cm1 releases both
+        Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<float> >* cm2 = new Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray<float> >();
+        cm1->cont(cm2);
+
+        std::vector<size_t> img_dims(2);
+        img_dims[0] = res.get_size(0);
+        img_dims[1] = res.get_size(1);
+
+        //Fixing array dimensions (MSH)
+        cm1->getObjectPtr()->matrix_size[0] = res.get_size(0);
+        cm1->getObjectPtr()->matrix_size[1] = res.get_size(1);
+        cm1->getObjectPtr()->matrix_size[2] = 1;
+        cm1->getObjectPtr()->channels = 1;
+
+        try
+        {
+            cm2->getObjectPtr()->create(&img_dims);
+        }
+        catch(...)
+        {
+            GADGET_DEBUG1("Unable to allocate new image\n");
+            cm1->release();
+            cm1 = 0;
+            return false;
+        }
+
+        memcpy(cm2->getObjectPtr()->begin(), res.begin(), sizeof(float)*res.get_size(0)*res.get_size(1));
+
+        if ( !debugFolder2_fullPath_.empty() )
+        {
+            std::ostringstream ostr;
+            ostr << "SentImage2D" << "_" << cm1->getObjectPtr()->image_index;
+            GADGET_EXPORT_ARRAY(debugFolder2_fullPath_, gt_exporter_, *cm2->getObjectPtr(), ostr.str());
+        }
+
+        // send out the images
+        if (this->next()->putq(cm1) < 0)
+        {
+            // the message was not queued, so it is still owned here
+            cm1->release();
+            cm1 = 0;
+            return false;
+        }
+
+        // queued successfully: the downstream gadget owns the message now
+        cm1 = 0;
+    }
+    catch(...)
+    {
+        if ( cm1 )
+        {
+            cm1->release();
+        }
+
+        GADGET_ERROR_MSG("Errors in GtPlusReconGadget::sendOutRecon2D(float) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconGadget.h b/gadgets/gtPlus/GtPlusReconGadget.h
new file mode 100644
index 0000000..7463f93
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconGadget.h
@@ -0,0 +1,271 @@
+/** \file   GtPlusReconGadget.h
+    \brief  This is the base class gadget for both 2DT and 3DT reconstruction.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "hoNDArray_utils.h"
+
+#include "GtPlusGadgetImageArray.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+
+#include "GadgetStreamController.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+#define SNR_NOISEFLOOR_SCALEFACTOR 8
+
+namespace Gadgetron
+{
+
+// Gadget taking a GtPlusGadgetImageArray (first message) and a gtPlus recon work
+// order of std::complex<float> (second message). It declares the shared recon
+// parameters (filters, matrix sizes, cloud settings, debug folders) and the
+// helpers that send reconstructed images downstream (sendOutRecon*,
+// sendOutReconMag). Most state is exposed as public data members.
+class EXPORTGTPLUSGADGET GtPlusReconGadget : public Gadgetron::Gadget2< GtPlusGadgetImageArray, Gadgetron::gtPlus::gtPlusReconWorkOrder<std::complex<float> > >
+{
+public:
+    GADGET_DECLARE(GtPlusReconGadget);
+
+    // pixel type of the data being reconstructed
+    typedef std::complex<float> ValueType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    typedef Gadget2< GtPlusGadgetImageArray, WorkOrderType > BaseClass;
+
+    // (dimension, index) pair, used for the dimStartingIndexes arguments below
+    typedef std::pair<Gadgetron::gtPlus::ISMRMRDDIM, size_t> DimensionRecordType;
+
+    // description of one cloud node / of the whole cloud
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType>::CloudNodeType CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    GtPlusReconGadget();
+    ~GtPlusReconGadget();
+
+    // image series number
+    int image_series_;
+
+    // the min/max dynamic range of magnitude images
+    size_t min_intensity_value_;
+    size_t max_intensity_value_;
+
+    // maximal intensity value when converted to unsigned short
+    size_t max_intensity_value_US_;
+
+    // scaling factor for recon results
+    double scalingFactor_;
+
+    // whether to use the fixed intensity scaling factor
+    bool use_constant_scalingFactor_;
+
+    // time stamp resolution (default, 2.5ms)
+    float timeStampResolution_;
+
+    // pixel spacing when exporting the images
+    double aSpacing_[6];
+
+    // field of view in mm
+    double FOV_RO_;
+    double FOV_E1_;
+    double FOV_E2_;
+
+    // debug folder (name and resolved full path)
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // debug folder 2 (name and resolved full path); the sendOutRecon* functions
+    // export the images they send to this folder when the full path is not empty
+    std::string debugFolder2_;
+    std::string debugFolder2_fullPath_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+    // whether to recon kspace
+    bool recon_kspace_needed_;
+
+    // parameters for gt-plus recon
+    Gadgetron::gtPlus::gtPlusReconWorkOrderPara workOrderPara_;
+
+    // --------------------------------------------------
+    // utility functions
+    // --------------------------------------------------
+
+    // generate the debug folder path
+    // debugFolderPath = ${GADGETRON_HOME}/debugFolder
+    virtual bool generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath);
+
+    // get the current moment
+    void getCurrentMoment(std::string& procTime);
+
+    // compute image number using ICE way
+    int computeSeriesImageNumber (ISMRMRD::ImageHeader& imheader, size_t nCHA=1, size_t cha=0, size_t nE2=1, size_t e2=0);
+
+    // to handle partial fourier, add pre or post zeros
+    // PrePostZeros: 0 no zeros; 1 pre zeros; 2 post zeros
+    bool addPrePostZeros(int centreNo, int sampleNo, int& PrePostZeros);
+
+    // find the dimension index
+    bool findStartingDimIndex(const std::vector<DimensionRecordType>& dimStartingIndexes, Gadgetron::gtPlus::ISMRMRDDIM& dim, size_t ind);
+
+    // scale the magnitude images
+    bool scalingMagnitude(hoNDArray<float>& mag);
+
+    // recompute the image geometry parameters if the recon FOV is different from encoding FOV
+    bool recomputeImageGeometry(GtPlusGadgetImageArray* images, GtPlusGadgetImageExt& imageHeader, int slc, int e2, int con, int phs, int rep, int set, int seg, int maxE2);
+
+    // send out the recon results
+    // sendOutReconMag sends float magnitude data; sendOutRecon takes complex data,
+    // converts it to scaled magnitude and calls sendOutReconMag
+    virtual bool sendOutReconMag(GtPlusGadgetImageArray* images, const hoNDArray<float>& res, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix);
+    virtual bool sendOutRecon(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, const std::vector<DimensionRecordType>& dimStartingIndexes, const std::string& prefix);
+
+    // special sending function for the interactive cases
+    // a single 2D image is sent with the given series and image number
+    virtual bool sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<float>& res, int seriesNum, int imageNum);
+    virtual bool sendOutRecon2D(GtPlusGadgetImageArray* images, const hoNDArray<ValueType>& res, int seriesNum, int imageNum);
+
+    // compute the kspace filter
+    bool generateKSpaceFilter(WorkOrderType& workOrder);
+
+protected:
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< GtPlusGadgetImageArray >* m1, Gadgetron::GadgetContainerMessage< WorkOrderType > * m2);
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // parse the cloud file if any
+    virtual bool parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud);
+
+public:
+
+    // --------------------------------------------------
+    // variables used for data buffer and processing
+    // --------------------------------------------------
+
+    // dimension of incoming array
+    std::vector<size_t> dimensions_;
+
+    // number of acquisition channels
+    size_t num_acq_channels_;
+
+    // encoding matrix size (the real sampled size)
+    size_t matrix_size_encoding_[3];
+
+    // encoding field of view [mm]
+    float field_of_view_encoding_[3];
+
+    // recon matrix size (the final image size)
+    size_t matrix_size_recon_[3];
+
+    // recon field of view [mm]
+    float field_of_view_recon_[3];
+
+    // number of E1/E2 after zero-filling resize
+    size_t reconE1_;
+    size_t reconE2_;
+
+    // acceleration factor
+    double acceFactorE1_;
+    double acceFactorE2_;
+
+    // calibration mode
+    Gadgetron::gtPlus::ISMRMRDCALIBMODE CalibMode_;
+    Gadgetron::gtPlus::ISMRMRDDIM InterleaveDim_;
+
+    // acquired max indexes
+    size_t kSpaceMaxAcqE1No_;
+    size_t kSpaceMaxAcqE2No_;
+
+    // number of times the process function is called
+    unsigned int processed_called_times_;
+
+    // kspace filter for RO/E1/E2
+    // for the partial fourier, zero-padding resize or asymmetric echo
+    // if the kspace filter is not selected, the default filter will be used anyway
+
+    // kspace filter
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterRO_type_;
+    double filterRO_sigma_;
+    double filterRO_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE1_type_;
+    double filterE1_sigma_;
+    double filterE1_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE2_type_;
+    double filterE2_sigma_;
+    double filterE2_width_;
+
+    // ref data filter
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterRO_ref_type_;
+    double filterRO_ref_sigma_;
+    double filterRO_ref_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE1_ref_type_;
+    double filterE1_ref_sigma_;
+    double filterE1_ref_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE2_ref_type_;
+    double filterE2_ref_sigma_;
+    double filterE2_ref_width_;
+
+    // partial fourier filter
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterRO_pf_type_;
+    double filterRO_pf_sigma_;
+    double filterRO_pf_width_;
+    bool filterRO_pf_densityComp_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE1_pf_type_;
+    double filterE1_pf_sigma_;
+    double filterE1_pf_width_;
+    bool filterE1_pf_densityComp_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE2_pf_type_;
+    double filterE2_pf_sigma_;
+    double filterE2_pf_width_;
+    bool filterE2_pf_densityComp_;
+
+    /// cloud related definition
+    bool CloudComputing_;
+    unsigned int CloudSize_;
+
+    CloudType gt_cloud_;
+
+    // cloud node file
+    std::string cloud_node_file_;
+
+    // encoding space size
+    ISMRMRD::EncodingCounters meas_max_idx_;
+
+    // gtPlus utility objects
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+
+    // memory manager
+    boost::shared_ptr<Gadgetron::gtPlus::gtPlusMemoryManager> mem_manager_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp b/gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp
new file mode 100644
index 0000000..298ac69
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadget.cpp
@@ -0,0 +1,234 @@
+
+#include "GtPlusReconJob2DTGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+// Default state: debug output folder "DebugOutput", timing enabled, verbose
+// logging disabled, process_config() not yet run. The initializers are listed
+// in the order the members are declared in the class.
+GtPlusReconJob2DTGadget::GtPlusReconJob2DTGadget()
+    : debugFolder_("DebugOutput")
+    , performTiming_(true)
+    , process_config_called_(false)
+    , verboseMode_(false)
+    , mem_manager_(new Gadgetron::gtPlus::gtPlusMemoryManager(4, 640*1024*1024))
+{
+    // the timers must not report when they are destroyed
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    // set up the OpenMP / MKL runtime
+    Gadgetron::prepOpenMP();
+    Gadgetron::prepMKL();
+}
+
+// No explicit cleanup is performed here.
+GtPlusReconJob2DTGadget::~GtPlusReconJob2DTGadget()
+{
+}
+
+// Load the gadget parameters "debugFolder" and "performTiming" into
+// debugFolder_ and performTiming_, echoing them when verboseMode_ is set.
+// Returns true on success and false if any exception is raised while reading.
+bool GtPlusReconJob2DTGadget::readParameters()
+{
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlusReconJob2DTGadget parameters <------");
+
+        // name of the debug output folder
+        debugFolder_ = *(this->get_string_value("debugFolder"));
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        // timing switch
+        performTiming_ = this->get_bool_value("performTiming");
+        GADGET_CONDITION_MSG(verboseMode_, "performTiming_ is " << performTiming_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob2DTGadget::readParameters() ... ");
+    }
+
+    return false;
+}
+
+// Configure the gadget: read the parameters, resolve the debug folder path,
+// grow the memory manager and hand it to the four recon workers.
+//
+// mb is not used by this implementation; process() calls this function
+// explicitly with its first message.
+//
+// Returns GADGET_OK on success and GADGET_FAIL when reading the parameters or
+// generating the debug folder path fails.
+int GtPlusReconJob2DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    // generate the destination folder
+    if ( !debugFolder_.empty() )
+    {
+        // This function returns int and process() treats 0 as success, so a
+        // failure must be reported as GADGET_FAIL; returning false here would
+        // look like success to the caller.
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder_, debugFolder_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Pre-allocate memory ... ", performTiming_);
+    mem_manager_->increase(4.0*1024*1024*1024);
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    // all workers share the same memory manager
+    worker_grappa_.gtPlus_mem_manager_ = mem_manager_;
+    worker_noacceleration_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_L1_ncg_.gtPlus_mem_manager_ = mem_manager_;
+
+    return GADGET_OK;
+}
+
+// Process one 2DT recon job.
+//
+// m1 carries the job id, m2 the job itself. On the first call process_config()
+// is run explicitly. The job is reconstructed with worker_spirit_L1_ncg_, the
+// result is optionally exported to the debug folder, the job inputs (kspace,
+// kernel, coil map) are cleared and the job is returned through sendOutJob().
+// When the reconstruction fails, complexIm and res are cleared as well before
+// the job is sent back.
+//
+// Returns GADGET_OK on success, after releasing m1; GADGET_FAIL when
+// process_config() or sendOutJob() fails (m1 is not released here in that case).
+int GtPlusReconJob2DTGadget::process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2)
+{
+    // because the parameter configuration will not be sent, we need to call process_config explicitly
+    if ( !process_config_called_ )
+    {
+        GADGET_CHECK_RETURN( (this->process_config(m1)==0), GADGET_FAIL);
+        process_config_called_ = true;
+    }
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob2DTGadget::process(...) starts ... ");
+
+    int* jobID = m1->getObjectPtr();
+    GADGET_CONDITION_MSG(verboseMode_, "--> arriving job : " << *jobID << " ... ");
+
+    GtPlusReconJobTypeCPFL* job = m2->getObjectPtr();
+    GADGET_CONDITION_MSG(verboseMode_, "    job array size : [ " << job->kspace.get_size(0) << " " 
+                                                                 << job->kspace.get_size(1) << " " 
+                                                                 << job->kspace.get_size(2) << " " 
+                                                                 << job->kspace.get_size(3) << " ] ... ");
+
+    // set the worker
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( verboseMode_ )
+    {
+        job->workOrder2DT.print(std::cout);
+    }
+
+    bool succeed = true;
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Recon 2DT job ... ", performTiming_);
+
+    succeed = worker_spirit_L1_ncg_.performUnwarppingImpl(*job);
+
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    // export the results
+    if ( !debugFolder_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "ReconJob2DT_ID" << *jobID;
+
+        // work on a copy so that squeeze() does not alter the result sent back
+        hoNDArray<GT_Complex8> res = job->res;
+        res.squeeze();
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, res, ostr.str());
+    }
+
+    // clean the kspace and ker and coil map
+    job->kspace.clear();
+    job->ker.clear();
+    // only touch the coil map when one is actually attached; dereferencing an
+    // unset pointer here would be invalid
+    if ( job->workOrder2DT.coilMap_ ) job->workOrder2DT.coilMap_->clear();
+
+    if ( !succeed )
+    {
+        job->complexIm.clear();
+        job->res.clear();
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutJob(*jobID, job), GADGET_FAIL);
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob2DTGadget::process(...) ends ... ");
+
+    m1->release();
+
+    return GADGET_OK;
+}
+
+// Return a completed job to the stream controller.
+//
+// Builds the message chain [GADGET_MESSAGE_CLOUD_JOB id] -> [jobID] -> [copy of
+// *job] and passes it to controller_->output_ready().
+//
+// jobID : id written into the second message
+// job   : job whose content is copied into the third message
+//
+// Returns false when no controller is set, when output_ready() reports an
+// error, or when an exception is caught; true otherwise. If an exception is
+// raised before the chain has been handed to the controller, the partially
+// built chain is released.
+bool GtPlusReconJob2DTGadget::
+sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job)
+{
+    // head of the outgoing chain; non-null only while it is still owned here
+    GadgetContainerMessage<GadgetMessageIdentifier>* mb = 0;
+
+    try
+    {
+        ACE_DEBUG( (LM_INFO, ACE_TEXT("GtPlusReconJob2DTGadget sendOutJob ... ")) );
+
+        if (!this->controller_)
+        {
+            ACE_DEBUG( (LM_DEBUG, ACE_TEXT("Cannot return result to controller, no controller set")) );
+            return false;
+        }
+
+        mb = new GadgetContainerMessage<GadgetMessageIdentifier>();
+        mb->getObjectPtr()->id = GADGET_MESSAGE_CLOUD_JOB;
+
+        // each message is chained as soon as it exists, so that releasing mb
+        // releases everything allocated so far
+        GadgetContainerMessage<int>* m1 = new GadgetContainerMessage<int>();
+        mb->cont(m1);
+        *(m1->getObjectPtr()) = jobID;
+
+        GadgetContainerMessage<GtPlusReconJobTypeCPFL>* m2 = new GadgetContainerMessage<GtPlusReconJobTypeCPFL>();
+        m1->cont(m2);
+
+        // copying the job may throw (e.g. on allocation failure)
+        *(m2->getObjectPtr()) = *job;
+
+        // From here on the chain is treated as handed over; what output_ready()
+        // does with it on failure is not visible from this file, so it is not
+        // released below.
+        GadgetContainerMessage<GadgetMessageIdentifier>* outgoing = mb;
+        mb = 0;
+
+        int ret =  this->controller_->output_ready(outgoing);
+        if (ret < 0)
+        {
+            GADGET_DEBUG1("Failed to return GtPlusReconJob2DTGadget job massage to controller\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        if ( mb )
+        {
+            mb->release();
+        }
+
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob2DTGadget::sendOutJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Build the full debug folder path: ${GADGETRON_HOME}/<debugFolder>/
+//
+// debugFolder     : folder name appended to GADGETRON_HOME
+// debugFolderPath : receives the resulting path; cleared on failure
+//
+// Returns false when the GADGETRON_HOME environment variable is not set,
+// true otherwise.
+bool GtPlusReconJob2DTGadget::
+    generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath)
+{
+    // getenv yields a null pointer for an unset variable, and a null pointer
+    // must never be assigned to a std::string
+    const char* gadgetronHome = ACE_OS::getenv("GADGETRON_HOME");
+    if ( !gadgetronHome )
+    {
+        debugFolderPath.clear();
+        GADGET_ERROR_MSG("GtPlusReconJob2DTGadget::generateDebugFolderPath(...) : GADGETRON_HOME is not set ... ");
+        return false;
+    }
+
+    debugFolderPath = gadgetronHome;
+    debugFolderPath.append("/");
+    debugFolderPath.append(debugFolder);
+    debugFolderPath.append("/");
+    GADGET_CONDITION_MSG(verboseMode_, "Debug folder is " << debugFolderPath);
+    return true;
+}
+
+// Write the current local time into procTime, formatted as
+// "_<weekday>_<day>_<month>_<year>_<hour>_<minute>_<second>".
+void GtPlusReconJob2DTGadget::
+    getCurrentMoment(std::string& procTime)
+{
+    time_t now = time(NULL);
+    struct tm* localNow = localtime(&now);
+
+    char buffer[100];
+    strftime(buffer, sizeof(buffer),"_%a_%d_%b_%Y_%H_%M_%S",localNow);
+
+    procTime = buffer;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconJob2DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadget.h b/gadgets/gtPlus/GtPlusReconJob2DTGadget.h
new file mode 100644
index 0000000..654e16e
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadget.h
@@ -0,0 +1,117 @@
+/** \file   GtPlusReconJob2DTGadget.h
+    \brief  This is a cloud gadget performing the computation for 2DT job data package.
+
+            This gadget can either serve as the working gadget for the single layer cloud, or it can work as the 
+            second layer gadget for the dual layer cloud.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+            Magnetic Resonance in Medicine on Dec 2013.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+#include "gtPlusMemoryManager.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+// Gadget receiving a job id (int, first message) and a 2DT recon job
+// (GtPlusReconJobTypeCPFL, second message). It holds one worker per recon
+// method plus the shared memory manager, timers and debug exporter.
+class EXPORTGTPLUSGADGET GtPlusReconJob2DTGadget : public Gadgetron::Gadget2< int, GtPlusReconJobTypeCPFL >
+{
+public:
+    GADGET_DECLARE(GtPlusReconJob2DTGadget);
+
+    // pixel type of the data being reconstructed
+    typedef std::complex<float> ValueType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    typedef Gadget2< int, GtPlusReconJobTypeCPFL > BaseClass;
+
+    GtPlusReconJob2DTGadget();
+    ~GtPlusReconJob2DTGadget();
+
+    // debug folder (name and resolved full path)
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+protected:
+
+    // --------------------------------------------------
+    // utility functions
+    // --------------------------------------------------
+
+    // generate the debug folder path
+    // debugFolderPath = ${GADGETRON_HOME}/debugFolder
+    virtual bool generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath);
+
+    // get the current moment
+    void getCurrentMoment(std::string& procTime);
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function
+    // m1 carries the job id, m2 the job to reconstruct
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2);
+
+    // process config is only to be called once
+    bool process_config_called_;
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // send the completed job
+    bool sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job);
+
+    // worker
+    // one worker per recon method; all of them share mem_manager_
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // gtPlus utility objects
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+
+    // memory manager
+    boost::shared_ptr<Gadgetron::gtPlus::gtPlusMemoryManager> mem_manager_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp
new file mode 100644
index 0000000..10b6220
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.cpp
@@ -0,0 +1,790 @@
+
+#include "GtPlusReconJob2DTGadgetCloud.h"
+#include "GtPlusGadgetOpenMP.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+GtPlusReconJob2DTGadgetCloud::GtPlusReconJob2DTGadgetCloud() : mem_manager_(new Gadgetron::gtPlus::gtPlusMemoryManager(4, 640*1024*1024))
+{
+    // general switches; the debug folder name and the timing flag are overwritten later by readParameters()
+    debugFolder_ = "DebugOutput";
+    performTiming_ = true;
+    verboseMode_ = false;
+    process_config_called_ = false;
+
+    // timers: switch off the timing-in-destruction option
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    // default kspace filter types: Gaussian for the image data ...
+    filterRO_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterE1_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    filterE2_type_ = ISMRMRD_FILTER_GAUSSIAN;
+    // ... and Hanning for the reference data (ref) and the partial fourier handling (pf)
+    filterRO_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterE1_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterE2_ref_type_ = ISMRMRD_FILTER_HANNING;
+    filterRO_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterE1_pf_type_ = ISMRMRD_FILTER_HANNING;
+    filterE2_pf_type_ = ISMRMRD_FILTER_HANNING;
+
+    // every filter starts with the same sigma
+    filterRO_sigma_ = 1.5;
+    filterE1_sigma_ = 1.5;
+    filterE2_sigma_ = 1.5;
+    filterRO_ref_sigma_ = 1.5;
+    filterE1_ref_sigma_ = 1.5;
+    filterE2_ref_sigma_ = 1.5;
+    filterRO_pf_sigma_ = 1.5;
+    filterE1_pf_sigma_ = 1.5;
+    filterE2_pf_sigma_ = 1.5;
+
+    // ... and the same width (used later as a fraction of the matrix size, see generateKSpaceFilter)
+    filterRO_width_ = 0.15;
+    filterE1_width_ = 0.15;
+    filterE2_width_ = 0.15;
+    filterRO_ref_width_ = 0.15;
+    filterE1_ref_width_ = 0.15;
+    filterE2_ref_width_ = 0.15;
+    filterRO_pf_width_ = 0.15;
+    filterE1_pf_width_ = 0.15;
+    filterE2_pf_width_ = 0.15;
+
+    // density compensation of the partial fourier filters is off by default
+    filterRO_pf_densityComp_ = false;
+    filterE1_pf_densityComp_ = false;
+    filterE2_pf_densityComp_ = false;
+
+    // runtime preparation helpers for OpenMP and MKL
+    Gadgetron::prepOpenMP();
+    Gadgetron::prepMKL();
+}
+
+GtPlusReconJob2DTGadgetCloud::~GtPlusReconJob2DTGadgetCloud()
+{
+    // intentionally empty: no explicit cleanup is performed here
+}
+
+bool GtPlusReconJob2DTGadgetCloud::readParameters()
+{ // Reads all gadget properties into the member settings; returns false if any exception is thrown on the way.
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlusReconJob2DTGadgetCloud parameters <------");
+
+        boost::shared_ptr<std::string> str = this->get_string_value("debugFolder");
+        debugFolder_ = *str;
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        str = this->get_string_value("debugFolder2");
+        debugFolder2_ = *str;
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder2_ is " << debugFolder2_);
+
+        str = this->get_string_value("cloudNodeFile");
+        cloud_node_file_ = *str;
+        GADGET_CONDITION_MSG(verboseMode_, "cloud_node_file_ is " << cloud_node_file_);
+
+        performTiming_ = this->get_bool_value("performTiming");
+        GADGET_CONDITION_MSG(verboseMode_, "performTiming_ is " << performTiming_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+
+        // kspace filter parameters
+        str = this->get_string_value("filterRO");
+        filterRO_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_sigma_ = this->get_double_value("filterRO_sigma");
+        filterRO_width_ = this->get_double_value("filterRO_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_sigma_ is " << filterRO_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_width_ is " << filterRO_width_);
+
+        str = this->get_string_value("filterE1");
+        filterE1_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_sigma_ = this->get_double_value("filterE1_sigma");
+        filterE1_width_ = this->get_double_value("filterE1_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_sigma_ is " << filterE1_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_width_ is " << filterE1_width_);
+
+        str = this->get_string_value("filterE2");
+        filterE2_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_sigma_ = this->get_double_value("filterE2_sigma");
+        filterE2_width_ = this->get_double_value("filterE2_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_sigma_ is " << filterE2_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_width_ is " << filterE2_width_);
+        // kspace filter parameters for the reference data (properties named filterRef*)
+        str = this->get_string_value("filterRefRO");
+        filterRO_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_ref_sigma_ = this->get_double_value("filterRefRO_sigma");
+        filterRO_ref_width_ = this->get_double_value("filterRefRO_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_ref_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_ref_sigma_ is " << filterRO_ref_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_ref_width_ is " << filterRO_ref_width_);
+
+        str = this->get_string_value("filterRefE1");
+        filterE1_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_ref_sigma_ = this->get_double_value("filterRefE1_sigma");
+        filterE1_ref_width_ = this->get_double_value("filterRefE1_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_ref_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_ref_sigma_ is " << filterE1_ref_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_ref_width_ is " << filterE1_ref_width_);
+
+        str = this->get_string_value("filterRefE2");
+        filterE2_ref_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_ref_sigma_ = this->get_double_value("filterRefE2_sigma");
+        filterE2_ref_width_ = this->get_double_value("filterRefE2_width");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_ref_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_ref_sigma_ is " << filterE2_ref_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_ref_width_ is " << filterE2_ref_width_);
+        // kspace filter parameters for the partial fourier handling (properties named filterPartialFourier*)
+        str = this->get_string_value("filterPartialFourierRO");
+        filterRO_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterRO_pf_sigma_ = this->get_double_value("filterPartialFourierRO_sigma");
+        filterRO_pf_width_ = this->get_double_value("filterPartialFourierRO_width");
+        filterRO_pf_densityComp_ = this->get_bool_value("filterPartialFourierRO_densityComp");
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_sigma_ is " << filterRO_pf_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_width_ is " << filterRO_pf_width_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterRO_pf_densityComp_ is " << filterRO_pf_densityComp_);
+
+        str = this->get_string_value("filterPartialFourierE1");
+        filterE1_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE1_pf_sigma_ = this->get_double_value("filterPartialFourierE1_sigma");
+        filterE1_pf_width_ = this->get_double_value("filterPartialFourierE1_width");
+        filterE1_pf_densityComp_ = this->get_bool_value("filterPartialFourierE1_densityComp");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_sigma_ is " << filterE1_pf_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_width_ is " << filterE1_pf_width_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE1_pf_densityComp_ is " << filterE1_pf_densityComp_);
+
+        str = this->get_string_value("filterPartialFourierE2");
+        filterE2_pf_type_ = gtPlus_util_.getISMRMRDKSpaceFilterFromName(*str);
+        filterE2_pf_sigma_ = this->get_double_value("filterPartialFourierE2_sigma");
+        filterE2_pf_width_ = this->get_double_value("filterPartialFourierE2_width");
+        filterE2_pf_densityComp_ = this->get_bool_value("filterPartialFourierE2_densityComp");
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_type_ is " << *str);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_sigma_ is " << filterE2_pf_sigma_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_width_ is " << filterE2_pf_width_);
+        GADGET_CONDITION_MSG(verboseMode_, "filterE2_pf_densityComp_ is " << filterE2_pf_densityComp_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // job splitting parameters (copied into the work order by process())
+        job_split_by_S_ = this->get_bool_value("job_split_by_S");
+        job_num_of_N_ = (size_t)(this->get_int_value("job_num_of_N"));
+        job_max_Megabytes_ = (size_t)(this->get_int_value("job_max_Megabytes"));
+        job_overlap_ = (size_t)(this->get_int_value("job_overlap"));
+
+        GADGET_CONDITION_MSG(verboseMode_, "job_split_by_S_ is " << job_split_by_S_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_num_of_N_ is " << job_num_of_N_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_max_Megabytes_ is " << job_max_Megabytes_);
+        GADGET_CONDITION_MSG(verboseMode_, "job_overlap_ is " << job_overlap_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        // cloud computing parameters
+        CloudComputing_ = this->get_bool_value("CloudComputing");
+        CloudSize_ = (unsigned int)(this->get_int_value("CloudSize"));
+
+        GADGET_CONDITION_MSG(verboseMode_, "CloudComputing_ is " << CloudComputing_);
+        GADGET_CONDITION_MSG(verboseMode_, "CloudSize_ is " << CloudSize_);
+
+        // read in the cloud information for every node
+        gt_cloud_.resize(CloudSize_);
+        // node ii is described by the properties CloudNode<ii>_IP, _Port, _XMLConfiguration and _ComputingPowerIndex
+        for ( unsigned int ii=0; ii<CloudSize_; ii++ )
+        {
+            std::ostringstream ostreamstr1;
+            ostreamstr1 << "CloudNode" << ii << "_IP" << std::ends; // std::ends appends '\0'; c_str() below stops there, so the name is unaffected
+            boost::shared_ptr<std::string> IP = this->get_string_value(ostreamstr1.str().c_str());
+            gt_cloud_[ii].get<0>() = *IP;
+
+            std::ostringstream ostreamstr2;
+            ostreamstr2 << "CloudNode" << ii << "_Port" << std::ends;
+            boost::shared_ptr<std::string> Port = this->get_string_value(ostreamstr2.str().c_str());
+            gt_cloud_[ii].get<1>() = *Port;
+
+            std::ostringstream ostreamstr3;
+            ostreamstr3 << "CloudNode" << ii << "_XMLConfiguration" << std::ends;
+            boost::shared_ptr<std::string> xmlName = this->get_string_value(ostreamstr3.str().c_str());
+            gt_cloud_[ii].get<2>() = *xmlName;
+
+            std::ostringstream ostreamstr4;
+            ostreamstr4 << "CloudNode" << ii << "_ComputingPowerIndex" << std::ends;
+            unsigned int computingPowerIndex = this->get_int_value(ostreamstr4.str().c_str());
+            gt_cloud_[ii].get<3>() = computingPowerIndex;
+
+            GADGET_CONDITION_MSG(verboseMode_, "Cloud Node " << ii << " : " << gt_cloud_[ii]);
+        }
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+    }
+    catch(...) // any exception raised while reading is reported as a failed read
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob2DTGadgetCloud::readParameters() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+int GtPlusReconJob2DTGadgetCloud::process_config(ACE_Message_Block* mb)
+{
+    // Reads the gadget properties, resolves the debug folder paths and hands the shared memory manager to the workers.
+    // Returns GADGET_OK on success and GADGET_FAIL if the parameters or a debug folder path cannot be set up.
+    // dimension order of the data arrays: [Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = index 0 ... 9
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    // generate the destination folder; report a failure as GADGET_FAIL, because returning 'false' (0) from this int function reads as success
+    if ( !debugFolder_.empty() )
+    {
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder_, debugFolder_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    if ( !debugFolder2_.empty() )
+    {
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder2_, debugFolder2_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder2 is not set ...");
+    }
+
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Pre-allocate memory ... ", performTiming_);
+    mem_manager_->increase(2.0*1024*1024*1024); // 2.0*1024^3 bytes = 2 GB; presumably grows the pre-allocated pool by that amount - confirm in gtPlusMemoryManager
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    worker_grappa_.gtPlus_mem_manager_ = mem_manager_;
+    worker_noacceleration_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_L1_ncg_.gtPlus_mem_manager_ = mem_manager_;
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::setWorkOrder2DTParameters(GtPlusRecon2DTPara& para, WorkOrder2DTType* workOrder)
+{
+    // Copies the 2DT recon settings carried by the cloud package parameters into the work order.
+    // The work order pointer is dereferenced unconditionally; the function always returns true.
+    WorkOrder2DTType& order = *workOrder;
+    order.recon_kspace_needed_ = para.recon_kspace_needed_;
+
+    // coil compression is switched on whenever a threshold or a number of kept modes is requested
+    order.coil_compression_ = ( para.workOrderPara_.coil_compression_thres_>0 ) || ( para.workOrderPara_.coil_compression_num_modesKept_>0 );
+    order.same_coil_compression_coeff_allS_ = para.same_coil_compression_coeff_allS_;
+
+    // settings prefixed embedded_
+    order.embedded_averageall_ref_ = para.embedded_averageall_ref_;
+    order.embedded_ref_numOfModes_ = para.embedded_ref_numOfModes_;
+    order.embedded_fullres_coilmap_ = para.embedded_fullres_coilmap_;
+    order.embedded_fullres_coilmap_useHighestSignal_ = para.embedded_fullres_coilmap_useHighestSignal_;
+    order.embedded_same_combinationcoeff_allS_ = para.embedded_same_combinationcoeff_allS_;
+    order.embedded_whichS_combinationcoeff_ = para.embedded_whichS_combinationcoeff_;
+    order.embedded_ref_fillback_ = para.embedded_ref_fillback_;
+
+    // settings prefixed separate_
+    order.separate_averageall_ref_ = para.separate_averageall_ref_;
+    order.separate_ref_numOfModes_ = para.separate_ref_numOfModes_;
+    order.separate_fullres_coilmap_ = para.separate_fullres_coilmap_;
+    order.separate_same_combinationcoeff_allS_ = para.separate_same_combinationcoeff_allS_;
+    order.separate_whichS_combinationcoeff_ = para.separate_whichS_combinationcoeff_;
+
+    // settings prefixed interleaved_
+    order.interleaved_same_combinationcoeff_allS_ = para.interleaved_same_combinationcoeff_allS_;
+    order.interleaved_whichS_combinationcoeff_ = para.interleaved_whichS_combinationcoeff_;
+    order.interleaved_ref_numOfModes_ = para.interleaved_ref_numOfModes_;
+
+    // settings prefixed no_acceleration_
+    order.no_acceleration_averageall_ref_ = para.no_acceleration_averageall_ref_;
+    order.no_acceleration_ref_numOfModes_ = para.no_acceleration_ref_numOfModes_;
+    order.no_acceleration_same_combinationcoeff_allS_ = para.no_acceleration_same_combinationcoeff_allS_;
+    order.no_acceleration_whichS_combinationcoeff_ = para.no_acceleration_whichS_combinationcoeff_;
+
+    return true;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud)
+{
+    // Reads ${GADGETRON_HOME}/config/gtCloud/<filename> into gtCloud. Whitespace separated layout:
+    //     <control node host> <control node port> <number of nodes>, then per node <host> <port> <xml configuration> <computing power index>
+    // Returns false and leaves gtCloud untouched if GADGETRON_HOME is unset or the file cannot be opened or parsed completely.
+    const char* home = ACE_OS::getenv("GADGETRON_HOME");
+    if ( home == NULL ) // building a std::string from a NULL pointer is undefined behaviour
+    {
+        GADGET_WARN_MSG("GADGETRON_HOME is not set; use the local setting instead ... ");
+        return false;
+    }
+    std::string nodeFileName(home);
+    nodeFileName.append("/config/gtCloud/");
+    nodeFileName.append(filename);
+    GADGET_CONDITION_MSG(verboseMode_, "Cloud node file name is " << nodeFileName);
+
+    std::ifstream fs(nodeFileName.c_str(), std::ios::in);
+    if (!fs.is_open())
+    {
+        GADGET_WARN_MSG("Cannot open GT CloudNodeFile; use the local setting instead ... ");
+        return false;
+    }
+    // control node hostname and port (read, but not used in this function), then the number of GadgetLevel nodes
+    std::string controlNode, portControlNode;
+    unsigned int num = 0;
+    fs >> controlNode >> portControlNode >> num;
+    if ( !fs )
+    {
+        GADGET_WARN_MSG("Cannot parse the header of GT CloudNodeFile; use the local setting instead ... ");
+        return false;
+    }
+    // fill a local list first, so that a truncated or malformed file cannot leave gtCloud half filled
+    CloudType nodes;
+    nodes.resize(num);
+    for ( unsigned int n=0; n<num; n++ )
+    {
+        std::string gadgetNode, portGadgetNode, xmlGadgetNode;
+        unsigned int computingPowerIndex = 0;
+        fs >> gadgetNode >> portGadgetNode >> xmlGadgetNode >> computingPowerIndex;
+        if ( !fs )
+        {
+            GADGET_WARN_MSG("Cannot parse a node entry of GT CloudNodeFile; use the local setting instead ... ");
+            return false;
+        }
+        nodes[n].get<0>() = gadgetNode;
+        nodes[n].get<1>() = portGadgetNode;
+        nodes[n].get<2>() = xmlGadgetNode;
+        nodes[n].get<3>() = computingPowerIndex;
+        GADGET_CONDITION_MSG(verboseMode_, "Gadget Node " << n << " : " << nodes[n]); // log the entry just parsed, not the member gt_cloud_
+    }
+
+    gtCloud = nodes;
+    return true;
+}
+
+int GtPlusReconJob2DTGadgetCloud::process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusRecon2DTCloudPackageCPFL > * m2)
+{
+    // Reconstructs one 2DT job package on this node and returns the result through sendOutJob().
+    // m1 carries the job id, m2 the job package (kspace, ref and the recon parameters).
+    // On the first call the configuration is read via process_config(); if cloud computing is enabled,
+    // the node list is then taken from the cloud node file. Returns GADGET_OK or GADGET_FAIL.
+    // because the parameter configuration will not be sent, we need to call process_config explicitly
+    if ( !process_config_called_ )
+    {
+        GADGET_CHECK_RETURN( (this->process_config(m1)==0), GADGET_FAIL);
+        process_config_called_ = true;
+
+        if ( CloudComputing_ )
+        {
+            bool parseSuccess = this->parseGTCloudNodeFile(cloud_node_file_, gt_cloud_);
+            if ( parseSuccess )
+            {
+                CloudComputing_ = true;
+                CloudSize_ = gt_cloud_.size();
+
+                if ( CloudSize_ == 0 )
+                {
+                    CloudComputing_ = false;
+                    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob2DTGadgetCloud : cannot find algorithm nodes ... ");
+                }
+            }
+        }
+    }
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob2DTGadgetCloud::process(...) starts ... ");
+
+    int* jobID = m1->getObjectPtr();
+    GADGET_CONDITION_MSG(verboseMode_, "--> arriving job : " << *jobID << " ... ");
+
+    GtPlusRecon2DTCloudPackageCPFL* job = m2->getObjectPtr();
+
+    if ( verboseMode_ )
+    {
+        // print the dimensions the array really has; indexing dims[0..9] blindly is undefined for arrays with fewer dimensions
+        boost::shared_ptr< std::vector<size_t> > dims = job->kspace.get_dimensions();
+        std::ostringstream dimStr;
+        for ( size_t d=0; d<dims->size(); d++ )
+        {
+            if ( d > 0 ) dimStr << " ";
+            dimStr << (*dims)[d];
+        }
+        GADGET_CONDITION_MSG(verboseMode_, "job array size : [Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" << dimStr.str() << "]");
+    }
+
+    GtPlusRecon2DTPara& para = job->para;
+
+    // ---------------------------------------------------------
+    // set the work flow
+    // ---------------------------------------------------------
+    workflow_.reconSizeRO_ = para.reconSizeRO_;
+    workflow_.reconSizeE1_ = para.reconSizeE1_;
+    workflow_.reconSizeE2_ = para.reconSizeE2_;
+    workflow_.encodingFOV_RO_ = para.encodingFOV_RO_;
+    workflow_.encodingFOV_E1_ = para.encodingFOV_E1_;
+    workflow_.encodingFOV_E2_ = para.encodingFOV_E2_;
+    workflow_.reconFOV_RO_ = para.reconFOV_RO_;
+    workflow_.reconFOV_E1_ = para.reconFOV_E1_;
+    workflow_.reconFOV_E2_ = para.reconFOV_E2_;
+
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = para.dim_4th_;
+    workflow_.dim5th_ = para.dim_5th_;
+    workflow_.WorkOrderShareDim_ = para.workOrder_ShareDim_;
+    workflow_.performTiming_ = performTiming_;
+
+    // ---------------------------------------------------------
+    // set work order
+    // ---------------------------------------------------------
+    WorkOrder2DTType workOrder;
+
+    workOrder.copyFromPara(para.workOrderPara_);
+
+    workOrder.job_split_by_S_ = job_split_by_S_;
+    workOrder.job_num_of_N_ = job_num_of_N_;
+    workOrder.job_max_Megabytes_ = job_max_Megabytes_;
+    workOrder.job_overlap_ = job_overlap_;
+
+    workOrder.CloudComputing_ = CloudComputing_;
+    workOrder.CloudSize_ = CloudSize_;
+    workOrder.gt_cloud_ = gt_cloud_;
+
+    workOrder.data_ = job->kspace;
+    workOrder.ref_ = job->ref;
+
+    // ---------------------------------------------------------
+    // set the worker
+    // ---------------------------------------------------------
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( !debugFolder_fullPath_.empty() ) workflow_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( verboseMode_ )
+    {
+        workOrder.print(std::cout);
+    }
+
+    // perform the recon
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Recon 2DT workorder on cloud node ... ", performTiming_);
+
+    GADGET_CHECK_RETURN(this->generateKSpaceFilter(workOrder), GADGET_FAIL);
+
+    workOrder.duplicate(workOrder_recon_);
+    setWorkOrder2DTParameters(para, &workOrder_recon_);
+
+    workflow_.workOrder_ = &workOrder_recon_;
+    if ( verboseMode_ )
+    {
+        workflow_.workOrder_->print(std::cout);
+    }
+
+    workflow_.setDataArray(workOrder.data_);
+
+    if ( workOrder.ref_.get_number_of_elements() > 0 )
+    {
+        workflow_.setRefArray(workOrder.ref_);
+    }
+    else if ( para.workOrderPara_.CalibMode_==Gadgetron::gtPlus::ISMRMRD_interleaved )
+    {
+        workOrder.ref_ = workOrder.data_;
+        workflow_.setRefArray(workOrder.ref_);
+    }
+
+    // set the work flow for worker and workOrder
+    if ( workOrder.acceFactorE1_ > 1 )
+    {
+        if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_;
+        }
+        else if ( para.workOrderPara_.recon_algorithm_ == Gadgetron::gtPlus::ISMRMRD_L1SPIRIT )
+        {
+            workflow_.worker_ = &worker_spirit_L1_ncg_;
+        }
+        else
+        {
+            workflow_.worker_ = &worker_grappa_;
+        }
+    }
+    else
+    {
+        workflow_.worker_ = &worker_noacceleration_;
+    }
+
+    bool succeed = true;
+    succeed = workflow_.preProcessing();
+    if ( succeed )
+    {
+        succeed = workflow_.recon();
+        if ( succeed )
+        {
+            succeed = workflow_.postProcessing();
+        }
+    }
+
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    // optional debug export of the squeezed result; note that this runs whether or not the recon succeeded
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "Node_Recon2DT_" << *jobID;
+
+        hoNDArray<GT_Complex8> res = workflow_.res_;
+        res.squeeze();
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, res, ostr.str());
+    }
+
+    // clean the kspace and ker and coil map
+    job->kspace.clear();
+
+    if ( succeed )
+    {
+        job->complexIm = workflow_.res_;
+    }
+    else
+    {
+        job->complexIm.clear();
+        job->res.clear();
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutJob(*jobID, job), GADGET_FAIL);
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob2DTGadgetCloud::process(...) ends ... ");
+
+    // reset the status
+    workflow_.data_ = NULL;
+    workflow_.ref_ = NULL;
+    workflow_.noise_ = NULL;
+    workflow_.workOrder_ = NULL;
+    // Gadgetron::clear(&workflow_.res_);
+
+    // presumably m2 is chained behind m1, so this releases the whole incoming message - confirm against Gadget2
+    m1->release();
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::
+sendOutJob(int jobID, GtPlusRecon2DTCloudPackageCPFL* job)
+{
+    // Wraps jobID and a copy of *job into the chain [identifier] -> [job id] -> [job package] and passes it to the controller.
+    GadgetContainerMessage<GadgetMessageIdentifier>* mb = NULL; // chain head; non-NULL only while this function still owns the chain
+    try
+    {
+        ACE_DEBUG( (LM_INFO, ACE_TEXT("GtPlusReconJob2DTGadgetCloud sendOutJob ... ")) );
+        if (!this->controller_)
+        {
+            ACE_DEBUG( (LM_DEBUG, ACE_TEXT("Cannot return result to controller, no controller set")) );
+            return false;
+        }
+
+        mb = new GadgetContainerMessage<GadgetMessageIdentifier>();
+        mb->getObjectPtr()->id = GADGET_MESSAGE_GADGETCLOUD_JOB;
+
+        GadgetContainerMessage<int>* m1 = new GadgetContainerMessage<int>();
+        mb->cont(m1); // link immediately, so that releasing mb frees everything allocated so far
+        *(m1->getObjectPtr()) = jobID;
+
+        GadgetContainerMessage<GtPlusRecon2DTCloudPackageCPFL>* m2 = new GadgetContainerMessage<GtPlusRecon2DTCloudPackageCPFL>();
+        m1->cont(m2);
+        *(m2->getObjectPtr()) = *job;
+        // from here on the controller call decides the fate of the chain; NOTE(review): who frees it when output_ready fails - confirm
+        GadgetContainerMessage<GadgetMessageIdentifier>* outgoing = mb;
+        mb = NULL;
+        int ret =  this->controller_->output_ready(outgoing);
+        if (ret < 0)
+        {
+            GADGET_DEBUG1("Failed to return GtPlusReconJob2DTGadgetCloud job massage to controller\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        if ( mb != NULL ) mb->release(); // an allocation or the job copy threw before hand-over: free the partial chain
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob2DTGadgetCloud::sendOutJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusReconJob2DTGadgetCloud::
+    generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath)
+{
+    // Builds "${GADGETRON_HOME}/<debugFolder>/"; if GADGETRON_HOME is unset the path is cleared and false is returned.
+    const char* home = ACE_OS::getenv("GADGETRON_HOME");
+    if ( home == NULL ) { debugFolderPath.clear(); GADGET_ERROR_MSG("GADGETRON_HOME is not set; cannot build the debug folder path"); return false; }
+    debugFolderPath = std::string(home) + "/" + debugFolder + "/";
+    GADGET_CONDITION_MSG(verboseMode_, "Debug folder is " << debugFolderPath);
+    return true;
+}
+
+void GtPlusReconJob2DTGadgetCloud::
+    getCurrentMoment(std::string& procTime)
+{
+    // Writes the local time as "_Day_DD_Mon_YYYY_HH_MM_SS" into procTime; procTime is left empty if the time cannot be formatted.
+    char timestamp[100];
+    const time_t now = time(NULL);
+    const struct tm* local = localtime(&now); // NOTE(review): localtime() returns shared static storage - not thread-safe
+    // localtime() may return NULL and strftime() returns 0 on failure, leaving the buffer contents unspecified
+    const size_t len = ( local != NULL ) ? strftime(timestamp, sizeof(timestamp),"_%a_%d_%b_%Y_%H_%M_%S",local) : 0;
+    procTime.assign(timestamp, len);
+}
+
+bool GtPlusReconJob2DTGadgetCloud::
+generateKSpaceFilter(WorkOrderType& workOrder)
+{ // Creates the image, reference and partial fourier kspace filters of workOrder from the configured filter settings; false on any failure.
+    try
+    {
+        size_t RO = workOrder.data_.get_size(0);
+        size_t E1 = workOrder.data_.get_size(1);
+        size_t E2 = workOrder.data_.get_size(4);
+
+        size_t RO_ref = workOrder.ref_.get_size(0);
+        size_t E1_ref = workOrder.ref_.get_size(1);
+        size_t E2_ref = workOrder.ref_.get_size(4);
+
+        if ( workOrder.CalibMode_ == Gadgetron::gtPlus::ISMRMRD_interleaved )
+        {
+            RO_ref = RO;
+            E1_ref = E1;
+            E2_ref = E2;
+        }
+
+        // image data filter
+        if ( RO>1 && filterRO_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterRO_.create(RO);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(RO, workOrder.filterRO_, filterRO_type_, filterRO_sigma_, std::ceil(filterRO_width_*RO)));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterRO_, "filterRO");
+        }
+
+        if ( E1>1 && filterE1_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterE1_.create(E1);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E1, workOrder.filterE1_, filterE1_type_, filterE1_sigma_, std::ceil(filterE1_width_*E1)));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_, "filterE1");
+        }
+
+        if ( E2>1 && filterE2_type_ != ISMRMRD_FILTER_NONE )
+        {
+            workOrder.filterE2_.create(E2);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(E2, workOrder.filterE2_, filterE2_type_, filterE2_sigma_, std::ceil(filterE2_width_*E2)));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_, "filterE2");
+        }
+
+        // ref data filter
+        if ( workOrder.ref_.get_number_of_elements() > 0 )
+        {
+            size_t startRO(0), endRO(0), startE1(0), endE1(0), startE2(0), endE2(0);
+            if ( E2_ref == 1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion2D(workOrder.ref_, startRO, endRO, startE1, endE1));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.detectSampledRegion3D(workOrder.ref_, startRO, endRO, startE1, endE1, startE2, endE2));
+            }
+
+            if ( (workOrder.CalibMode_ == ISMRMRD_interleaved) || (workOrder.CalibMode_ == ISMRMRD_embedded) )
+            {
+                // use the image data sample range; negative (unset) limits are tested on the work order fields, because startRO/endRO are unsigned
+                startRO = ( workOrder.start_RO_ < 0 ) ? 0 : (size_t)(workOrder.start_RO_);
+                endRO = ( workOrder.end_RO_ < 0 ) ? RO_ref-1 : (size_t)(workOrder.end_RO_);
+            }
+
+            if ( RO_ref > 1 && filterRO_ref_type_ != ISMRMRD_FILTER_NONE )
+            {
+                workOrder.filterRO_ref_.create(RO_ref);
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO_ref, startRO, endRO, workOrder.filterRO_ref_, filterRO_ref_type_, filterRO_ref_sigma_, std::ceil(filterRO_ref_width_*RO_ref)));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterRO_ref_, "filterRO_ref");
+            }
+
+            if ( (workOrder.CalibMode_ == ISMRMRD_separate) || (workOrder.CalibMode_ == ISMRMRD_external) )
+            {
+                if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = endE1-startE1+1; // the filter covers only the detected sampled E1 range
+                    workOrder.filterE1_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, std::ceil(filterE1_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_ref_, "filterE1_ref");
+                }
+
+                if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = endE2-startE2+1; // the filter covers only the detected sampled E2 range
+                    workOrder.filterE2_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilter(len, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, std::ceil(filterE2_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_ref_, "filterE2_ref");
+                }
+            }
+            else
+            {
+                // this makes sure for interleaved and embedded, the kspace filter is applied at correct lines
+                if ( E1_ref > 1 && filterE1_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = E1_ref;
+                    workOrder.filterE1_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE1, endE1, workOrder.filterE1_ref_, filterE1_ref_type_, filterE1_ref_sigma_, std::ceil(filterE1_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_ref_, "filterE1_ref");
+                }
+
+                if ( E2_ref > 1 && filterE2_ref_type_ != ISMRMRD_FILTER_NONE )
+                {
+                    size_t len = E2_ref;
+                    workOrder.filterE2_ref_.create(len);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(len, startE2, endE2, workOrder.filterE2_ref_, filterE2_ref_type_, filterE2_ref_sigma_, std::ceil(filterE2_ref_width_*len)));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_ref_, "filterE2_ref");
+                }
+            }
+        }
+
+        // partial fourier handling filter
+        if ( RO>1 && workOrder.start_RO_>=0 && workOrder.end_RO_>0 )
+        {
+            workOrder.filterRO_partialfourier_.create(RO);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder.start_RO_, workOrder.end_RO_, workOrder.filterRO_partialfourier_, filterRO_pf_type_, std::ceil(filterRO_pf_width_*RO), filterRO_pf_densityComp_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterRO_partialfourier_, "filterRO_partialfourier");
+        }
+
+        if ( E1>1 && workOrder.start_E1_>=0 && workOrder.end_E1_>0 )
+        {
+            workOrder.filterE1_partialfourier_.create(E1);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder.start_E1_, workOrder.end_E1_, workOrder.filterE1_partialfourier_, filterE1_pf_type_, std::ceil(filterE1_pf_width_*E1), filterE1_pf_densityComp_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE1_partialfourier_, "filterE1_partialfourier");
+        }
+
+        if ( E2>1 && workOrder.start_E2_>=0 && workOrder.end_E2_>0 )
+        {
+            workOrder.filterE2_partialfourier_.create(E2);
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E2, workOrder.start_E2_, workOrder.end_E2_, workOrder.filterE2_partialfourier_, filterE2_pf_type_, std::ceil(filterE2_pf_width_*E2), filterE2_pf_densityComp_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_fullPath_, gt_exporter_, workOrder.filterE2_partialfourier_, "filterE2_partialfourier");
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob2DTGadgetCloud::generateKSpaceFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconJob2DTGadgetCloud) // presumably emits the factory entry points used to create this gadget by name - confirm against the macro definition
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h
new file mode 100644
index 0000000..59fe9e7
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob2DTGadgetCloud.h
@@ -0,0 +1,195 @@
+/** \file   GtPlusReconJob2DTGadgetCloud.h
+    \brief  This gadget serves as the first layer gadget for the dual layer cloud.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+            Magnetic Resonance in Medicine on Dec 2013.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "hoNDArray_utils.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+#include "gtPlusMemoryManager.h"
+
+#include "GtPlusRecon2DTCloudPackage.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusReconJob2DTGadgetCloud : public Gadgetron::Gadget2< int, GtPlusRecon2DTCloudPackageCPFL >
+{
+public:
+    GADGET_DECLARE(GtPlusReconJob2DTGadgetCloud);
+    // Gadget whose process() receives an int job ID message followed by a GtPlusRecon2DTCloudPackageCPFL message.
+    typedef std::complex<float> ValueType; // single precision complex values
+    typedef Gadgetron::Gadget2< int, GtPlusRecon2DTCloudPackageCPFL > BaseClass;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef WorkOrderType WorkOrder2DTType; // second name for the same work order type
+
+    GtPlusReconJob2DTGadgetCloud();
+    ~GtPlusReconJob2DTGadgetCloud();
+
+    // kspace filter: one (type, sigma, width) triple for each of RO, E1 and E2
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterRO_type_;
+    double filterRO_sigma_;
+    double filterRO_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE1_type_;
+    double filterE1_sigma_;
+    double filterE1_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE2_type_;
+    double filterE2_sigma_;
+    double filterE2_width_;
+
+    // ref data filter: same layout as the kspace filter settings above
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterRO_ref_type_;
+    double filterRO_ref_sigma_;
+    double filterRO_ref_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE1_ref_type_;
+    double filterE1_ref_sigma_;
+    double filterE1_ref_width_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE2_ref_type_;
+    double filterE2_ref_sigma_;
+    double filterE2_ref_width_;
+
+    // partial fourier filter; generateKSpaceFilter() passes type, ceil(width*N) and densityComp to generateAsymmetricFilter()
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterRO_pf_type_;
+    double filterRO_pf_sigma_;
+    double filterRO_pf_width_; // fraction of RO: multiplied by the RO size before use
+    bool filterRO_pf_densityComp_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE1_pf_type_;
+    double filterE1_pf_sigma_;
+    double filterE1_pf_width_; // fraction of E1: multiplied by the E1 size before use
+    bool filterE1_pf_densityComp_;
+
+    Gadgetron::gtPlus::ISMRMRDKSPACEFILTER filterE2_pf_type_;
+    double filterE2_pf_sigma_;
+    double filterE2_pf_width_; // fraction of E2: multiplied by the E2 size before use
+    bool filterE2_pf_densityComp_;
+
+    bool job_split_by_S_; // NOTE(review): the job_* members presumably control how work is split into jobs - confirm in the .cpp
+    size_t job_num_of_N_;
+    size_t job_max_Megabytes_;
+    size_t job_overlap_;
+
+    /// cloud related definition
+    bool CloudComputing_;
+    unsigned int CloudSize_;
+
+    typedef boost::tuple<std::string, std::string, std::string, unsigned int> CloudNodeType; // one node: three strings and an unsigned int (field meaning not visible in this header)
+    typedef std::vector<CloudNodeType> CloudType; // list of nodes, filled through parseGTCloudNodeFile()
+
+    CloudType gt_cloud_;
+
+    // debug folder: name as configured and the full path derived from it
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_;
+
+    // debug folder 2: second name / full path pair
+    std::string debugFolder2_;
+    std::string debugFolder2_fullPath_;
+
+    // cloud node file
+    std::string cloud_node_file_;
+
+    // whether to perform timing
+    bool performTiming_;
+
+protected:
+
+    // --------------------------------------------------
+    // utility functions
+    // --------------------------------------------------
+
+    // generate the debug folder path
+    // debugFolderPath = ${GADGETRON_HOME}/debugFolder
+    virtual bool generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath);
+
+    // get the current moment; the result is written to procTime
+    void getCurrentMoment(std::string& procTime);
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function: m1 carries the int, m2 the cloud package
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusRecon2DTCloudPackageCPFL > * m2);
+
+    bool parseGTCloudNodeFile(const std::string& filename, CloudType& gtCloud); // fills gtCloud from the file named filename
+
+    // process config is only to be called once
+    bool process_config_called_;
+
+    // read in parameters
+    virtual bool readParameters();
+
+    // send the completed job
+    bool sendOutJob(int jobID, GtPlusRecon2DTCloudPackageCPFL* job);
+
+    // set 2DT specific work order parameters
+    bool setWorkOrder2DTParameters(GtPlusRecon2DTPara& para, WorkOrder2DTType* workOrder);
+
+    // compute the kspace filter; stores the partial fourier filters in workOrder and returns false on any failure
+    bool generateKSpaceFilter(WorkOrderType& workOrder);
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker: one instance per reconstruction method
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker2DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    // workOrder for recon
+    WorkOrder2DTType workOrder_recon_;
+
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_; // provides generateAsymmetricFilter() used by generateKSpaceFilter()
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter: passed to GADGET_EXPORT_ARRAY_COMPLEX together with the debug folder path
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out
+    bool verboseMode_;
+
+    // memory manager
+    boost::shared_ptr<Gadgetron::gtPlus::gtPlusMemoryManager> mem_manager_;
+};
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp b/gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp
new file mode 100644
index 0000000..3cf5a98
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob3DTGadget.cpp
@@ -0,0 +1,262 @@
+
+#include "GtPlusReconJob3DTGadget.h"
+#include "GtPlusGadgetOpenMP.h"
+
+using namespace Gadgetron::gtPlus;
+
+namespace Gadgetron
+{
+
+// Sets the defaults (debug folder "DebugOutput", timing on, verbose off), creates the
+// memory manager and calls the OpenMP / MKL preparation helpers.
+GtPlusReconJob3DTGadget::GtPlusReconJob3DTGadget()
+    : debugFolder_("DebugOutput")
+    , performTiming_(true)
+    , process_config_called_(false)
+    , verboseMode_(false)
+    , mem_manager_(new Gadgetron::gtPlus::gtPlusMemoryManager(4, 640*1024*1024))
+{
+    // switch off timing-in-destruction on all three timers
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+
+    Gadgetron::prepOpenMP();
+    Gadgetron::prepMKL();
+}
+
+GtPlusReconJob3DTGadget::~GtPlusReconJob3DTGadget()
+{
+    // intentionally empty: no explicit cleanup is performed here
+}
+
+bool GtPlusReconJob3DTGadget::readParameters()
+{
+    // Loads debugFolder, debugFolder2 and performTiming via get_string_value / get_bool_value; false if anything throws.
+    try
+    {
+        GADGET_CONDITION_MSG(verboseMode_, "------> GtPlusReconJob3DTGadget parameters <------");
+
+        debugFolder_ = *( this->get_string_value("debugFolder") );
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder_ is " << debugFolder_);
+
+        debugFolder2_ = *( this->get_string_value("debugFolder2") );
+        GADGET_CONDITION_MSG(verboseMode_, "debugFolder2_ is " << debugFolder2_);
+
+        performTiming_ = this->get_bool_value("performTiming");
+        GADGET_CONDITION_MSG(verboseMode_, "performTiming_ is " << performTiming_);
+
+        GADGET_CONDITION_MSG(verboseMode_, "-----------------------------------------------");
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob3DTGadget::readParameters() ... ");
+    }
+
+    // only reached through the catch block
+    return false;
+}
+
+int GtPlusReconJob3DTGadget::process_config(ACE_Message_Block* mb)
+{
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3    4   5    6     7  8   9
+    // Reads the parameters, resolves the debug folders, grows the memory pool and hands it to the workers; returns GADGET_OK or GADGET_FAIL (mb is not used).
+    verboseMode_ = this->get_bool_value("verboseMode");
+
+    // read in parameters from the xml
+    GADGET_CHECK_RETURN(this->readParameters(), GADGET_FAIL);
+
+    // generate the destination folder; failures must yield GADGET_FAIL, because returning 'false' from this int function reads as 0, i.e. success
+    if ( !debugFolder_.empty() )
+    {
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder_, debugFolder_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder is not set ...");
+    }
+
+    if ( !debugFolder2_.empty() )
+    {
+        GADGET_CHECK_RETURN(generateDebugFolderPath(debugFolder2_, debugFolder2_fullPath_), GADGET_FAIL);
+    }
+    else
+    {
+        GADGET_MSG("GtPlusRecon, debugFolder2 is not set ...");
+    }
+
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Pre-allocate memory ... ", performTiming_);
+    mem_manager_->increase(6.0*1024*1024*1024); // 6 * 1024^3; presumably a byte count, i.e. 6 GiB - TODO confirm
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+    // every worker is given the same memory manager instance
+    worker_grappa_.gtPlus_mem_manager_ = mem_manager_;
+    worker_noacceleration_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_.gtPlus_mem_manager_ = mem_manager_;
+    worker_spirit_L1_ncg_.gtPlus_mem_manager_ = mem_manager_;
+
+    return GADGET_OK;
+}
+
+int GtPlusReconJob3DTGadget::process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2)
+{
+    // Runs one job: m1 holds the job ID, m2 the job itself. The result is sent back through sendOutJob(); returns GADGET_OK or GADGET_FAIL.
+    if ( !process_config_called_ ) // the parameter configuration will not be sent, so process_config is called explicitly, once
+    {
+        GADGET_CHECK_RETURN( (this->process_config(m1)==0), GADGET_FAIL);
+        process_config_called_ = true;
+    }
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob3DTGadget::process(...) starts ... ");
+
+    int* jobID = m1->getObjectPtr();
+    GADGET_CONDITION_MSG(verboseMode_, "--> arriving job : " << *jobID << " ... ");
+
+    GtPlusReconJobTypeCPFL* job = m2->getObjectPtr();
+    GADGET_CONDITION_MSG(verboseMode_, "    job array size : [ " << job->kspace.get_size(0) << " "
+                                                                 << job->kspace.get_size(1) << " "
+                                                                 << job->kspace.get_size(2) << " "
+                                                                 << job->kspace.get_size(3) << " ] ... ");
+
+    // set the workers: timing flag always, debug folder only when a full path was generated
+    worker_grappa_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_grappa_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_noacceleration_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_noacceleration_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_.debugFolder_ = debugFolder_fullPath_;
+
+    worker_spirit_L1_ncg_.performTiming_ = performTiming_;
+    if ( !debugFolder_fullPath_.empty() ) worker_spirit_L1_ncg_.debugFolder_ = debugFolder_fullPath_;
+
+    if ( verboseMode_ )
+    {
+        job->workOrder2DT.print(std::cout);
+    }
+
+    bool succeed = true;
+    GADGET_START_TIMING_CONDITION(gt_timer1_, "Recon 2DT job ... ", performTiming_);
+    // the job is always handed to the L1 SPIRIT NCG worker
+    succeed = worker_spirit_L1_ncg_.performUnwarppingImpl(*job);
+
+    GADGET_STOP_TIMING_CONDITION(gt_timer1_, performTiming_);
+
+    // export the results (only when the second debug folder path was generated)
+    if ( !debugFolder2_fullPath_.empty() )
+    {
+        std::ostringstream ostr;
+        ostr << "ReconJob2DT_ID" << *jobID;
+
+        hoNDArray<GT_Complex8> res = job->res; // squeeze a copy so that job->res itself stays untouched
+        res.squeeze();
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, res, ostr.str());
+
+        std::ostringstream ostr2;
+        ostr2 << "Job2DT_kspace_ID" << *jobID;
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, job->kspace, ostr2.str());
+
+        std::ostringstream ostr3;
+        ostr3 << "Job2DT_ker_ID" << *jobID;
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, job->ker, ostr3.str());
+
+        if ( job->workOrder2DT.coilMap_ && job->workOrder2DT.coilMap_->get_number_of_elements() > 0 ) // the coil map pointer may be unset
+        {
+            std::ostringstream ostr4;
+            ostr4 << "Job2DT_coilmap_ID" << *jobID;
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder2_fullPath_, gt_exporter_, *job->workOrder2DT.coilMap_, ostr4.str());
+        }
+    }
+
+    // clean the kspace and ker and coil map, so that they are not copied into the outgoing message
+    job->kspace.clear();
+    job->ker.clear();
+    if ( job->workOrder2DT.coilMap_ ) job->workOrder2DT.coilMap_->clear(); // clear only when a coil map exists (the test used to be inverted)
+
+    if ( !succeed ) // a failed job is still sent back, with its results emptied
+    {
+        job->complexIm.clear();
+        job->res.clear();
+    }
+
+    // send out the results
+    GADGET_CHECK_RETURN(this->sendOutJob(*jobID, job), GADGET_FAIL);
+
+    GADGET_CONDITION_MSG(verboseMode_, "GtPlusReconJob3DTGadget::process(...) ends ... ");
+    // sendOutJob() sent a copy of the job, so the incoming message is released here
+    m1->release();
+
+    return GADGET_OK;
+}
+
+bool GtPlusReconJob3DTGadget::
+sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job)
+{
+    // Wraps jobID and a copy of *job into a GADGET_MESSAGE_CLOUD_JOB message chain and passes it to the controller.
+    try
+    {
+        ACE_DEBUG( (LM_INFO, ACE_TEXT("GtPlusReconJob3DTGadget sendOutJob ... ")) );
+
+        if (!this->controller_)
+        {
+            ACE_DEBUG( (LM_DEBUG, ACE_TEXT("Cannot return result to controller, no controller set")) );
+            return false;
+        }
+
+        // head of the chain: the message identifier
+        GadgetContainerMessage<GadgetMessageIdentifier>* idMsg = new GadgetContainerMessage<GadgetMessageIdentifier>();
+        idMsg->getObjectPtr()->id = GADGET_MESSAGE_CLOUD_JOB;
+
+        // second link: the job ID
+        GadgetContainerMessage<int>* jobIdMsg = new GadgetContainerMessage<int>();
+        *(jobIdMsg->getObjectPtr()) = jobID;
+
+        // third link: a copy of the job
+        GadgetContainerMessage<GtPlusReconJobTypeCPFL>* jobMsg = new GadgetContainerMessage<GtPlusReconJobTypeCPFL>();
+        *(jobMsg->getObjectPtr()) = *job;
+
+        jobIdMsg->cont(jobMsg);
+        idMsg->cont(jobIdMsg);
+
+        if ( this->controller_->output_ready(idMsg) < 0 )
+        {
+            GADGET_DEBUG1("Failed to return GtPlusReconJob3DTGadget job massage to controller\n");
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GtPlusReconJob3DTGadget::sendOutJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool GtPlusReconJob3DTGadget::
+    generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath)
+{
+    // debugFolderPath = ${GADGETRON_HOME}/debugFolder/ ; returns false and leaves the path untouched when GADGETRON_HOME is unset (getenv then yields NULL, which must never be assigned to a std::string)
+    const char* gadgetronHome = ACE_OS::getenv("GADGETRON_HOME");
+    if ( gadgetronHome == NULL ) { GADGET_ERROR_MSG("GADGETRON_HOME is not set, cannot generate the debug folder path ... "); return false; }
+    debugFolderPath = std::string(gadgetronHome) + "/" + debugFolder + "/";
+    GADGET_CONDITION_MSG(verboseMode_, "Debug folder is " << debugFolderPath);
+    return true;
+}
+
+void GtPlusReconJob3DTGadget::
+    getCurrentMoment(std::string& procTime)
+{
+    // Writes the local time, formatted as "_%a_%d_%b_%Y_%H_%M_%S", to procTime; procTime is emptied if the time cannot be formatted.
+    char timestamp[100];
+    time_t mytime = time(NULL);
+    struct tm *mytm = localtime(&mytime);
+    // localtime may return NULL, and strftime returns 0 when it produced no usable string
+    const bool formatted = ( mytm != NULL ) && ( strftime(timestamp, sizeof(timestamp),"_%a_%d_%b_%Y_%H_%M_%S",mytm) > 0 );
+    procTime = formatted ? timestamp : "";
+}
+
+GADGET_FACTORY_DECLARE(GtPlusReconJob3DTGadget)
+
+}
diff --git a/gadgets/gtPlus/GtPlusReconJob3DTGadget.h b/gadgets/gtPlus/GtPlusReconJob3DTGadget.h
new file mode 100644
index 0000000..947f9f4
--- /dev/null
+++ b/gadgets/gtPlus/GtPlusReconJob3DTGadget.h
@@ -0,0 +1,117 @@
+/** \file   GtPlusReconJob3DTGadget.h
+    \brief  This gadget serves as the working gadget for the single layer cloud for 3DT reconstruction.
+
+            Ref to: 
+
+            Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+            Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+            Magnetic Resonance in Medicine on Dec 2013.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include <complex>
+#include "GtPlusGadgetExport.h"
+#include "Gadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+#include "GadgetronTimer.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusISMRMRDReconWorker3DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker3DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h"
+#include "gtPlusMemoryManager.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron
+{
+
+class EXPORTGTPLUSGADGET GtPlusReconJob3DTGadget : public Gadgetron::Gadget2< int, GtPlusReconJobTypeCPFL >
+{
+public:
+    GADGET_DECLARE(GtPlusReconJob3DTGadget);
+    // Gadget whose process() receives an int job ID message followed by a GtPlusReconJobTypeCPFL job message.
+    typedef std::complex<float> ValueType;
+
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder<ValueType> WorkOrderType;
+
+    typedef Gadget2< int, GtPlusReconJobTypeCPFL > BaseClass;
+
+    GtPlusReconJob3DTGadget();
+    ~GtPlusReconJob3DTGadget();
+
+    // debug folder: name read from the "debugFolder" property (constructor default "DebugOutput") and the full path derived from it
+    std::string debugFolder_;
+    std::string debugFolder_fullPath_; // handed to the workers in process() when not empty
+    // second debug folder ("debugFolder2" property); process() exports the job arrays here when the full path is not empty
+    std::string debugFolder2_;
+    std::string debugFolder2_fullPath_;
+
+    // whether to perform timing ("performTiming" property, constructor default true)
+    bool performTiming_;
+
+protected:
+
+    // --------------------------------------------------
+    // utility functions
+    // --------------------------------------------------
+
+    // generate the debug folder path
+    // debugFolderPath = ${GADGETRON_HOME}/debugFolder
+    virtual bool generateDebugFolderPath(const std::string& debugFolder, std::string& debugFolderPath);
+
+    // get the current moment: local time formatted with "_%a_%d_%b_%Y_%H_%M_%S"
+    void getCurrentMoment(std::string& procTime);
+
+    // --------------------------------------------------
+    // functional functions
+    // --------------------------------------------------
+
+    // default interface function: m1 carries the job ID, m2 the job
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(Gadgetron::GadgetContainerMessage< int >* m1, Gadgetron::GadgetContainerMessage< GtPlusReconJobTypeCPFL > * m2);
+
+    // process config is only to be called once; process() calls it on the first job and then sets this flag
+    bool process_config_called_;
+
+    // read in parameters (debugFolder, debugFolder2, performTiming); returns false if reading throws
+    virtual bool readParameters();
+
+    // send the completed job: a copy of *job is passed to the controller together with jobID
+    bool sendOutJob(int jobID, GtPlusReconJobTypeCPFL* job);
+
+    // worker: process() configures all four but runs the job on worker_spirit_L1_ncg_ only
+    Gadgetron::gtPlus::gtPlusReconWorker3DTGRAPPA<ValueType> worker_grappa_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTNoAcceleration<ValueType> worker_noacceleration_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTSPIRIT<ValueType> worker_spirit_;
+    Gadgetron::gtPlus::gtPlusReconWorker3DTL1SPIRITNCG<ValueType> worker_spirit_L1_ncg_;
+
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    Gadgetron::gtPlus::gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // clock for timing; GtPlusReconJob3DTGadget.cpp starts and stops gt_timer1_ only
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // exporter: passed to GADGET_EXPORT_ARRAY_COMPLEX in process()
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // in verbose mode, more info is printed out ("verboseMode" property, read in process_config)
+    bool verboseMode_;
+
+    // memory manager: created in the constructor, increased in process_config() and assigned to every worker
+    boost::shared_ptr<Gadgetron::gtPlus::gtPlusMemoryManager> mem_manager_;
+};
+
+}
diff --git a/gadgets/matlab/BaseGadget.m b/gadgets/matlab/BaseGadget.m
new file mode 100644
index 0000000..37e836a
--- /dev/null
+++ b/gadgets/matlab/BaseGadget.m
@@ -0,0 +1,73 @@
+classdef BaseGadget < handle
+    % BaseGadget  Handle class that collects header/data pairs in the struct array Q.
+    properties
+
+        Q = [];     % output queue: struct array with fields type, bytes and data (filled by putQ)
+        xml = [];   % header object created from the XML config string by init
+
+    end
+
+    methods
+
+        % Constructor
+        function g = BaseGadget()
+        end
+
+        % Init function: store the parsed XML configuration and start with an empty queue
+        function init(g, xmlstr)
+            % Convert the xml config string to an IsmrmrdHeader object
+            g.xml = org.ismrm.ismrmrd.XMLString.StringToIsmrmrdHeader(xmlstr);
+            g.emptyQ();
+        end
+
+        % Entry point: decode hdr_bytes (htype 1 = acquisition, 2 = image), call process and return the queue
+        function [Q] = run_process(g, htype, hdr_bytes, data)
+            if (htype == 1)
+                head = ismrmrd.AcquisitionHeader(hdr_bytes);
+            elseif (htype == 2)
+                head = ismrmrd.ImageHeader(hdr_bytes);
+            else
+                error('Unknown header type.');
+            end
+            g.process(head, data);
+            Q = g.Q;
+        end
+
+        % Config function: print g.xml as an XML string. NOTE(review): uses package org.ismrm.ismrmrd.xmlhdr while init uses org.ismrm.ismrmrd - confirm
+        function config(g)
+            fprintf('%s\n',char(org.ismrm.ismrmrd.xmlhdr.XMLString.IsmrmrdHeaderToString(g.xml)));
+        end
+
+        % Process function: this default implementation simply queues the header and data via putQ
+        function process(g, head, data)
+            g.putQ(head,data);
+        end
+
+        % Q related functions
+        function emptyQ(g)
+           g.Q = [];
+        end
+
+        function putQ(g, head, data)
+            % find the end of the queue
+            idx = length(g.Q) + 1;
+            % put the type of the header and the bytes for the header on the queue
+            if isa(head, 'ismrmrd.AcquisitionHeader')
+                g.Q(idx).type = int32(1);
+                head.check(); % fix the types
+                g.Q(idx).bytes = head.toBytes();
+            elseif isa(head, 'ismrmrd.ImageHeader')
+                g.Q(idx).type = int32(2);
+                head.check(); % fix the types
+                g.Q(idx).bytes = head.toBytes();
+            else
+                % TODO: do we throw an error here? For now the entry gets type 0 and no header bytes.
+                g.Q(idx).type = int32(0);
+            end
+            % put the data on the queue
+            % make sure the data is single precision
+            g.Q(idx).data = single(data);
+        end
+
+    end
+end
diff --git a/gadgets/matlab/CMakeLists.txt b/gadgets/matlab/CMakeLists.txt
new file mode 100644
index 0000000..d4c7358
--- /dev/null
+++ b/gadgets/matlab/CMakeLists.txt
@@ -0,0 +1,48 @@
+find_package(Ismrmrd REQUIRED)
+# Matlab headers; MATLAB_INCLUDE_DIR is not set in this file
+include_directories(${MATLAB_INCLUDE_DIR})
+# Platform-specific suffix; MATLAB_SUFFIX is not referenced again in this file
+if (UNIX)
+    if (APPLE)
+        SET(MATLAB_SUFFIX ".mexmaci64")
+    else(APPLE)
+        SET(MATLAB_SUFFIX ".mexglnxa64")
+    endif(APPLE)
+else(UNIX)
+    SET(MATLAB_SUFFIX ".dll")
+endif(UNIX)
+# Shared library built from MatlabGadget.cpp
+add_library(gadgetron_matlab SHARED MatlabGadget.cpp)
+target_link_libraries(
+    gadgetron_matlab
+    cpucore
+    ${MATLAB_LIBRARIES}
+    ${ISMRMRD_LIBRARIES}
+    ${ISMRMRD_XSD_LIBRARIES}
+    optimized ${ACE_LIBRARIES}
+    debug ${ACE_DEBUG_LIBRARY}
+)
+# On UNIX, compile MatlabCommandServer.java into a .class file in the build directory and install it
+if (UNIX)
+    set(JAVA_MATLAB_SERVER_SRC "MatlabCommandServer.java")
+    string(REPLACE "java" "class" JAVA_MATLAB_SERVER_CLASS ${JAVA_MATLAB_SERVER_SRC})
+    set(JAVA_MATLAB_SERVER_CLASS "${CMAKE_CURRENT_BINARY_DIR}/${JAVA_MATLAB_SERVER_CLASS}")
+    # MATLAB_JARS is a ';'-separated CMake list; the javac class path below uses ':' as separator
+    string(REPLACE ";" ":" MATLAB_UNIX_JARS "${MATLAB_JARS}")
+
+    add_custom_command(
+        OUTPUT ${JAVA_MATLAB_SERVER_CLASS}
+        DEPENDS ${JAVA_MATLAB_SERVER_SRC}
+        COMMAND javac -d ${CMAKE_CURRENT_BINARY_DIR} -cp "${MATLAB_UNIX_JARS}" ${CMAKE_CURRENT_SOURCE_DIR}/${JAVA_MATLAB_SERVER_SRC}
+        COMMENT "Generating Matlab Command Server class" VERBATIM
+    )
+    add_custom_target(matlab_command_server ALL DEPENDS ${JAVA_MATLAB_SERVER_CLASS})
+    install(FILES ${JAVA_MATLAB_SERVER_CLASS} DESTINATION matlab)
+else(UNIX)
+        MESSAGE( "Don't know how to build the Matlab Command Server class on Windows" )
+endif(UNIX)
+# Install the library, its headers, the Matlab scripts and the configuration file
+install(TARGETS gadgetron_matlab DESTINATION lib)
+install(FILES MatlabGadget.h gadgetron_matlab_export.h DESTINATION include)
+install(FILES BaseGadget.m scale.m accumulate_and_recon.m mask_image.m DESTINATION matlab)
+install(FILES matlab.xml DESTINATION config)
diff --git a/gadgets/matlab/MatlabCommandServer.java b/gadgets/matlab/MatlabCommandServer.java
new file mode 100644
index 0000000..0a7bdb9
--- /dev/null
+++ b/gadgets/matlab/MatlabCommandServer.java
@@ -0,0 +1,129 @@
+import java.lang.Thread;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.io.*;
+import com.mathworks.jmi.*;
+
+class MatlabCommandServer extends Thread {
+
+    ServerSocket socket = null;
+    Matlab matlab = null;
+
+    int port;
+
+    boolean stop_signal_received = false;
+    boolean socket_is_open = false;
+
+    public MatlabCommandServer(int port) {
+	try {
+	    this.matlab = new Matlab();
+	} catch (Exception e) {
+	    System.err.println("Failed to create new Matlab");
+            System.err.println(e.getMessage());
+	}
+        this.port = port;
+    }
+
+    public int getLocalPort() {
+        if (!socket_is_open) {
+            System.err.println("Socket isn't open!");
+            return -1;
+        }
+        return this.socket.getLocalPort();
+    }
+
+    private boolean openSocket() {
+        if (socket_is_open) {
+            return false;
+        }
+
+        try {
+            this.socket = new ServerSocket(this.port);
+            this.socket.setSoTimeout(1000); //1000ms time out. We will check if shutdown has occurred every 1000ms.
+        } catch (Exception e) {
+            // Socket creation has failed, we should do something
+            System.err.println("Socket failed to open");
+            System.err.println(e.getMessage());
+            return false;
+        }
+        socket_is_open = true;
+        return true;
+    }
+
+
+    private boolean closeSocket() {
+        if (!socket_is_open) {
+            return false;
+        }
+
+        try {
+            socket.close();
+        } catch (Exception e) {
+            // Socket close has failed, we should do something
+            System.err.println("Socket failed to close");
+            System.err.println(e.getMessage());
+            return false;
+        }
+        socket_is_open = false;
+        return true;
+    }
+
+    private boolean receiveCommand() {
+	try {
+	    Socket sock = socket.accept();
+	    BufferedReader in = new BufferedReader(new InputStreamReader(sock.getInputStream()));
+
+	    //System.out.println("Waiting for command");
+	    while (!in.ready()) ;
+
+	    String command = in.readLine();
+
+	    //System.out.println(command);
+	    matlab.evalConsoleOutput(command);
+
+	    in.close();
+	    sock.close();
+
+
+	} catch (java.io.InterruptedIOException e) {
+             // This means that we have waited for connection but so far nothing.
+             // We should check if the thread has been notified to stop,
+             // if so, stop the loop and otherwise continue.
+	    if (stop_signal_received) {
+		return false;
+	    }
+	} catch (Exception e) {
+	    System.err.println("Something unexpected has happened!!");
+	    System.err.println(e.getMessage());
+	    return false;
+	}
+	return true;
+
+    }
+
+    public void notifyEnd() {
+	stop_signal_received = true;
+    }
+
+    public void run() {
+	if (!openSocket()) {
+            return;
+        }
+
+        System.err.format("Matlab Command Server is running on port %d%n", this.getLocalPort());
+
+	while (true) {
+	    if (!receiveCommand()) break;
+	}
+	closeSocket();
+	stop_signal_received = false;
+    }
+
+    protected void finalize() throws Throwable {
+	System.out.println("MatlabMessageServer finalize() called");
+	stop_signal_received  = true;
+	closeSocket();
+	super.finalize();
+    }
+
+}
diff --git a/gadgets/matlab/MatlabGadget.cpp b/gadgets/matlab/MatlabGadget.cpp
new file mode 100644
index 0000000..1f23329
--- /dev/null
+++ b/gadgets/matlab/MatlabGadget.cpp
@@ -0,0 +1,288 @@
+#include "MatlabGadget.h"
+
+namespace Gadgetron{
+
+/**
+ * Pushes one acquisition through the MATLAB gadget object and forwards
+ * whatever the MATLAB side queued in response.
+ *
+ * Logic:
+ *  - the AcquisitionHeader is put on the workspace as a byte array ("hdr_bytes")
+ *  - the samples are put on the workspace as a single precision complex
+ *    matrix ("data") of size number_of_samples x active_channels
+ *  - matgadget.run_process is called and the gadget's queue is emptied; this
+ *    leaves a copy of the queue on the workspace as "Q"
+ *  - Q is a structure array with fields type (1 = acquisition, 2 = image),
+ *    bytes (the header) and data; each entry is converted into a new message
+ *    pair and put on the next gadget's queue
+ *
+ * @param m1 incoming acquisition header; released here on success
+ * @param m2 incoming sample data (expected to be chained to m1 - TODO confirm)
+ * @return GADGET_OK on success, GADGET_FAIL on any error
+ */
+int AcquisitionMatlabGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    // Destroys an mxArray when the enclosing scope is left, so that every
+    // early return below cleans up without repeating the calls.
+    struct MxArrayGuard {
+        mxArray *array;
+        explicit MxArrayGuard(mxArray *a) : array(a) {}
+        ~MxArrayGuard() { if (array) mxDestroyArray(array); }
+    private:
+        MxArrayGuard(const MxArrayGuard&);
+        MxArrayGuard& operator=(const MxArrayGuard&);
+    };
+
+    ISMRMRD::AcquisitionHeader *acq = m1->getObjectPtr();
+
+    std::complex<float> *raw_data = m2->getObjectPtr()->get_data_ptr();
+    if (!raw_data) {
+        GADGET_DEBUG1("Broken raw_data pointer\n");
+        return GADGET_FAIL;
+    }
+
+    // The Matlab matrix is shaped from the header, so the incoming array has
+    // to hold at least that many samples or Matlab would read past its end.
+    const size_t num_samples  = acq->number_of_samples;
+    const size_t num_channels = acq->active_channels;
+    const size_t num_elements = num_samples*num_channels;
+    if (m2->getObjectPtr()->get_number_of_elements() < num_elements) {
+        GADGET_DEBUG1("Data array is smaller than number_of_samples x active_channels\n");
+        return GADGET_FAIL;
+    }
+
+    // Copy the header into a column of bytes
+    mwSize acq_hdr_dims[2] = {sizeof(ISMRMRD::AcquisitionHeader), 1};
+    mxArray *acq_hdr_bytes = mxCreateNumericArray(2, acq_hdr_dims, mxUINT8_CLASS, mxREAL);
+    MxArrayGuard acq_hdr_guard(acq_hdr_bytes);
+    if (!acq_hdr_bytes) {
+        GADGET_DEBUG1("Failed to allocate mxArray for the acquisition header\n");
+        return GADGET_FAIL;
+    }
+    memcpy(mxGetData(acq_hdr_bytes), acq, sizeof(ISMRMRD::AcquisitionHeader));
+
+    // Copy the data into separate real and imaginary planes
+    mxArray *acq_data = mxCreateNumericMatrix(num_samples, num_channels, mxSINGLE_CLASS, mxCOMPLEX);
+    MxArrayGuard acq_data_guard(acq_data);
+    if (!acq_data) {
+        GADGET_DEBUG1("Failed to allocate mxArray for the acquisition data\n");
+        return GADGET_FAIL;
+    }
+    float *real_data = static_cast<float *>(mxGetData(acq_data));
+    float *imag_data = static_cast<float *>(mxGetImagData(acq_data));
+    for (size_t i = 0; i < num_elements; i++) {
+        real_data[i] = raw_data[i].real();
+        imag_data[i] = raw_data[i].imag();
+    }
+
+    // TODO put the description of this protocol in a readme file somewhere useful
+    if (engPutVariable(engine_, "hdr_bytes", acq_hdr_bytes) != 0 ||
+        engPutVariable(engine_, "data", acq_data) != 0) {
+        GADGET_DEBUG1("Failed to put the acquisition on the Matlab workspace\n");
+        return GADGET_FAIL;
+    }
+
+    // A failed command would leave a stale Q from an earlier call on the
+    // workspace, so do not go on to read it.
+    // NOTE(review): outside debug mode the command is only sent to the command
+    // server here; confirm that it has been evaluated before Q is read.
+    std::string cmd = "Q = matgadget.run_process(1, hdr_bytes, data); matgadget.emptyQ();";
+    if (send_matlab_command(cmd) != GADGET_OK) {
+        GADGET_DEBUG1("Failed to send matlab command.\n");
+        return GADGET_FAIL;
+    }
+
+    // Get the gadget's queue; engGetVariable hands back a copy that we own
+    mxArray *Q = engGetVariable(engine_, "Q");
+    if (Q == NULL) {
+        GADGET_DEBUG1("Failed to get the Queue from matgadget\n");
+        return GADGET_FAIL;
+    }
+    MxArrayGuard q_guard(Q);
+    const size_t qlen = mxGetNumberOfElements(Q);
+
+    // Loop over the elements of the Q, reading one entry at a time
+    // to get a structure with type, headerbytes, and data
+    for (mwIndex idx = 0; idx < qlen; idx++) {
+        // These point into Q and must not be destroyed separately
+        mxArray *res_type = mxGetField(Q, idx, "type");
+        mxArray *res_hdr  = mxGetField(Q, idx, "bytes");
+        mxArray *res_data = mxGetField(Q, idx, "data");
+        if (!res_type || !res_hdr || !res_data || mxIsEmpty(res_type)) {
+            GADGET_DEBUG1("Matlab gadget returned a malformed queue entry\n");
+            return GADGET_FAIL;
+        }
+        if (!mxIsSingle(res_data)) {
+            GADGET_DEBUG1("Matlab gadget returned data that is not single precision\n");
+            return GADGET_FAIL;
+        }
+        const size_t hdr_bytes_available = mxGetNumberOfElements(res_hdr)*mxGetElementSize(res_hdr);
+
+        // determine the type of the object on the queue (i.e. acquisition or image)
+        // assumes the type field is stored as a 32 bit integer - TODO confirm
+        int tp = *((int *)mxGetData(res_type));
+        switch (tp) {
+        case 1:     // AcquisitionHeader
+        {
+            if (hdr_bytes_available < sizeof(ISMRMRD::AcquisitionHeader)) {
+                GADGET_DEBUG1("Matlab gadget returned a truncated AcquisitionHeader\n");
+                return GADGET_FAIL;
+            }
+
+            // grab the modified AcquisitionHeader and convert it back to C++
+            GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m3 =
+                    new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+            memcpy(m3->getObjectPtr(), mxGetData(res_hdr), sizeof(ISMRMRD::AcquisitionHeader));
+
+            const size_t number_of_samples = mxGetM(res_data);
+            const size_t active_channels = mxGetN(res_data);
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m4 =
+                    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            // m4 is chained to m3, so releasing m3 below also releases m4
+            m3->cont(m4);
+            std::vector<size_t> dims;
+            dims.push_back(number_of_samples);
+            dims.push_back(active_channels);
+            try {
+                m4->getObjectPtr()->create(&dims);
+            } catch (std::bad_alloc& err) {
+                GADGET_DEBUG1("Failed to create new hoNDArray\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            // Matlab drops the imaginary plane of purely real arrays, in
+            // which case mxGetImagData returns NULL.
+            const float *real_out = static_cast<const float *>(mxGetData(res_data));
+            const float *imag_out = static_cast<const float *>(mxGetImagData(res_data));
+            std::complex<float> *dst = m4->getObjectPtr()->get_data_ptr();
+            const size_t count = number_of_samples*active_channels;
+            for (size_t i = 0; i < count; i++) {
+                dst[i] = std::complex<float>(real_out[i], imag_out ? imag_out[i] : 0.0f);
+            }
+
+            if (this->next()->putq(m3) < 0) {
+                GADGET_DEBUG1("Failed to put Acquisition message on queue\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            break;
+        }
+        case 2:     // ImageHeader
+        {
+            if (hdr_bytes_available < sizeof(ISMRMRD::ImageHeader)) {
+                GADGET_DEBUG1("Matlab gadget returned a truncated ImageHeader\n");
+                return GADGET_FAIL;
+            }
+
+            // grab the ImageHeader and convert it back to C++
+            GadgetContainerMessage<ISMRMRD::ImageHeader>* m3 =
+                    new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+            ISMRMRD::ImageHeader *hdr_new = m3->getObjectPtr();
+            memcpy(hdr_new, mxGetData(res_hdr), sizeof(ISMRMRD::ImageHeader));
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m4 =
+                    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            // m4 is chained to m3, so releasing m3 below also releases m4
+            m3->cont(m4);
+            std::vector<size_t> dims;
+            dims.push_back(hdr_new->matrix_size[0]);
+            dims.push_back(hdr_new->matrix_size[1]);
+            dims.push_back(hdr_new->matrix_size[2]);
+            dims.push_back(hdr_new->channels);
+            try {
+                m4->getObjectPtr()->create(&dims);
+            } catch (std::bad_alloc& err) {
+                GADGET_DEBUG1("Failed to create new hoNDArray\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            // The array is sized from the header, so the Matlab data has to
+            // provide at least that many elements.
+            const size_t count = m4->getObjectPtr()->get_number_of_elements();
+            if (mxGetNumberOfElements(res_data) < count) {
+                GADGET_DEBUG1("Matlab gadget returned less image data than its header describes\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            const float *real_out = static_cast<const float *>(mxGetData(res_data));
+            const float *imag_out = static_cast<const float *>(mxGetImagData(res_data));
+            std::complex<float> *dst = m4->getObjectPtr()->get_data_ptr();
+            for (size_t i = 0; i < count; i++) {
+                dst[i] = std::complex<float>(real_out[i], imag_out ? imag_out[i] : 0.0f);
+            }
+
+            if (this->next()->putq(m3) < 0) {
+                GADGET_DEBUG1("Failed to put Image message on queue\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            break;
+        }
+        default:
+            GADGET_DEBUG1("Matlab gadget returned undefined header type\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // The incoming message was consumed rather than passed on, so release it
+    // here.  On GADGET_FAIL it is left untouched for the caller to handle.
+    m1->release();
+
+    // The mxArrays created above and the copy of Q are destroyed by their guards
+    return GADGET_OK;
+}
+
+
+// TODO: The ImageMatlabGadget is not currently templated
+//      It only works for images of type std::complex<float>
+/**
+ * Pushes one image through the MATLAB gadget object and forwards the images
+ * the MATLAB side queued in response.
+ *
+ * The ImageHeader is put on the workspace as a byte array ("hdr_bytes") and
+ * the pixels as a 4D single precision complex array ("data") sized
+ * matrix_size[0] x matrix_size[1] x matrix_size[2] x channels.  After
+ * matgadget.run_process has been called, the queue "Q" is read back; every
+ * entry must be of type 2 (image) and is converted into a new message pair
+ * that is put on the next gadget's queue.
+ *
+ * @param m1 incoming image header; released here on success
+ * @param m2 incoming image data (expected to be chained to m1 - TODO confirm)
+ * @return GADGET_OK on success, GADGET_FAIL on any error
+ */
+int ImageMatlabGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+    // Destroys an mxArray when the enclosing scope is left, so that every
+    // early return below cleans up without repeating the calls.
+    struct MxArrayGuard {
+        mxArray *array;
+        explicit MxArrayGuard(mxArray *a) : array(a) {}
+        ~MxArrayGuard() { if (array) mxDestroyArray(array); }
+    private:
+        MxArrayGuard(const MxArrayGuard&);
+        MxArrayGuard& operator=(const MxArrayGuard&);
+    };
+
+    ISMRMRD::ImageHeader *img = m1->getObjectPtr();
+
+    std::complex<float> *raw_data = m2->getObjectPtr()->get_data_ptr();
+    if (!raw_data) {
+        GADGET_DEBUG1("Broken raw_data pointer\n");
+        return GADGET_FAIL;
+    }
+
+    // Create a mxArray of bytes for the ISMRMRD::ImageHeader
+    // NOTE(review): the bytes are copied before the zero sizes are replaced
+    // below, so Matlab sees the header as it arrived.
+    mwSize img_hdr_dims[2] = {sizeof(ISMRMRD::ImageHeader), 1};
+    mxArray *img_hdr_bytes = mxCreateNumericArray(2, img_hdr_dims, mxUINT8_CLASS, mxREAL);
+    MxArrayGuard img_hdr_guard(img_hdr_bytes);
+    if (!img_hdr_bytes) {
+        GADGET_DEBUG1("Failed to allocate mxArray for the image header\n");
+        return GADGET_FAIL;
+    }
+    memcpy(mxGetData(img_hdr_bytes), img, sizeof(ISMRMRD::ImageHeader));
+
+    // A size of zero is treated as a singleton dimension
+    if (img->matrix_size[0] == 0) img->matrix_size[0] = 1;
+    if (img->matrix_size[1] == 0) img->matrix_size[1] = 1;
+    if (img->matrix_size[2] == 0) img->matrix_size[2] = 1;
+    if (img->channels == 0) img->channels = 1;
+
+    // Create a mxArray for the Image data
+    mwSize ndim = 4;
+    mwSize dims[4] = {img->matrix_size[0], img->matrix_size[1], img->matrix_size[2], img->channels};
+    mxArray *img_data = mxCreateNumericArray(ndim, dims, mxSINGLE_CLASS, mxCOMPLEX);
+    MxArrayGuard img_data_guard(img_data);
+    if (!img_data) {
+        GADGET_DEBUG1("Failed to allocate mxArray for the image data\n");
+        return GADGET_FAIL;
+    }
+
+    // The mxArray is sized from the header; copying more elements than that
+    // would write past the end of its buffers.
+    const size_t num_elements = m2->getObjectPtr()->get_number_of_elements();
+    if (num_elements > mxGetNumberOfElements(img_data)) {
+        GADGET_DEBUG1("Image data holds more elements than its header describes\n");
+        return GADGET_FAIL;
+    }
+
+    float *real_data = static_cast<float *>(mxGetData(img_data));
+    float *imag_data = static_cast<float *>(mxGetImagData(img_data));
+    for (size_t i = 0; i < num_elements; i++) {
+        real_data[i] = raw_data[i].real();
+        imag_data[i] = raw_data[i].imag();
+    }
+
+    if (engPutVariable(engine_, "hdr_bytes", img_hdr_bytes) != 0 ||
+        engPutVariable(engine_, "data", img_data) != 0) {
+        GADGET_DEBUG1("Failed to put the image on the Matlab workspace\n");
+        return GADGET_FAIL;
+    }
+
+    // A failed command would leave a stale Q from an earlier call on the
+    // workspace, so do not go on to read it.
+    // NOTE(review): outside debug mode the command is only sent to the command
+    // server here; confirm that it has been evaluated before Q is read.
+    std::string cmd = "Q = matgadget.run_process(2, hdr_bytes, data); matgadget.emptyQ();";
+    if (send_matlab_command(cmd) != GADGET_OK) {
+        GADGET_DEBUG1("Failed to send matlab command.\n");
+        return GADGET_FAIL;
+    }
+
+    // Get the gadget's queue; engGetVariable hands back a copy that we own
+    mxArray *Q = engGetVariable(engine_, "Q");
+    if (Q == NULL) {
+        GADGET_DEBUG1("Failed to get the Queue from matgadget\n");
+        return GADGET_FAIL;
+    }
+    MxArrayGuard q_guard(Q);
+    const size_t qlen = mxGetNumberOfElements(Q);
+
+    // Loop over the elements of the Q, reading one entry at a time
+    // to get a structure with type, headerbytes, and data
+    for (mwIndex idx = 0; idx < qlen; idx++) {
+        // These point into Q and must not be destroyed separately
+        mxArray *res_type = mxGetField(Q, idx, "type");
+        mxArray *res_hdr  = mxGetField(Q, idx, "bytes");
+        mxArray *res_data = mxGetField(Q, idx, "data");
+        if (!res_type || !res_hdr || !res_data || mxIsEmpty(res_type)) {
+            GADGET_DEBUG1("Matlab gadget returned a malformed queue entry\n");
+            return GADGET_FAIL;
+        }
+        if (!mxIsSingle(res_data)) {
+            GADGET_DEBUG1("Matlab gadget returned data that is not single precision\n");
+            return GADGET_FAIL;
+        }
+
+        // determine the type of the object on the queue (i.e. acquisition or image)
+        // although, since this is an Image gadget, it better be an image
+        // assumes the type field is stored as a 32 bit integer - TODO confirm
+        int tp = *((int *)mxGetData(res_type));
+        switch (tp) {
+        case 2:     // ImageHeader
+        {
+            if (mxGetNumberOfElements(res_hdr)*mxGetElementSize(res_hdr) < sizeof(ISMRMRD::ImageHeader)) {
+                GADGET_DEBUG1("Matlab gadget returned a truncated ImageHeader\n");
+                return GADGET_FAIL;
+            }
+
+            // grab the modified ImageHeader and convert it back to C++
+            GadgetContainerMessage<ISMRMRD::ImageHeader>* m3 =
+                    new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+            ISMRMRD::ImageHeader *hdr_new = m3->getObjectPtr();
+            memcpy(hdr_new, mxGetData(res_hdr), sizeof(ISMRMRD::ImageHeader));
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m4 =
+                    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            // m4 is chained to m3, so releasing m3 below also releases m4
+            m3->cont(m4);
+            std::vector<size_t> out_dims;
+            out_dims.push_back(hdr_new->matrix_size[0]);
+            out_dims.push_back(hdr_new->matrix_size[1]);
+            out_dims.push_back(hdr_new->matrix_size[2]);
+            out_dims.push_back(hdr_new->channels);
+            try {
+                m4->getObjectPtr()->create(&out_dims);
+            } catch (std::bad_alloc& err) {
+                GADGET_DEBUG1("Failed to create new hoNDArray\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            // The array is sized from the header, so the Matlab data has to
+            // provide at least that many elements.
+            const size_t count = m4->getObjectPtr()->get_number_of_elements();
+            if (mxGetNumberOfElements(res_data) < count) {
+                GADGET_DEBUG1("Matlab gadget returned less image data than its header describes\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            // Matlab drops the imaginary plane of purely real arrays, in
+            // which case mxGetImagData returns NULL.
+            const float *real_out = static_cast<const float *>(mxGetData(res_data));
+            const float *imag_out = static_cast<const float *>(mxGetImagData(res_data));
+            std::complex<float> *dst = m4->getObjectPtr()->get_data_ptr();
+            for (size_t i = 0; i < count; i++) {
+                dst[i] = std::complex<float>(real_out[i], imag_out ? imag_out[i] : 0.0f);
+            }
+
+            if (this->next()->putq(m3) < 0) {
+                GADGET_DEBUG1("Failed to put Image message on queue\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            break;
+        }
+        default:
+            GADGET_DEBUG1("Matlab gadget returned undefined header type\n");
+            return GADGET_FAIL;
+        }
+    }
+
+    // The incoming message was consumed rather than passed on, so release it
+    // here.  On GADGET_FAIL it is left untouched for the caller to handle.
+    m1->release();
+
+    // The mxArrays created above and the copy of Q are destroyed by their guards
+    return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(AcquisitionMatlabGadget)
+GADGET_FACTORY_DECLARE(ImageMatlabGadget)
+}
diff --git a/gadgets/matlab/MatlabGadget.h b/gadgets/matlab/MatlabGadget.h
new file mode 100644
index 0000000..fb461d7
--- /dev/null
+++ b/gadgets/matlab/MatlabGadget.h
@@ -0,0 +1,196 @@
+#pragma once
+
+#include "gadgetron_matlab_export.h"
+#include "Gadget.h"
+#include "Gadgetron.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "engine.h"     // Matlab Engine header
+
+#include "ace/Synch.h"  // For the MatlabCommandServer
+#include "ace/SOCK_Connector.h"
+#include "ace/INET_Addr.h"
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <complex>
+#include <boost/lexical_cast.hpp>
+
+// TODO:
+//Make the port option work so that we can have multiple matlabs running, each with its own command server.
+//Create a debug option to use evalstring and get back the matlab output on every function call.
+//Finish the image stuff
+//Is there a better way to kill the command server?
+//Test on windows
+
+
+namespace Gadgetron{
+
+/**
+ * Common base for gadgets whose processing is delegated to a MATLAB object.
+ *
+ * The constructor starts a MATLAB engine session and sets up its paths.
+ * process_config() starts the Java MatlabCommandServer inside that session,
+ * instantiates the user's MATLAB class as `matgadget` and initialises it with
+ * the XML header.  Subclasses implement process() and talk to `matgadget`
+ * through engine_ and send_matlab_command().
+ *
+ * Gadget properties read in process_config(): debug_mode, matlab_path,
+ * matlab_classname and matlab_port.
+ *
+ * @tparam T header type of the messages handled by the gadget
+ */
+template <class T> class MatlabGadget :
+    public Gadget2<T, hoNDArray< std::complex<float> > >
+{
+public:
+    MatlabGadget(): Gadget2<T, hoNDArray< std::complex<float> > >()
+        , command_server_port_(0)
+        , debug_mode_(0)
+        , engine_(NULL)
+    {
+        // Open the Matlab Engine on the current host
+        GADGET_DEBUG1("Starting MATLAB engine\n");
+        engine_ = engOpen("matlab -nosplash -nodesktop");
+        if (!engine_) {
+            // The failure is reported from process_config(), which refuses to
+            // continue without an engine.
+            GADGET_DEBUG1("Can't start MATLAB engine\n");
+            return;
+        }
+
+        // Prepare a buffer for collecting Matlab's output
+        char matlab_buffer_[2049] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 2048);
+
+        // Add the necessary paths to the matlab environment
+        // TODO: the ISMRMRD Java bindings should be in user's Matlab path NOT HERE
+        // Java matlab command server
+        engEvalString(engine_, "javaaddpath(fullfile(getenv('GADGETRON_HOME'), 'matlab'));");
+        // Gadgetron matlab scripts
+        engEvalString(engine_, "addpath(fullfile(getenv('GADGETRON_HOME'), 'matlab'));");
+        // ISMRMRD matlab library
+        engEvalString(engine_, "addpath(fullfile(getenv('ISMRMRD_HOME'), 'matlab'));");
+
+        // Call the ISMRMRD utility function for setting the Java path for the XML header
+        engEvalString(engine_, "ismrmrd.util.includejar;");
+
+        GADGET_DEBUG2("%s", matlab_buffer_);
+
+        // The buffer lives on this stack frame; detach it before returning
+        engOutputBuffer(engine_, NULL, 0);
+    }
+
+    ~MatlabGadget()
+    {
+        // Nothing to shut down if the engine never started
+        if (!engine_) {
+            return;
+        }
+
+        char matlab_buffer_[2049] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 2048);
+        // Stop the Java Command server:
+        // send the stop signal to the command server and
+        // wait a bit for it to shut down cleanly.
+        GADGET_DEBUG1("Closing down the Matlab Command Server\n");
+        engEvalString(engine_, "M.notifyEnd(); pause(1);");
+        engEvalString(engine_, "clear java;");
+        GADGET_DEBUG2("%s", matlab_buffer_);
+        engOutputBuffer(engine_, NULL, 0);
+
+        // Close the Matlab engine
+        GADGET_DEBUG1("Closing down Matlab\n");
+        engClose(engine_);
+        engine_ = NULL;
+    }
+
+protected:
+
+    /**
+     * Reads the gadget properties, starts the command server in Matlab and
+     * creates and configures the Matlab gadget object.
+     *
+     * @param mb message block holding the XML header as a C string
+     *           (assumed to be NUL terminated - TODO confirm)
+     * @return GADGET_OK on success, GADGET_FAIL otherwise
+     */
+    int process_config(ACE_Message_Block* mb)
+    {
+        if (!engine_) {
+            GADGET_DEBUG1("MATLAB engine is not running\n");
+            return GADGET_FAIL;
+        }
+
+        std::string cmd;
+
+        debug_mode_  = this->get_int_value("debug_mode");
+        path_        = this->get_string_value("matlab_path");
+        classname_   = this->get_string_value("matlab_classname");
+        command_server_port_ = this->get_int_value("matlab_port");
+
+        GADGET_DEBUG2("MATLAB Class Name : %s\n", classname_.get()->c_str());
+
+        char matlab_buffer_[2049] = "\0";
+        engOutputBuffer(engine_, matlab_buffer_, 2048);
+
+        // Instantiate the Java Command server
+        // TODO: we HAVE to pause in Matlab to allow the java command server thread to start
+        cmd = "M = MatlabCommandServer(" + boost::lexical_cast<std::string>(command_server_port_) +
+                "); M.start(); pause(1);";
+        const int eval_status = engEvalString(engine_, cmd.c_str());
+        GADGET_DEBUG2("%s", matlab_buffer_);
+        // The buffer lives on this stack frame; detach it before it goes away
+        engOutputBuffer(engine_, NULL, 0);
+        if (eval_status != 0) {
+            GADGET_DEBUG1("MATLAB engine session is no longer running\n");
+            return GADGET_FAIL;
+        }
+
+        // add user specified path for this gadget
+        // NOTE(review): the value is pasted into addpath() unquoted, so the
+        // property has to be a Matlab expression (e.g. a quoted string).
+        if (!path_->empty()) {
+            cmd = "addpath(" + *path_ + ");";
+            if (send_matlab_command(cmd) != GADGET_OK) {
+                GADGET_DEBUG1("Failed to add the user specified matlab path\n");
+            }
+        }
+
+        // Put the XML Header into the matlab workspace
+        std::string xmlConfig = std::string(mb->rd_ptr());
+        mxArray *xmlstring = mxCreateString(xmlConfig.c_str());
+        if (!xmlstring) {
+            GADGET_DEBUG1("Failed to allocate the XML header string\n");
+            return GADGET_FAIL;
+        }
+        if (engPutVariable(engine_, "xmlstring", xmlstring) != 0) {
+            GADGET_DEBUG1("Failed to put the XML header on the Matlab workspace\n");
+            mxDestroyArray(xmlstring);
+            return GADGET_FAIL;
+        }
+
+        // Instantiate the Matlab gadget object from the user specified class
+        // Call matlab gadget's init method with the XML Header
+        // and the user defined config method
+        cmd = "matgadget = " + *classname_ + "();";
+        cmd += "matgadget.init(xmlstring); matgadget.config();";
+        const bool sent = (send_matlab_command(cmd) == GADGET_OK);
+
+        // Destroy the string on both the success and the failure path
+        mxDestroyArray(xmlstring);
+
+        if (!sent) {
+            GADGET_DEBUG1("Failed to send matlab command.\n");
+            return GADGET_FAIL;
+        }
+
+        return GADGET_OK;
+    }
+
+    /**
+     * Runs a command in the Matlab session.
+     *
+     * In debug mode the command is evaluated through the engine and its
+     * console output is logged.  Otherwise the command text is sent over TCP
+     * to the command server on localhost:command_server_port_ and the
+     * connection is closed again.
+     * NOTE(review): in the second mode this returns once the text is sent; it
+     * does not appear to wait for Matlab to finish evaluating - confirm.
+     *
+     * @param command Matlab statement(s) to run
+     * @return GADGET_OK if the command was handed over, GADGET_FAIL otherwise
+     */
+    int send_matlab_command(std::string& command)
+    {
+
+        if (debug_mode_) {
+            if (!engine_) {
+                GADGET_DEBUG1("MATLAB engine is not running\n");
+                return GADGET_FAIL;
+            }
+            char matlab_buffer_[2049] = "\0";
+            engOutputBuffer(engine_, matlab_buffer_, 2048);
+            engEvalString(engine_, command.c_str());
+            GADGET_DEBUG2("%s\n", matlab_buffer_);
+            // The buffer lives on this stack frame; detach it before returning
+            engOutputBuffer(engine_, NULL, 0);
+            return GADGET_OK;
+        }
+        else {
+            ACE_SOCK_Stream client_stream;
+            ACE_INET_Addr remote_addr(command_server_port_, "localhost");
+            ACE_SOCK_Connector connector;
+
+            if (connector.connect(client_stream, remote_addr) == -1) {
+                GADGET_DEBUG1("Connection failed\n");
+                return GADGET_FAIL;
+            }
+
+            ACE_Time_Value timeout(10);
+            if (client_stream.send_n(command.c_str(), command.size(), &timeout) == -1) {
+                GADGET_DEBUG1("Error in send_n\n");
+                client_stream.close();
+                return GADGET_FAIL;
+            }
+
+            if (client_stream.close () == -1){
+                GADGET_DEBUG1("Error in close\n");
+                return GADGET_FAIL;
+            }
+            return GADGET_OK;
+        }
+    }
+
+
+    boost::shared_ptr<std::string> path_;       // value of the matlab_path property
+    boost::shared_ptr<std::string> classname_;  // value of the matlab_classname property
+    int command_server_port_;                   // value of the matlab_port property
+    int debug_mode_;                            // value of the debug_mode property
+
+    Engine *engine_;                            // Matlab engine session, NULL if it failed to start
+};
+
+
+
+// Matlab gadget for acquisitions: MatlabGadget specialised on
+// ISMRMRD::AcquisitionHeader.  process() is defined in MatlabGadget.cpp; it
+// hands each acquisition to the Matlab object and forwards the acquisitions
+// and/or images that the Matlab object queued in return.
+class EXPORTGADGETSMATLAB AcquisitionMatlabGadget :
+    public MatlabGadget<ISMRMRD::AcquisitionHeader>
+{
+    public:
+        GADGET_DECLARE(AcquisitionMatlabGadget);
+
+        // m1: acquisition header, m2: complex single precision samples.
+        // Returns GADGET_OK or GADGET_FAIL.
+        int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};
+
+// Matlab gadget for images: MatlabGadget specialised on ISMRMRD::ImageHeader.
+// process() is defined in MatlabGadget.cpp; it hands each image to the Matlab
+// object and forwards the images that the Matlab object queued in return.
+// Only complex single precision image data is handled.
+class EXPORTGADGETSMATLAB ImageMatlabGadget :
+    public MatlabGadget<ISMRMRD::ImageHeader>
+{
+    public:
+        GADGET_DECLARE(ImageMatlabGadget);
+
+        // m1: image header, m2: complex single precision pixel data.
+        // Returns GADGET_OK or GADGET_FAIL.
+        int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+                GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};
+}
diff --git a/gadgets/matlab/accumulate_and_recon.m b/gadgets/matlab/accumulate_and_recon.m
new file mode 100644
index 0000000..50a6e7c
--- /dev/null
+++ b/gadgets/matlab/accumulate_and_recon.m
@@ -0,0 +1,88 @@
+classdef accumulate_and_recon < handle & BaseGadget
+    % ACCUMULATE_AND_RECON  Example gadget: buffer k-space lines and
+    % reconstruct each slice with an inverse FFT once its last line arrives.
+    %
+    % config  sizes the accumulation buffer from the XML header.
+    % process stores one acquisition per call and, on ACQ_LAST_IN_SLICE,
+    %         puts the reconstructed image for that slice on the queue.
+
+    properties
+
+        image_num;      % image index written into outgoing image headers
+        series_num;     % series index written into outgoing image headers
+        center_line;    % centre of kspace_encode_step_1 from the XML header
+        accumulation;   % k-space buffer, [nx ny nz nslices nchannels]
+
+    end
+
+    methods
+
+        function g = config(g)
+            fprintf('The resonance frequency is %d\n', g.xml.getExperimentalConditions().getH1ResonanceFrequencyHz());
+            nx = g.xml.getEncoding().get(0).getEncodedSpace().getMatrixSize().getX();
+            ny = g.xml.getEncoding().get(0).getEncodedSpace().getMatrixSize().getY();
+            % for 2D sequences the number of getZ breaks
+            try
+                nz = g.xml.getEncoding().get(0).getEncodedSpace().getMatrixSize().getZ();
+            catch
+                nz = 1;
+            end
+            % the number of receiver channels is optional
+            try
+                % this is the only cast from java.lang.Integer that works in Matlab
+                nc = double(g.xml.getAcquisitionSystemInformation().getReceiverChannels());
+            catch
+                nc = 1;
+            end
+            % the number of slices is optional
+            try
+                ns = g.xml.getEncoding().get(0).getEncodingLimits().getSlice().getMaximum() + 1;
+            catch
+                ns = 1;
+            end
+
+            g.center_line = g.xml.getEncoding().get(0).getEncodingLimits().getKspaceEncodingStep1().getCenter();
+            g.accumulation = zeros(nx, ny, nz, ns, nc);
+            g.image_num = 0;   % todo this needs to be static or global...
+            g.series_num = 0;  % todo this needs to be static or global...
+        end
+
+        function g = process(g, head, data)
+            % stuff the line
+            line_offset = floor(size(g.accumulation,2)/2) - g.center_line;
+            kyind = head.idx.kspace_encode_step_1 + line_offset + 1;
+            kzind = head.idx.kspace_encode_step_2 + 1;
+            slind = head.idx.slice + 1;
+            %fprintf('  offset = %d, center = %d, index = %d\n', line_offset, g.center_line, kyind);
+
+            g.accumulation(:, kyind, kzind, slind, :) = data;
+
+            % At the end of the acquisition, reconstruct the slice
+            if (head.flagIsSet(head.FLAGS.ACQ_LAST_IN_SLICE))
+                % size(x,k) is used per dimension because size(x) drops
+                % trailing singleton dimensions
+                nx = size(g.accumulation,1);
+                ny = size(g.accumulation,2);
+                nz = size(g.accumulation,3);
+                nc = size(g.accumulation,5);
+
+                img_head = ismrmrd.ImageHeader;
+                img_head.channels = head.active_channels;
+                img_head.slice = head.idx.slice;
+                % set the matrix size
+                % set one element at a time to not break the type (uint16) of matrix_size
+                img_head.matrix_size(1) = nx;
+                img_head.matrix_size(2) = ny;
+                img_head.matrix_size(3) = nz;
+
+                img_head.position = head.position;
+                img_head.read_dir = head.read_dir;
+                img_head.phase_dir = head.phase_dir;
+                img_head.slice_dir = head.slice_dir;
+                img_head.patient_table_position = head.patient_table_position;
+                img_head.acquisition_time_stamp = head.acquisition_time_stamp;
+                img_head.image_index = g.image_num;
+                img_head.image_series_index = g.series_num;
+
+                % Pull out this slice as an explicit [nx ny nz nc] array.
+                % squeeze would also remove a singleton nz and move the
+                % channels into the third dimension.
+                img_data = reshape(g.accumulation(:,:,:,slind,:), [nx ny nz nc]);
+
+                % Inverse FFT along the three spatial dimensions only.
+                % ifftn/fftshift without a dimension argument would also
+                % transform and shift across the channels.
+                for d = 1:3
+                    img_data = fftshift(ifft(ifftshift(img_data, d), [], d), d);
+                end
+
+                % Show the first channel of the first partition for 2 seconds
+                imagesc(abs(img_data(:,:,1,1))); axis image; axis square;
+                pause(2)
+                close()
+
+                g.putQ(img_head, img_data);
+                %fprintf('Put on Queue %d, type = %d\n',length(g.Q),g.Q{1}.type);
+
+            end
+
+        end
+
+    end
+end
diff --git a/gadgets/matlab/gadgetron_matlab_export.h b/gadgets/matlab/gadgetron_matlab_export.h
new file mode 100644
index 0000000..7cf5c26
--- /dev/null
+++ b/gadgets/matlab/gadgetron_matlab_export.h
@@ -0,0 +1,23 @@
+/*
+ * gadgetron_matlab_export.h
+ *
+ *  Created on: Jan 28, 2013
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef GADGETRONMATLAB_EXPORT_H_
+#define GADGETRONMATLAB_EXPORT_H_
+
+
+/* EXPORTGADGETSMATLAB marks classes as exported from (or imported into) the
+ * gadgetron_matlab library.  On WIN32 it expands to dllexport when either
+ * __BUILD_GADGETRON_MATLAB__ or gadgetron_matlab_EXPORTS is defined and to
+ * dllimport otherwise; on other platforms it expands to nothing. */
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_MATLAB__) || defined (gadgetron_matlab_EXPORTS)
+#define EXPORTGADGETSMATLAB __declspec(dllexport)
+#else
+#define EXPORTGADGETSMATLAB __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSMATLAB
+#endif
+
+
+#endif /* GADGETRONMATLAB_EXPORT_H_ */
diff --git a/gadgets/matlab/mask_image.m b/gadgets/matlab/mask_image.m
new file mode 100644
index 0000000..b78b6b5
--- /dev/null
+++ b/gadgets/matlab/mask_image.m
@@ -0,0 +1,27 @@
+classdef mask_image < handle & BaseGadget
+    % MASK_IMAGE  Example image gadget.  Every incoming image is queued
+    % twice: once unchanged, and once with the next series index and one
+    % corner of the image set to zero.
+
+    properties
+    end
+
+    methods
+
+        function g = config(g)
+            % nothing to configure
+        end
+
+        function g = process(g, head, data)
+            % first output: the image exactly as it came in
+            g.putQ(head, data);
+
+            % second output goes into the following series
+            head.image_series_index = head.image_series_index + 1;
+
+            % blank the first half of dimension 1 and of dimension 2
+            rows = 1:size(data,1)/2;
+            cols = 1:size(data,2)/2;
+            data(rows, cols, :) = 0;
+
+            g.putQ(head, data);
+
+        end
+
+    end
+end
diff --git a/gadgets/matlab/matlab.xml b/gadgets/matlab/matlab.xml
new file mode 100644
index 0000000..4f48f5f
--- /dev/null
+++ b/gadgets/matlab/matlab.xml
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <!-- Input: slot 1008 is handled by the acquisition message reader -->
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+
+  <!-- Output: image writers for slots 1004 (CPLX), 1005 (FLOAT) and 1006 (USHORT) -->
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <!-- Matlab gadget properties:
+         debug_mode       non-zero evaluates commands through the engine and logs Matlab's output
+         matlab_path      pasted as-is into addpath(...) when not empty
+         matlab_classname Matlab class instantiated as the gadget object
+         matlab_port      port of the Matlab command server; each Matlab gadget below uses its own -->
+
+  <!-- Acquisition gadget running the Matlab class "scale" -->
+  <gadget>
+    <name>MatlabAcquisition</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>AcquisitionMatlabGadget</classname>
+    <property><name>debug_mode</name><value>0</value></property>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>scale</value></property>
+    <property><name>matlab_port</name><value>3000</value></property>
+  </gadget>
+
+  <!-- Acquisition gadget running the Matlab class "accumulate_and_recon".
+       NOTE(review): this gadget has the same name as the previous one; confirm
+       that nothing looks gadgets up by name. -->
+  <gadget>
+    <name>MatlabAcquisition</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>AcquisitionMatlabGadget</classname>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>accumulate_and_recon</value></property>
+    <property><name>matlab_port</name><value>3001</value></property>
+  </gadget>
+
+  <gadget>
+    <name>CropCombine</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CropAndCombineGadget</classname>
+  </gadget>
+
+  <!-- Image gadget running the Matlab class "mask_image" -->
+  <gadget>
+    <name>MatlabImage</name>
+    <dll>gadgetron_matlab</dll>
+    <classname>ImageMatlabGadget</classname>
+    <property><name>matlab_path</name><value></value></property>
+    <property><name>matlab_classname</name><value>mask_image</value></property>
+    <property><name>matlab_port</name><value>3002</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  <gadget>
+    <name>ImageFinishFLOAT</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetFLOAT</classname>
+  </gadget>
+</gadgetronStreamConfiguration>
diff --git a/gadgets/matlab/scale.m b/gadgets/matlab/scale.m
new file mode 100644
index 0000000..db279f7
--- /dev/null
+++ b/gadgets/matlab/scale.m
@@ -0,0 +1,22 @@
+classdef scale < BaseGadget
+    % SCALE  Example acquisition gadget: multiplies the data of every
+    % acquisition by a constant factor and marks the header with version 99.
+
+    properties
+        factor;   % multiplier applied to the data, set in config
+    end
+
+    methods
+
+        function config(g)
+            g.factor = 2;
+        end
+
+        function process(g, head, data)
+            fprintf('Processing line = %d\n', head.idx.kspace_encode_step_1);
+
+            % outgoing header: a copy of the incoming one with the version changed
+            out_head = head;
+            out_head.version = 99;
+
+            g.putQ(out_head, g.factor * data);
+        end
+
+    end
+end
diff --git a/gadgets/moco/CMakeLists.txt b/gadgets/moco/CMakeLists.txt
new file mode 100644
index 0000000..16b5033
--- /dev/null
+++ b/gadgets/moco/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Build script for the gadgetron_moco library (registration based gadgets).
+# The CPU gadgets need Armadillo newer than 3.819, the GPU gadgets need CUDA.
+
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_MOCO__)
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+# Set to 1 below when the corresponding backend can be built
+set( CPU_REG 0)
+set( GPU_REG 0)
+
+if(ARMADILLO_FOUND)
+  if(ARMADILLO_VERSION_STRING VERSION_GREATER "3.819" )
+
+    set(CPU_REG 1)
+    list(APPEND CPU_GADGETS cpuRegistrationAveragingGadget.cpp)
+    list(APPEND CPU_LIBS cpucore_math cpureg)
+
+  else()
+    # Repeating the if() condition in an elseif() can never match, so the
+    # messages below were never printed; plain else() branches are used instead.
+    MESSAGE("Armadillo of at least version 3.820 not found, not compiling cpu-based registration gadgets")
+  endif()
+else()
+  MESSAGE("Armadillo not found, not compiling cpu-based registration gadgets")
+endif()
+
+if(CUDA_FOUND)
+  set(GPU_REG 1)
+  list(APPEND GPU_GADGETS gpuRegistrationAveragingGadget.cpp gpuRegistrationScatteringGadget.cpp)
+  list(APPEND GPU_LIBS gpucore gpureg ${CUDA_LIBRARIES})
+else()
+  MESSAGE("Cuda not found, not compiling gpu-based registration gadgets")
+endif()
+
+include_directories(
+  ${ACE_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/moco
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+)
+
+if(CPU_REG)
+  include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/cpu
+    ${ARMADILLO_INCLUDE_DIRS}
+    )
+endif(CPU_REG)
+
+if(GPU_REG)
+  include_directories(
+    ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+    ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow/gpu
+    ${CUDA_INCLUDE_DIRS}
+    )
+endif(GPU_REG)
+
+add_library(gadgetron_moco SHARED
+  ${CPU_GADGETS}
+  ${GPU_GADGETS}
+  ${ISMRMRD_XSD_SOURCE}
+  )
+
+target_link_libraries(gadgetron_moco
+  cpucore gadgetron_mricore ${CPU_LIBS} ${GPU_LIBS}
+  ${Boost_LIBRARIES} ${ISMRMRD_LIBRARIES} ${XERCESC_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+install (TARGETS gadgetron_moco DESTINATION lib)
+
+add_subdirectory(config)
diff --git a/gadgets/moco/RegistrationAveragingGadget.h b/gadgets/moco/RegistrationAveragingGadget.h
new file mode 100644
index 0000000..c5d9b23
--- /dev/null
+++ b/gadgets/moco/RegistrationAveragingGadget.h
@@ -0,0 +1,328 @@
+#ifndef RegistrationAveragingGadget_H
+#define RegistrationAveragingGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "complext.h"
+#include "PhysioInterpolationGadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetronTimer.h"
+#include "gadgetron_moco_export.h"
+#include "hoNDArray_fileio.h"
+
+#ifdef USE_CUDA
+#include "cuNDArray_reductions.h"
+#endif // USE_CUDA
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{  
+
+  template<class ARRAY_TYPE, unsigned int D> class opticalFlowSolver;
+  
+  /**
+     This is an abstract gadget class and consequently should not be included in any xml configuration file.
+     "Instatiate" instead the cpuRegistrationAveragingGadget or gpuRegistrationAveragingGadget.
+  */
+  template<class ARRAY_TYPE, unsigned int D> class EXPORTGADGETS_MOCO RegistrationAveragingGadget 
+    : public Gadget2<ISMRMRD::ImageHeader, hoNDArray< typename ARRAY_TYPE::element_type > > // se note below
+  {
+    //
+    // We use hoNDArray to interface the gadget chain, even if ARRAY_TYPE is a cuNDArray
+    // Instead of hard coding the interface to use single precision (float), 
+    // "typename ARRAY_TYPE::element_type" could in principle denote a double precison type (double) as well.
+    // Registration of complex images is however not supported currently...
+    //
+    
+  public:
+    
+    RegistrationAveragingGadget() {
+
+      // No solver yet; process() asks setup_solver() for one when the first image arrives.
+      this->of_solver_ = 0;
+
+      // This is a property queried from the PhysioInterpolationGadget in process_config()
+      this->number_of_phases_ = 0;
+
+      // Register a default value for every parameter that process_config() reads back.
+      static const char* const parameter_defaults[][2] = {
+        { "alpha",                      "0.05"  },
+        { "beta",                       "1.0"   },
+        { "limit",                      "0.01"  },
+        { "num_multiresolution_levels", "3"     },
+        { "max_iterations_per_level",   "500"   },
+        { "output_convergence",         "false" }
+      };
+
+      const size_t num_defaults = sizeof(parameter_defaults)/sizeof(parameter_defaults[0]);
+
+      for( size_t idx=0; idx<num_defaults; idx++ ){
+        this->set_parameter(parameter_defaults[idx][0], parameter_defaults[idx][1]);
+      }
+    }
+
+    virtual ~RegistrationAveragingGadget() {
+      // The gadget owns the solver. Deleting a null pointer is a no-op,
+      // so no explicit check is needed when no solver was ever set up.
+      delete this->of_solver_;
+    }
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block *mb)
+    {
+      // Copy the solver settings from the gadget parameters into the member variables.
+      // The configuration message block (mb) itself is not inspected.
+      //
+
+      typedef typename ARRAY_TYPE::element_type elem_t;
+
+      this->alpha_ = static_cast<elem_t>( this->get_double_value("alpha") );
+      this->beta_  = static_cast<elem_t>( this->get_double_value("beta") );
+      this->limit_ = static_cast<elem_t>( this->get_double_value("limit") );
+
+      const std::string convergence_key("output_convergence");
+      const std::string levels_key("num_multiresolution_levels");
+      const std::string iterations_key("max_iterations_per_level");
+
+      this->output_convergence_ = this->get_bool_value(convergence_key.c_str());
+      this->num_multires_levels_ = this->get_int_value(levels_key.c_str());
+      this->max_iterations_per_level_ = this->get_int_value(iterations_key.c_str());
+
+      // The number of incoming phases is obtained from a gadget named
+      // "PhysioInterpolationGadget", which must therefore be present in the stream.
+      //
+
+      GadgetStreamController * const controller = this->get_controller();
+
+      if( !controller ){
+        GADGET_DEBUG1("Failed to get controller\n");
+        return GADGET_FAIL;
+      }
+
+      const std::string physio_name("PhysioInterpolationGadget");
+
+      PhysioInterpolationGadget * const physio =
+        dynamic_cast<PhysioInterpolationGadget*>( controller->find_gadget(physio_name) );
+
+      if( !physio ){
+        GADGET_DEBUG1("Could not find (or cast) PhysioInterpolationGadget in gadget stream\n");
+        return GADGET_FAIL;
+      }
+
+      this->number_of_phases_ = physio->get_number_of_phases();
+
+      GADGET_DEBUG2("Configured for %d phases\n", this->number_of_phases_);
+      return GADGET_OK;
+    }
+    
+    /**
+       Buffers the incoming images, one queue per phase, until close() is invoked.
+       No registration is performed here.
+
+       @param m1 image header; its 'phase' field selects the queue the message is put on
+       @param m2 image data; its dimensions (taken from the first buffered image) define the image size used in close()
+       @return GADGET_OK when the image was buffered, the result of putq() for images passed straight along,
+               and GADGET_FAIL if the phase index is out of range, the solver could not be set up, or enqueueing failed
+    */
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1,
+                         GadgetContainerMessage< hoNDArray< typename ARRAY_TYPE::element_type > > *m2 )
+    {
+
+      //GADGET_DEBUG2("\nSERIES: %d, PHASE: %d", m1->getObjectPtr()->image_series_index, m1->getObjectPtr()->phase );
+
+      // Images with a series index below 9 are not treated as part of the sorted phases.
+      // Just pass those images along...
+      // NOTE(review): the original comment spoke of "series 0" while the test is "< 9";
+      // confirm the intended threshold against the series numbering used upstream.
+      //
+
+      if( m1->getObjectPtr()->image_series_index < 9 ){
+        return this->next()->putq(m1);
+      }
+
+      // There is exactly one queue per phase. Reject any phase index without a queue
+      // (this includes the case of zero configured phases) instead of indexing out of bounds.
+      //
+
+      const unsigned int phase = m1->getObjectPtr()->phase;
+
+      if( phase >= this->number_of_phases_ ){
+        GADGET_DEBUG2("Image phase index %d is out of range (configured for %d phases)\n", phase, this->number_of_phases_);
+        return GADGET_FAIL;
+      }
+
+      // At first pass allocate the image buffer array.
+      //
+
+      if( this->phase_images_.get() == 0x0 ){
+
+        this->image_dimensions_ = *m2->getObjectPtr()->get_dimensions();
+        this->phase_images_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >
+          (new ACE_Message_Queue<ACE_MT_SYNCH>[this->number_of_phases_]);
+
+        // Same value for the high and the low water mark of every queue
+        size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*this->number_of_phases_;
+
+        for( unsigned int i=0; i<this->number_of_phases_; i++ ){
+          this->phase_images_[i].high_water_mark(bsize);
+          this->phase_images_[i].low_water_mark(bsize);
+        }
+
+        // Setup the optical flow solver
+        //
+
+        if( this->setup_solver() != GADGET_OK ){
+          GADGET_DEBUG1("Failed to set up optical flow solver\n");
+          // Drop the queues again: they double as the "first pass done" marker, and
+          // keeping them would make later calls (and close()) proceed without a solver.
+          this->phase_images_.reset();
+          return GADGET_FAIL;
+        }
+      }
+
+      //
+      // Put the incoming images on the appropriate queue (based on the phase index).
+      //
+
+      if( this->phase_images_[phase].enqueue_tail(m1) < 0 ) {
+        GADGET_DEBUG1("Failed to add image to buffer\n");
+        return GADGET_FAIL;
+      }
+
+      return GADGET_OK;
+    }
+
+    // All the work is done here in the close method
+    //
+    // For every phase queue filled by process(): the first queued image becomes the fixed image,
+    // all remaining images are stacked into one moving image, the moving image is registered to 
+    // the fixed image and deformed accordingly, and the average of the fixed and the deformed images
+    // is passed downstream using the header of the first image.
+    // If no image was ever buffered (phase_images_ not allocated) nothing but Gadget::close() is done.
+    //
+    // Every path, including the failure paths, returns the result of Gadget::close(flags);
+    // failures are only reported through the debug log.
+    //
+
+    virtual int close(unsigned long flags)
+    {
+      if( this->phase_images_.get() ){
+      
+        GADGET_DEBUG1("RegistrationAveragingGadget::close (performing registration and averaging images)\n");
+      
+        // Make sure we have the same number of images on all phase queues
+        // (It doesn't really matter, but if not the case something probably went wrong upstream)
+        //
+
+        unsigned int num_images = this->phase_images_[0].message_count();
+
+        GADGET_DEBUG2("Number of images for phase 0: %d", num_images );
+        
+        for( unsigned int phase = 0; phase< this->number_of_phases_; phase++ ){
+
+          unsigned int num_images_phase = this->phase_images_[phase].message_count();
+          GADGET_DEBUG2("Number of images for phase %d: %d", phase, num_images_phase );
+
+          if( num_images != num_images_phase ){
+            GADGET_DEBUG1("Failed to set up registration, a different number of images received for each phase\n");
+            return Gadget::close(flags);
+          }
+        }
+      
+        if( num_images == 0 ){
+          GADGET_DEBUG1("No images to register\n");
+          return Gadget::close(flags);
+        }
+
+        for( unsigned int phase=0; phase < this->number_of_phases_; phase++ ){
+	
+          // The moving image holds 'num_images-1' frames, each of the size of the first two image dimensions
+          unsigned int num_image_elements = this->image_dimensions_[0]*image_dimensions_[1];
+          std::vector<size_t> moving_dims = this->image_dimensions_;
+          moving_dims.push_back(num_images-1);
+	
+          // Assigned when the first image (image == 0) of the phase is dequeued below
+          GadgetContainerMessage<ISMRMRD::ImageHeader> *header;
+
+          ARRAY_TYPE fixed_image;
+          ARRAY_TYPE moving_image(&moving_dims);
+	
+          for( unsigned int image=0; image<num_images; image++ ){
+	  
+            ACE_Message_Block *mbq;
+	  
+            if( this->phase_images_[phase].dequeue_head(mbq) < 0 ) {
+              GADGET_DEBUG1("Image header dequeue failed\n");
+              return Gadget::close(flags);
+            }
+	  
+            GadgetContainerMessage<ISMRMRD::ImageHeader> *m1 = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+	  
+            if( m1 == 0x0 ) {
+              GADGET_DEBUG1("Unexpected image type on queue\n");
+              return Gadget::close(flags);
+            }
+	  
+            GadgetContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> > *m2 = 
+              AsContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> >(m1->cont());
+	  
+            if( m2 == 0x0 ) {
+              GADGET_DEBUG1("Unexpected continuation on queue\n");
+              m1->release();
+              return Gadget::close(flags);
+            }
+	  
+            if( image == 0 ){
+
+              // Setup the fixed image.
+              // If ARRAY_TYPE is an cuNDArray the following assignment uploads the array to the device,
+              // for an 'hoNDArray' it merely copies the array.
+              fixed_image = *m2->getObjectPtr();
+
+              // We are going to pass on the averaged image using this header
+              header = m1; 
+
+              // The continuation will be a new array (set after registration).
+              // No registration is however performed if we received only one image. 
+              // In the latter case keep the current continuation.
+              if( num_images > 1 ){	      
+                m1->cont(0x0); 
+                m2->release();
+              }
+            }
+            else{
+
+              // Assign this image as the 'image-1'th frame in the moving image
+              // (tmp_moving is constructed on top of the memory of that frame)
+              ARRAY_TYPE tmp_moving(&image_dimensions_, moving_image.get_data_ptr()+(image-1)*num_image_elements);
+              tmp_moving = *m2->getObjectPtr(); // Copy as for the fixed image
+              // Only the header of the first image is passed on, so this message is no longer needed
+              m1->release();	    
+            }
+          }
+	
+          if( num_images > 1 ){
+	  
+            // Perform registration for the current phase
+            //
+	  
+            boost::shared_ptr<ARRAY_TYPE> deformations;
+            {
+              GadgetronTimer timer("Running registration");
+              deformations = this->of_solver_->solve( &fixed_image, &moving_image );
+            }
+
+            // Deform moving images based on the registration
+            //
+	  
+            boost::shared_ptr<ARRAY_TYPE> deformed_moving;
+            {
+              GadgetronTimer timer("Applying deformation");
+              deformed_moving = this->of_solver_->deform( &moving_image, deformations );
+            }
+	  
+            /*{
+            // The debug code below only compiles for cuNDArrays.
+            // To use (temporarily) comment out
+            // list(APPEND CPU_GADGETS cpuRegistrationAveragingGadget.cpp)
+            // in the CMakeList.txt
+            //
+            char filename[256];
+            sprintf((char*)filename, "fixed_%d.real", phase);
+            write_nd_array<float>( fixed_image.to_host().get(), filename );
+            sprintf((char*)filename, "moving_%d.real", phase);
+            write_nd_array<float>( moving_image.to_host().get(), filename );
+            sprintf((char*)filename, "deformed_moving_%d.real", phase);
+            write_nd_array<float>( deformed_moving->to_host().get(), filename );
+            sprintf((char*)filename, "deformation_%d.real", phase);
+            write_nd_array<float>( deformations->to_host().get(), filename );
+            } */
+
+	 
+            // Accumulate the deformed moving images (into one image) and add this image to the fixed image. 
+            // Then divide by the number of images to get the average.
+            // (A deformed image that is not three-dimensional is added as is, without summation.)
+            //	  
+	  
+            fixed_image += ((deformed_moving->get_number_of_dimensions() == 3) ? *sum(deformed_moving.get(), 2) : *deformed_moving);
+            fixed_image /= ((typename ARRAY_TYPE::element_type)num_images);
+	  
+            // Pass along averaged image
+            //
+	  
+            if( set_continuation( header, &fixed_image ) < 0 ) {
+              GADGET_DEBUG1("Failed to set continuation\n");
+              header->release();
+              return Gadget::close(flags);
+            }
+          }
+
+          // With a single image per phase the header still carries its original continuation
+          if( this->next()->putq(header) < 0 ) {
+            GADGET_DEBUG1("Failed to put registrered image on queue\n");
+            header->release();
+            return Gadget::close(flags);
+          }
+        }
+      }
+    
+      return Gadget::close(flags);
+    }
+
+    // Called once from process(), when the first image to be buffered arrives.
+    // Must return GADGET_OK on success. close() dereferences of_solver_, 
+    // so implementations are expected to have assigned it here.
+    virtual int setup_solver() = 0;
+
+    // Called from close() to attach the averaged image (continuation) to the header message (m1)
+    // before the header is passed downstream. A negative return value is treated as a failure.
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, ARRAY_TYPE *continuation ) = 0;
+
+  protected:
+    // Optical flow solver; set to 0x0 in the constructor and deleted in the destructor
+    opticalFlowSolver<ARRAY_TYPE,D> *of_solver_;
+
+    // The values below are read from the gadget parameters of (almost) the same name in process_config().
+    // They are not used further in this class.
+    typename ARRAY_TYPE::element_type alpha_;
+    typename ARRAY_TYPE::element_type beta_;
+    typename ARRAY_TYPE::element_type limit_;
+    bool output_convergence_;
+    unsigned int num_multires_levels_;      // parameter "num_multiresolution_levels"
+    unsigned int max_iterations_per_level_;
+
+  private:
+    // One message queue per phase, allocated in process() and emptied in close()
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > phase_images_;
+    // Dimensions of the first buffered image
+    std::vector<size_t> image_dimensions_;
+    // Obtained from the PhysioInterpolationGadget in process_config()
+    unsigned short number_of_phases_;    
+  };
+}
+
+#endif //RegistrationAveragingGadget_H
diff --git a/gadgets/moco/RegistrationScatteringGadget.h b/gadgets/moco/RegistrationScatteringGadget.h
new file mode 100644
index 0000000..6dfc2a5
--- /dev/null
+++ b/gadgets/moco/RegistrationScatteringGadget.h
@@ -0,0 +1,375 @@
+#ifndef RegistrationScatteringGadget_H
+#define RegistrationScatteringGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "complext.h"
+#include "PhysioInterpolationGadget.h"
+#include "GadgetStreamController.h"
+#include "GadgetronTimer.h"
+#include "gadgetron_moco_export.h"
+#include "hoNDArray_fileio.h"
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{  
+
+  template<class ARRAY_TYPE, unsigned int D> class opticalFlowSolver;
+  
+  /**
+     This is an abstract gadget class and consequently should not be included in any xml configuration file.
+     Use instead the gpuRegistrationScatteringGadget.
+  */
+  template<class ARRAY_TYPE, unsigned int D> class EXPORTGADGETS_MOCO RegistrationScatteringGadget 
+    : public Gadget2<ISMRMRD::ImageHeader, hoNDArray< typename ARRAY_TYPE::element_type > > // se note below
+  {
+    //
+    // We use hoNDArray to interface the gadget chain, even if ARRAY_TYPE is a cuNDArray
+    // Instead of hard coding the interface to use single precision (float), 
+    // "typename ARRAY_TYPE::element_type" could in principle denote a double precison type (double) as well.
+    // Registration of complex images is however not supported currently.
+    //
+    
+  public:
+    
+    RegistrationScatteringGadget() {
+
+      // No solver yet; process() asks setup_solver() for one when the first image arrives.
+      this->of_solver_ = 0;
+
+      // This is a property queried from the PhysioInterpolationGadget in process_config()
+      this->number_of_phases_ = 0;
+
+      // Register a default value for every parameter that process_config() reads back.
+      static const char* const parameter_defaults[][2] = {
+        { "alpha",                      "0.05"  },
+        { "beta",                       "1.0"   },
+        { "limit",                      "0.01"  },
+        { "num_multiresolution_levels", "3"     },
+        { "max_iterations_per_level",   "500"   },
+        { "output_convergence",         "false" }
+      };
+
+      const size_t num_defaults = sizeof(parameter_defaults)/sizeof(parameter_defaults[0]);
+
+      for( size_t idx=0; idx<num_defaults; idx++ ){
+        this->set_parameter(parameter_defaults[idx][0], parameter_defaults[idx][1]);
+      }
+    }
+
+    virtual ~RegistrationScatteringGadget() {
+      // The gadget owns the solver. Deleting a null pointer is a no-op,
+      // so no explicit check is needed when no solver was ever set up.
+      delete this->of_solver_;
+    }
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block *mb)
+    {
+      // Copy the solver settings from the gadget parameters into the member variables.
+      // The configuration message block (mb) itself is not inspected.
+      //
+
+      typedef typename ARRAY_TYPE::element_type elem_t;
+
+      this->alpha_ = static_cast<elem_t>( this->get_double_value("alpha") );
+      this->beta_  = static_cast<elem_t>( this->get_double_value("beta") );
+      this->limit_ = static_cast<elem_t>( this->get_double_value("limit") );
+
+      const std::string convergence_key("output_convergence");
+      const std::string levels_key("num_multiresolution_levels");
+      const std::string iterations_key("max_iterations_per_level");
+
+      this->output_convergence_ = this->get_bool_value(convergence_key.c_str());
+      this->num_multires_levels_ = this->get_int_value(levels_key.c_str());
+      this->max_iterations_per_level_ = this->get_int_value(iterations_key.c_str());
+
+      // The number of incoming phases is obtained from a gadget named
+      // "PhysioInterpolationGadget", which must therefore be present in the stream.
+      //
+
+      GadgetStreamController * const controller = this->get_controller();
+
+      if( !controller ){
+        GADGET_DEBUG1("Failed to get controller\n");
+        return GADGET_FAIL;
+      }
+
+      const std::string physio_name("PhysioInterpolationGadget");
+
+      PhysioInterpolationGadget * const physio =
+        dynamic_cast<PhysioInterpolationGadget*>( controller->find_gadget(physio_name) );
+
+      if( !physio ){
+        GADGET_DEBUG1("Could not find (or cast) PhysioInterpolationGadget in gadget stream\n");
+        return GADGET_FAIL;
+      }
+
+      this->number_of_phases_ = physio->get_number_of_phases();
+
+      GADGET_DEBUG2("Configured for %d phases\n", this->number_of_phases_);
+      return GADGET_OK;
+    }
+    
+    /**
+       Buffers the incoming images, one queue per phase, until close() is invoked.
+       No registration is performed here.
+
+       @param m1 image header; its 'phase' field selects the queue the message is put on
+       @param m2 image data; its dimensions (taken from the first buffered image) define the image size used in close()
+       @return GADGET_OK when the image was buffered, the result of putq() for images passed straight along,
+               and GADGET_FAIL if the phase index is out of range, the solver could not be set up, or enqueueing failed
+    */
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1,
+                         GadgetContainerMessage< hoNDArray< typename ARRAY_TYPE::element_type > > *m2 )
+    {
+
+      //GADGET_DEBUG2("\nSERIES: %d, PHASE: %d", m1->getObjectPtr()->image_series_index, m1->getObjectPtr()->phase );
+
+      // Images with a series index below 9 are not treated as part of the sorted phases.
+      // Just pass those images along...
+      // NOTE(review): the original comment spoke of "series 0" while the test is "< 9";
+      // confirm the intended threshold against the series numbering used upstream.
+      //
+
+      if( m1->getObjectPtr()->image_series_index < 9 ){
+        return this->next()->putq(m1);
+      }
+
+      // There is exactly one queue per phase. Reject any phase index without a queue
+      // (this includes the case of zero configured phases) instead of indexing out of bounds.
+      //
+
+      const unsigned int phase = m1->getObjectPtr()->phase;
+
+      if( phase >= this->number_of_phases_ ){
+        GADGET_DEBUG2("Image phase index %d is out of range (configured for %d phases)\n", phase, this->number_of_phases_);
+        return GADGET_FAIL;
+      }
+
+      // At first pass allocate the image buffer array.
+      //
+
+      if( this->phase_images_.get() == 0x0 ){
+
+        this->image_dimensions_ = *m2->getObjectPtr()->get_dimensions();
+        this->phase_images_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >
+          (new ACE_Message_Queue<ACE_MT_SYNCH>[this->number_of_phases_]);
+
+        // Same value for the high and the low water mark of every queue
+        size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*this->number_of_phases_;
+
+        for( unsigned int i=0; i<this->number_of_phases_; i++ ){
+          this->phase_images_[i].high_water_mark(bsize);
+          this->phase_images_[i].low_water_mark(bsize);
+        }
+
+        // Setup the optical flow solver
+        //
+
+        if( this->setup_solver() != GADGET_OK ){
+          GADGET_DEBUG1("Failed to set up optical flow solver\n");
+          // Drop the queues again: they double as the "first pass done" marker, and
+          // keeping them would make later calls (and close()) proceed without a solver.
+          this->phase_images_.reset();
+          return GADGET_FAIL;
+        }
+      }
+
+      //
+      // Put the incoming images on the appropriate queue (based on the phase index).
+      //
+
+      if( this->phase_images_[phase].enqueue_tail(m1) < 0 ) {
+        GADGET_DEBUG1("Failed to add image to buffer\n");
+        return GADGET_FAIL;
+      }
+
+      return GADGET_OK;
+    }
+    
+    // All the work is done here in the close method
+    //
+    // For every phase queue filled by process(): the first queued image becomes the moving image,
+    // all remaining images are stacked into one (multi-frame) fixed image. The moving image is
+    // registered to every frame of the fixed image and deformed accordingly. The first header is
+    // passed downstream with its original image, every other header with the matching deformed image.
+    // The displacement fields of all phases are collected and handed to write_displacement_field().
+    // If no image was ever buffered (phase_images_ not allocated) nothing but Gadget::close() is done.
+    //
+    // Every path, including the failure paths, returns the result of Gadget::close(flags);
+    // failures are only reported through the debug log.
+    //
+    virtual int close(unsigned long flags)
+    {
+      if( this->phase_images_.get() ){
+
+        GADGET_DEBUG1("RegistrationScatteringGadget::close (performing registration and scattering images)\n");
+
+        // Make sure we have the same number of images on all phase queues
+        // (It doesn't really matter, but if not the case something probably went wrong upstream)
+        //
+
+        unsigned int num_images = this->phase_images_[0].message_count();
+
+        GADGET_DEBUG2("Number of images for phase 0: %d", num_images );
+
+        for( unsigned int phase = 0; phase< this->number_of_phases_; phase++ ){
+
+          unsigned int num_images_phase = this->phase_images_[phase].message_count();
+          GADGET_DEBUG2("Number of images for phase %d: %d", phase, num_images_phase );
+
+          if( num_images != num_images_phase ){
+            GADGET_DEBUG1("Failed to set up registration, a different number of images received for each phase\n");
+            return Gadget::close(flags);
+          }
+        }
+
+        if( num_images == 0 ){
+          GADGET_DEBUG1("No images to register\n");
+          return Gadget::close(flags);
+        }
+
+        // These are the dimensions of the vector field written out
+        // - just a plain 'write_nd_array' below for now...
+        //
+
+        std::vector<size_t> reg_dims = this->image_dimensions_; // x,y
+        reg_dims.push_back(num_images-1); // this many registrations
+        reg_dims.push_back(2); // 2d flow vectors
+        reg_dims.push_back(this->number_of_phases_);
+        ARRAY_TYPE reg_field(&reg_dims);
+        unsigned int num_reg_elements_phase = reg_dims[0]*reg_dims[1]*reg_dims[2]*reg_dims[3];
+
+        for( unsigned int phase=0; phase < this->number_of_phases_; phase++ ){
+
+          // The fixed image holds 'num_images-1' frames, each of the size of the first two image dimensions
+          unsigned int num_image_elements = this->image_dimensions_[0]*image_dimensions_[1];
+          std::vector<size_t> fixed_dims = this->image_dimensions_;
+          fixed_dims.push_back(num_images-1);
+
+          // Headers of this phase in queue order; headers[0] belongs to the moving image
+          std::vector< GadgetContainerMessage<ISMRMRD::ImageHeader>*> headers;
+
+          ARRAY_TYPE fixed_image(&fixed_dims);
+          ARRAY_TYPE moving_image;
+
+          for( unsigned int image=0; image<num_images; image++ ){
+
+            ACE_Message_Block *mbq;
+
+            if( this->phase_images_[phase].dequeue_head(mbq) < 0 ) {
+              GADGET_DEBUG1("Image header dequeue failed\n");
+              return Gadget::close(flags);
+            }
+
+            GadgetContainerMessage<ISMRMRD::ImageHeader> *m1 = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+
+            if( m1 == 0x0 ) {
+              GADGET_DEBUG1("Unexpected image type on queue\n");
+              return Gadget::close(flags);
+            }
+
+            GadgetContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> > *m2 =
+              AsContainerMessage< hoNDArray<typename ARRAY_TYPE::element_type> >(m1->cont());
+
+            if( m2 == 0x0 ) {
+              GADGET_DEBUG1("Unexpected continuation on queue\n");
+              m1->release();
+              return Gadget::close(flags);
+            }
+
+            if( image == 0 ){
+
+              // Setup the moving image.
+              // If ARRAY_TYPE is an cuNDArray the following assignment uploads the array to the device,
+              // for an 'hoNDArray' it merely copies the array.
+              // The header keeps its continuation; it is passed on unchanged below.
+              //
+
+              moving_image = *m2->getObjectPtr();
+              headers.push_back(m1);
+            }
+            else{
+
+              // Assign this image as the 'image-1'th frame in the fixed image
+              // (tmp_fixed is constructed on top of the memory of that frame)
+              //
+
+              ARRAY_TYPE tmp_fixed(&image_dimensions_, fixed_image.get_data_ptr()+(image-1)*num_image_elements);
+              tmp_fixed = *m2->getObjectPtr(); // Copy as for the moving image
+              headers.push_back(m1);
+
+              // The continuation will be a new array (set after registration).
+              // This branch is only reached when more than one image was received,
+              // i.e. when the registration below is actually performed.
+              //
+
+              m1->cont(0x0);
+              m2->release();
+            }
+          }
+
+          // NOTE(review): with a single image per phase nothing below runs, so headers[0]
+          // is neither passed downstream nor released. Confirm whether that case can occur.
+          //
+
+          if( num_images > 1 ){
+
+            // Perform registration for the current phase
+            //
+
+            boost::shared_ptr<ARRAY_TYPE> deformations;
+            {
+              GadgetronTimer timer("Running registration");
+              deformations = this->of_solver_->solve( &fixed_image, &moving_image );
+            }
+
+            // Copy displacement field to vector field array
+            //
+
+            {
+              std::vector<size_t> phase_reg_dims = reg_dims; phase_reg_dims.pop_back();
+              ARRAY_TYPE tmp_in( &phase_reg_dims, deformations->get_data_ptr() ); // the vector field has an extra dimension for CK (to be discarded)
+              ARRAY_TYPE tmp_out( &phase_reg_dims, reg_field.get_data_ptr()+phase*num_reg_elements_phase );
+              tmp_out = tmp_in;
+            }
+
+            // Deform moving images based on the registration
+            //
+
+            boost::shared_ptr<ARRAY_TYPE> deformed_moving;
+            {
+              GadgetronTimer timer("Applying deformation");
+              deformed_moving = this->of_solver_->deform( &moving_image, deformations );
+            }
+
+            /*{
+            // The debug code below only compiles for cuNDArrays.
+            // To use (temporarily) comment out
+            // list(APPEND CPU_GADGETS cpuRegistrationScatteringGadget.cpp)
+            // in the CMakeList.txt
+            //
+            char filename[256];
+            sprintf((char*)filename, "fixed_%d.real", phase);
+            write_nd_array<float>( fixed_image.to_host().get(), filename );
+            sprintf((char*)filename, "moving_%d.real", phase);
+            write_nd_array<float>( moving_image.to_host().get(), filename );
+            sprintf((char*)filename, "deformed_moving_%d.real", phase);
+            write_nd_array<float>( deformed_moving->to_host().get(), filename );
+            sprintf((char*)filename, "deformation_%d.real", phase);
+            write_nd_array<float>( deformations->to_host().get(), filename );
+            } */
+
+
+            // Pass along the deformed moving images
+            //
+
+            for( unsigned int i=0; i<headers.size(); i++ ){
+
+              if( i==0 ){
+
+                // The first header still carries its original (undeformed) image
+                GADGET_DEBUG2("Putting image %d image on queue\n", i);
+
+                if( this->next()->putq(headers[i]) < 0 ) {
+                  GADGET_DEBUG1("Failed to put registrered image on queue\n");
+                  headers[i]->release();
+                  return Gadget::close(flags);
+                }
+              }
+              else{
+
+                // View on the 'i-1'th frame of the deformed images.
+                // Declared as ARRAY_TYPE (rather than a hard coded cuNDArray<float>) to match
+                // both the template argument and the signature of set_continuation().
+                std::vector<size_t> moving_dims = *moving_image.get_dimensions();
+                ARRAY_TYPE subimage( &moving_dims, deformed_moving->get_data_ptr()+(i-1)*num_image_elements);
+
+                if( set_continuation( headers[i], &subimage ) < 0 ) {
+                  GADGET_DEBUG1("Failed to set continuation\n");
+                  headers[i]->release();
+                  return Gadget::close(flags);
+                }
+
+                GADGET_DEBUG2("Putting image %d image on queue\n", i);
+
+                if( this->next()->putq(headers[i]) < 0 ) {
+                  GADGET_DEBUG1("Failed to put registrered image on queue\n");
+                  headers[i]->release();
+                  return Gadget::close(flags);
+                }
+              }
+            }
+          }
+        }
+
+        // Write out the result after permutation to the data order
+        // - to be better suited for a subsequent reconstruction pass
+        //
+
+        std::vector<size_t> order;
+        order.push_back(0);
+        order.push_back(1);
+        order.push_back(4);
+        order.push_back(2);
+        order.push_back(3);
+
+        // Report the dimensions of the permuted field (not the permutation indices),
+        // converted to int to match the %d conversions.
+        GADGET_DEBUG2("Writing out displacement field with dimensions: %d %d %d %d %d\n",
+                      static_cast<int>(reg_dims[order[0]]), static_cast<int>(reg_dims[order[1]]),
+                      static_cast<int>(reg_dims[order[2]]), static_cast<int>(reg_dims[order[3]]),
+                      static_cast<int>(reg_dims[order[4]]));
+        write_displacement_field( permute(&reg_field, &order).get() );
+      }
+
+      return Gadget::close(flags);
+    }
+    
+    // Called once from process(), when the first image to be buffered arrives.
+    // Must return GADGET_OK on success. close() dereferences of_solver_, 
+    // so implementations are expected to have assigned it here.
+    virtual int setup_solver() = 0;
+
+    // Called from close() to attach a deformed image (continuation) to a header message (m1)
+    // before the header is passed downstream. A negative return value is treated as a failure.
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, ARRAY_TYPE *continuation ) = 0;
+
+    // Called at the end of close() with the permuted displacement field of all phases.
+    // The return value is not inspected by close().
+    virtual int write_displacement_field( ARRAY_TYPE *vec_field ) = 0;
+    
+  protected:
+    // Optical flow solver; set to 0x0 in the constructor and deleted in the destructor
+    opticalFlowSolver<ARRAY_TYPE,D> *of_solver_;
+
+    // The values below are read from the gadget parameters of (almost) the same name in process_config().
+    // They are not used further in this class.
+    typename ARRAY_TYPE::element_type alpha_;
+    typename ARRAY_TYPE::element_type beta_;
+    typename ARRAY_TYPE::element_type limit_;
+    bool output_convergence_;
+    unsigned int num_multires_levels_;      // parameter "num_multiresolution_levels"
+    unsigned int max_iterations_per_level_;
+    
+  private:
+    // One message queue per phase, allocated in process() and emptied in close()
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > phase_images_;
+    // Dimensions of the first buffered image
+    std::vector<size_t> image_dimensions_;
+    // Obtained from the PhysioInterpolationGadget in process_config()
+    unsigned short number_of_phases_;    
+  };
+}
+
+#endif //RegistrationScatteringGadget_H
diff --git a/gadgets/moco/config/CMakeLists.txt b/gadgets/moco/config/CMakeLists.txt
new file mode 100644
index 0000000..b0191d7
--- /dev/null
+++ b/gadgets/moco/config/CMakeLists.txt
@@ -0,0 +1,13 @@
+# The registration configurations are only installed when CUDA was found;
+# each enabled backend (cpu/gpu registration) then installs its own file.
+if(CUDA_FOUND)
+
+  if(CPU_REG)
+    install(FILES cpureg_cartesian_averaging.xml DESTINATION config)
+  endif()
+
+  if(GPU_REG)
+    install(FILES gpureg_cartesian_averaging.xml DESTINATION config)
+  endif()
+
+endif()
diff --git a/gadgets/moco/config/cpureg_cartesian_averaging.xml b/gadgets/moco/config/cpureg_cartesian_averaging.xml
new file mode 100644
index 0000000..08f1990
--- /dev/null
+++ b/gadgets/moco/config/cpureg_cartesian_averaging.xml
@@ -0,0 +1,130 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>CartesianToGenericGadget</name>
+    <dll>gadgetron_cartesian</dll>
+    <classname>CartesianToGenericGadget</classname>
+    <!-- Property 'matrix_size_as_a_multiple_of' is required for the gpu nfft 
+	 to enforce the matrix size to be a multiple of the gpu warp size (32) -->
+    <property><name>matrix_size_as_a_multiple_of</name><value>32</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>8</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.05</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhysioInterpolationGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PhysioInterpolationGadget</classname>
+    <property><name>mode</name><value>0</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>cpuRegistrationAveragingGadget2D</name>
+    <dll>gadgetron_moco</dll>
+    <classname>cpuRegistrationAveragingGadget2D</classname>
+    <property><name>alpha</name><value>0.05</value></property>
+    <property><name>beta</name><value>1.0</value></property>
+    <property><name>num_multiresolution_levels</name><value>3</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishFloat</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+      </gadget>    
+  -->
+  
+  <gadget>
+    <name>AutoScale</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>ImageFinishUSHORT</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>
+  
+</gadgetronStreamConfiguration>
diff --git a/gadgets/moco/config/gpureg_cartesian_averaging.xml b/gadgets/moco/config/gpureg_cartesian_averaging.xml
new file mode 100644
index 0000000..ad1c727
--- /dev/null
+++ b/gadgets/moco/config/gpureg_cartesian_averaging.xml
@@ -0,0 +1,130 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>CartesianToGenericGadget</name>
+    <dll>gadgetron_cartesian</dll>
+    <classname>CartesianToGenericGadget</classname>
+    <!-- Property 'matrix_size_as_a_multiple_of' is required for the gpu nfft 
+	 to enforce the matrix size to be a multiple of the gpu warp size (32) -->
+    <property><name>matrix_size_as_a_multiple_of</name><value>32</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>8</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.05</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhysioInterpolationGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PhysioInterpolationGadget</classname>
+    <property><name>mode</name><value>0</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>gpuRegistrationAveragingGadget2D</name>
+    <dll>gadgetron_moco</dll>
+    <classname>gpuRegistrationAveragingGadget2D</classname>
+    <property><name>alpha</name><value>0.05</value></property>
+    <property><name>beta</name><value>1.0</value></property>
+    <property><name>num_multiresolution_levels</name><value>3</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishFloat</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+      </gadget>    
+  -->
+  
+  <gadget>
+    <name>AutoScale</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>ImageFinishUSHORT</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>
+  
+</gadgetronStreamConfiguration>
diff --git a/gadgets/moco/cpuRegistrationAveragingGadget.cpp b/gadgets/moco/cpuRegistrationAveragingGadget.cpp
new file mode 100644
index 0000000..e3eed47
--- /dev/null
+++ b/gadgets/moco/cpuRegistrationAveragingGadget.cpp
@@ -0,0 +1,44 @@
+#include "cpuRegistrationAveragingGadget.h"
+#include "hoLinearResampleOperator.h"
+#include "hoCKOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  // Creates the hoCKOpticalFlowSolver used by this gadget, stores it in
+  // of_solver_ and configures it from the gadget's member settings.
+  // Always returns GADGET_OK.
+  int cpuRegistrationAveragingGadget2D::setup_solver()
+  {
+    typedef hoCKOpticalFlowSolver<float,2> solver_type;
+    typedef hoLinearResampleOperator<float,2> resampler_type;
+
+    // The gadget keeps the raw solver pointer
+    solver_type *ck_solver = new solver_type();
+    this->of_solver_ = ck_solver;
+
+    // Interpolation is done by linear resampling
+    boost::shared_ptr<resampler_type> resampler(new resampler_type());
+    ck_solver->set_interpolator( resampler );
+
+    // Configurable settings from the xml properties
+    //
+
+    ck_solver->set_output_mode( this->output_convergence_
+                                ? solver_type::OUTPUT_VERBOSE
+                                : solver_type::OUTPUT_SILENT );
+
+    ck_solver->set_num_multires_levels(this->num_multires_levels_);
+    ck_solver->set_max_num_iterations_per_level(this->max_iterations_per_level_);
+    ck_solver->set_alpha(this->alpha_);
+    ck_solver->set_beta(this->beta_);
+    ck_solver->set_limit(this->limit_);
+
+    return GADGET_OK;
+  }
+
+  // Copies *continuation into a newly allocated array message and chains that
+  // message onto m1. Always returns GADGET_OK.
+  int cpuRegistrationAveragingGadget2D::set_continuation
+  ( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, hoNDArray<float> *continuation )
+  {
+    typedef GadgetContainerMessage< hoNDArray<float> > array_message;
+
+    array_message *payload = new array_message();
+    hoNDArray<float> &destination = *payload->getObjectPtr();
+    destination = *continuation;
+
+    m1->cont(payload);
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(cpuRegistrationAveragingGadget2D)
+}
diff --git a/gadgets/moco/cpuRegistrationAveragingGadget.h b/gadgets/moco/cpuRegistrationAveragingGadget.h
new file mode 100644
index 0000000..c01d40b
--- /dev/null
+++ b/gadgets/moco/cpuRegistrationAveragingGadget.h
@@ -0,0 +1,28 @@
+#ifndef cpuRegistrationAveragingGadget_H
+#define cpuRegistrationAveragingGadget_H
+
+#include "hoNDArray_operators.h"
+#include "hoNDArray_utils.h"
+#include "hoRegistration_utils.h"
+#include "hoCKOpticalFlowSolver.h"
+#include "RegistrationAveragingGadget.h"
+
+namespace Gadgetron{  
+
+  // CPU specialization of RegistrationAveragingGadget for 2D float images
+  // stored in hoNDArray<float>. It supplies the two hooks declared below;
+  // everything else comes from the base class template.
+  class EXPORTGADGETS_MOCO cpuRegistrationAveragingGadget2D :
+    public RegistrationAveragingGadget< hoNDArray<float>, 2 >
+  {    
+
+  public:
+    GADGET_DECLARE(cpuRegistrationAveragingGadget2D);
+    
+    cpuRegistrationAveragingGadget2D() : RegistrationAveragingGadget< hoNDArray<float>, 2 >() {}
+    virtual ~cpuRegistrationAveragingGadget2D() {}
+
+  protected:
+    // Allocates and configures the hoCKOpticalFlowSolver (see the .cpp file).
+    virtual int setup_solver();
+    // Attaches a copy of 'continuation' to the image header message m1.
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, hoNDArray<float> *continuation );
+  };
+}
+
+#endif //cpuRegistrationAveragingGadget_H
diff --git a/gadgets/moco/gadgetron_moco_export.h b/gadgets/moco/gadgetron_moco_export.h
new file mode 100644
index 0000000..95bc7c5
--- /dev/null
+++ b/gadgets/moco/gadgetron_moco_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_MOCO_EXPORT_H_
+#define GADGETRON_MOCO_EXPORT_H_
+
+// EXPORTGADGETS_MOCO marks classes exported from the moco gadget library.
+// On WIN32 it expands to dllexport when __BUILD_GADGETRON_MOCO__ is defined
+// and to dllimport otherwise; on all other platforms it expands to nothing.
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_MOCO__)
+#define EXPORTGADGETS_MOCO __declspec(dllexport)
+#else
+#define EXPORTGADGETS_MOCO __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_MOCO
+#endif
+
+#endif /* GADGETRON_MOCO_EXPORT_H_ */
diff --git a/gadgets/moco/gpuRegistrationAveragingGadget.cpp b/gadgets/moco/gpuRegistrationAveragingGadget.cpp
new file mode 100644
index 0000000..4c3c8e8
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationAveragingGadget.cpp
@@ -0,0 +1,50 @@
+#include "gpuRegistrationAveragingGadget.h"
+#include "cuLinearResampleOperator.h"
+#include "cuCKOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  // Creates the cuCKOpticalFlowSolver used by this gadget, stores it in
+  // of_solver_ and configures it from the gadget's member settings.
+  // Always returns GADGET_OK.
+  int gpuRegistrationAveragingGadget2D::setup_solver()
+  {
+    typedef cuCKOpticalFlowSolver<float,2> solver_type;
+    typedef cuLinearResampleOperator<float,2> resampler_type;
+
+    // The gadget keeps the raw solver pointer
+    solver_type *ck_solver = new solver_type();
+    this->of_solver_ = ck_solver;
+
+    // Interpolation is done by linear resampling
+    boost::shared_ptr<resampler_type> resampler(new resampler_type());
+    ck_solver->set_interpolator( resampler );
+
+    // Configurable settings from the xml properties
+    //
+
+    ck_solver->set_output_mode( this->output_convergence_
+                                ? solver_type::OUTPUT_VERBOSE
+                                : solver_type::OUTPUT_SILENT );
+
+    ck_solver->set_num_multires_levels(this->num_multires_levels_);
+    ck_solver->set_max_num_iterations_per_level(this->max_iterations_per_level_);
+    ck_solver->set_alpha(this->alpha_);
+    ck_solver->set_beta(this->beta_);
+    ck_solver->set_limit(this->limit_);
+
+    return GADGET_OK;
+  }
+
+  // Copies the device array 'continuation' into a newly allocated host array
+  // message and chains that message onto m1.
+  //
+  // Returns GADGET_OK on success. Throws cuda_error if the device-to-host copy
+  // fails; any exception raised while filling the new message is rethrown
+  // after the message has been released, so nothing is leaked.
+  int gpuRegistrationAveragingGadget2D::set_continuation
+  ( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, cuNDArray<float> *continuation )
+  {
+    GadgetContainerMessage< hoNDArray<float> > *m2 = new GadgetContainerMessage< hoNDArray<float> >();
+
+    try {
+      m2->getObjectPtr()->create(continuation->get_dimensions());
+
+      if( cudaMemcpy( m2->getObjectPtr()->get_data_ptr(), continuation->get_data_ptr(),
+                      continuation->get_number_of_elements()*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+        throw cuda_error("gpuRegistrationAveragingGadget::set_continuation(): failed to copy memory from device");
+      }
+    }
+    catch (...) {
+      // m2 has not been handed to m1 yet, so this function still owns it
+      m2->release();
+      throw;
+    }
+
+    m1->cont(m2);
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRegistrationAveragingGadget2D)
+}
diff --git a/gadgets/moco/gpuRegistrationAveragingGadget.h b/gadgets/moco/gpuRegistrationAveragingGadget.h
new file mode 100644
index 0000000..495e53e
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationAveragingGadget.h
@@ -0,0 +1,27 @@
+#ifndef gpuRegistrationAveragingGadget_H
+#define gpuRegistrationAveragingGadget_H
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuCKOpticalFlowSolver.h"
+#include "RegistrationAveragingGadget.h"
+
+namespace Gadgetron{  
+
+  // GPU specialization of RegistrationAveragingGadget for 2D float images
+  // stored in cuNDArray<float>. It supplies the two hooks declared below;
+  // everything else comes from the base class template.
+  class EXPORTGADGETS_MOCO gpuRegistrationAveragingGadget2D :
+    public RegistrationAveragingGadget< cuNDArray<float>, 2 >
+  {    
+
+  public:
+    GADGET_DECLARE(gpuRegistrationAveragingGadget2D);
+    
+    gpuRegistrationAveragingGadget2D() : RegistrationAveragingGadget< cuNDArray<float>, 2 >() {}
+    virtual ~gpuRegistrationAveragingGadget2D() {}
+
+  protected:
+    // Allocates and configures the cuCKOpticalFlowSolver (see the .cpp file).
+    virtual int setup_solver();
+    // Copies 'continuation' to a host array message and attaches it to m1.
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, cuNDArray<float> *continuation );
+  };
+}
+
+#endif //gpuRegistrationAveragingGadget_H
diff --git a/gadgets/moco/gpuRegistrationScatteringGadget.cpp b/gadgets/moco/gpuRegistrationScatteringGadget.cpp
new file mode 100644
index 0000000..eeadc0d
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationScatteringGadget.cpp
@@ -0,0 +1,57 @@
+#include "gpuRegistrationScatteringGadget.h"
+#include "cuLinearResampleOperator.h"
+#include "cuCKOpticalFlowSolver.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+  // Creates the cuCKOpticalFlowSolver used by this gadget, stores it in
+  // of_solver_ and configures it from the gadget's member settings.
+  // Always returns GADGET_OK.
+  int gpuRegistrationScatteringGadget2D::setup_solver()
+  {
+    typedef cuCKOpticalFlowSolver<float,2> solver_type;
+    typedef cuLinearResampleOperator<float,2> resampler_type;
+
+    // The gadget keeps the raw solver pointer
+    solver_type *ck_solver = new solver_type();
+    this->of_solver_ = ck_solver;
+
+    // Interpolation is done by linear resampling
+    boost::shared_ptr<resampler_type> resampler(new resampler_type());
+    ck_solver->set_interpolator( resampler );
+
+    // Configurable settings from the xml properties
+    //
+
+    ck_solver->set_output_mode( this->output_convergence_
+                                ? solver_type::OUTPUT_VERBOSE
+                                : solver_type::OUTPUT_SILENT );
+
+    ck_solver->set_num_multires_levels(this->num_multires_levels_);
+    ck_solver->set_max_num_iterations_per_level(this->max_iterations_per_level_);
+    ck_solver->set_alpha(this->alpha_);
+    ck_solver->set_beta(this->beta_);
+    ck_solver->set_limit(this->limit_);
+
+    return GADGET_OK;
+  }
+
+  // Copies the device array 'continuation' into a newly allocated host array
+  // message and chains that message onto m1.
+  //
+  // Returns GADGET_OK on success. Throws cuda_error if the device-to-host copy
+  // fails; any exception raised while filling the new message is rethrown
+  // after the message has been released, so nothing is leaked.
+  int gpuRegistrationScatteringGadget2D::set_continuation
+  ( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, cuNDArray<float> *continuation )
+  {
+    GadgetContainerMessage< hoNDArray<float> > *m2 = new GadgetContainerMessage< hoNDArray<float> >();
+
+    try {
+      m2->getObjectPtr()->create(continuation->get_dimensions());
+
+      if( cudaMemcpy( m2->getObjectPtr()->get_data_ptr(), continuation->get_data_ptr(),
+                      continuation->get_number_of_elements()*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess) {
+        throw cuda_error("gpuRegistrationScatteringGadget::set_continuation(): failed to copy memory from device");
+      }
+    }
+    catch (...) {
+      // m2 has not been handed to m1 yet, so this function still owns it
+      m2->release();
+      throw;
+    }
+
+    m1->cont(m2);
+
+    return GADGET_OK;
+  }
+  
+  // Transfers the displacement field to the host and writes it to the file
+  // "displacement_field_from_scattering_gadget.real". Always returns GADGET_OK.
+  int gpuRegistrationScatteringGadget2D::write_displacement_field( cuNDArray<float> *displacements )
+  {
+    boost::shared_ptr< hoNDArray<float> > host_field = displacements->to_host();
+    write_nd_array<float>(host_field.get(), "displacement_field_from_scattering_gadget.real");
+
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRegistrationScatteringGadget2D)
+}
diff --git a/gadgets/moco/gpuRegistrationScatteringGadget.h b/gadgets/moco/gpuRegistrationScatteringGadget.h
new file mode 100644
index 0000000..5e7ea66
--- /dev/null
+++ b/gadgets/moco/gpuRegistrationScatteringGadget.h
@@ -0,0 +1,28 @@
+#ifndef gpuRegistrationScatteringGadget_H
+#define gpuRegistrationScatteringGadget_H
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_utils.h"
+#include "cuCKOpticalFlowSolver.h"
+#include "RegistrationScatteringGadget.h"
+
+namespace Gadgetron{  
+
+  // GPU specialization of RegistrationScatteringGadget for 2D float images
+  // stored in cuNDArray<float>. It supplies the three hooks declared below;
+  // everything else comes from the base class template.
+  class EXPORTGADGETS_MOCO gpuRegistrationScatteringGadget2D :
+    public RegistrationScatteringGadget< cuNDArray<float>, 2 >
+  {    
+
+  public:
+    GADGET_DECLARE(gpuRegistrationScatteringGadget2D);
+    
+    gpuRegistrationScatteringGadget2D() : RegistrationScatteringGadget< cuNDArray<float>, 2 >() {}
+    virtual ~gpuRegistrationScatteringGadget2D() {}
+
+  protected:
+    // Allocates and configures the cuCKOpticalFlowSolver (see the .cpp file).
+    virtual int setup_solver();
+    // Copies 'continuation' to a host array message and attaches it to m1.
+    virtual int set_continuation( GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, cuNDArray<float> *continuation );
+    // Writes the displacement field to a file on the host.
+    virtual int write_displacement_field( cuNDArray<float> *displacements );
+  };
+}
+
+#endif //gpuRegistrationScatteringGadget_H
diff --git a/gadgets/mri_core/AccumulatorGadget.cpp b/gadgets/mri_core/AccumulatorGadget.cpp
new file mode 100644
index 0000000..ebd9166
--- /dev/null
+++ b/gadgets/mri_core/AccumulatorGadget.cpp
@@ -0,0 +1,184 @@
+#include "AccumulatorGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron{
+// Starts with no buffer and zeroed counters. slices_ is given the same
+// single-slice default that process_config() falls back to, so it never
+// holds an indeterminate value.
+AccumulatorGadget::AccumulatorGadget()
+  :buffer_(0)
+  , slices_(1)
+  , image_counter_(0)
+  , image_series_(0)
+{
+
+}
+ 
+// Frees the accumulation buffer (deleting a null pointer is a no-op).
+AccumulatorGadget::~AccumulatorGadget()
+{
+  delete buffer_;
+}
+
+/**
+ *   Expects ISMRMRD XML configuration
+ *
+ *   Reads the encoded matrix size and field of view of the single supported
+ *   encoding space and appends them to dimensions_ / field_of_view_. The
+ *   number of slices is taken from the encoding limits, defaulting to 1 when
+ *   no slice limit is present.
+ *
+ *   Returns GADGET_OK, or GADGET_FAIL when the header does not contain
+ *   exactly one encoding space.
+ */
+int AccumulatorGadget::process_config(ACE_Message_Block* mb)
+{
+  boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+  ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+  if (e_seq.size() != 1) {
+    // size() is a size_t; convert explicitly to match the %d conversion
+    GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size()));
+    GADGET_DEBUG1("This simple AccumulatorGadget only supports one encoding space\n");
+    return GADGET_FAIL;
+  }
+
+  ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+  ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+  GADGET_DEBUG2("Matrix size: %d, %d, %d\n", e_space.matrixSize().x(), e_space.matrixSize().y(), e_space.matrixSize().z());
+  dimensions_.push_back(e_space.matrixSize().x());
+  dimensions_.push_back(e_space.matrixSize().y());
+  dimensions_.push_back(e_space.matrixSize().z());
+
+  field_of_view_.push_back(e_space.fieldOfView_mm().x());
+  field_of_view_.push_back(e_space.fieldOfView_mm().y());
+  field_of_view_.push_back(e_space.fieldOfView_mm().z());
+  GADGET_DEBUG2("FOV: %f, %f, %f\n", e_space.fieldOfView_mm().x(), e_space.fieldOfView_mm().y(), e_space.fieldOfView_mm().z());
+
+  slices_ = e_limits.slice().present() ? e_limits.slice().get().maximum()+1 : 1;
+
+  return GADGET_OK;
+}
+
+/**
+ *   Copies one acquisition (all active channels) into the accumulation buffer
+ *   and, when the acquisition carries the ACQ_LAST_IN_SLICE flag, sends the
+ *   accumulated slice to the next gadget as a complex float image.
+ *
+ *   The buffer is allocated on the first call with dimensions
+ *   [x, y, z, channels, slices]; x, y, z and slices come from process_config()
+ *   and channels from the first acquisition header.
+ *
+ *   m1: acquisition header message, m2: the samples chained to it.
+ *
+ *   Returns GADGET_OK on success, in which case m1 (and its continuation) has
+ *   been released. Returns GADGET_FAIL (or -1 when the image array cannot be
+ *   allocated) on error; m1 is not released on those paths.
+ */
+int AccumulatorGadget::
+process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  if (!buffer_) {
+    // Assemble the full shape in a local copy and commit it only once the
+    // allocation has succeeded, so a failure leaves the gadget unchanged.
+    std::vector<size_t> buffer_dims(dimensions_);
+    buffer_dims.push_back(m1->getObjectPtr()->active_channels);
+    buffer_dims.push_back(slices_);
+
+    hoNDArray< std::complex<float> >* new_buffer = new hoNDArray< std::complex<float> >();
+
+    try {new_buffer->create(&buffer_dims);}
+    catch (std::runtime_error &err){
+      GADGET_DEBUG_EXCEPTION(err,"Failed allocate buffer array\n");
+      delete new_buffer;
+      return GADGET_FAIL;
+    }
+
+    buffer_ = new_buffer;
+    dimensions_.swap(buffer_dims);
+
+    image_series_ = this->get_int_value("image_series");
+  }
+
+  // Without a processed configuration the shape is incomplete
+  if (dimensions_.size() < 5) {
+    GADGET_DEBUG1("Buffer dimensions are incomplete, was the configuration processed?\n");
+    return GADGET_FAIL;
+  }
+
+  std::complex<float>* b = buffer_->get_data_ptr();
+  std::complex<float>* d = m2->getObjectPtr()->get_data_ptr();
+
+  const size_t nx = dimensions_[0];
+  const size_t ny = dimensions_[1];
+  const size_t nz = dimensions_[2];
+  const size_t nchannels = dimensions_[3];
+  const size_t nslices = dimensions_[4];
+
+  const size_t samples   = m1->getObjectPtr()->number_of_samples;
+  const size_t channels  = m1->getObjectPtr()->active_channels;
+  const size_t center    = m1->getObjectPtr()->center_sample;
+  const size_t line      = m1->getObjectPtr()->idx.kspace_encode_step_1;
+  const size_t partition = m1->getObjectPtr()->idx.kspace_encode_step_2;
+  const size_t slice     = m1->getObjectPtr()->idx.slice;
+
+  if (samples > nx) {
+    GADGET_DEBUG1("Wrong number of samples received\n");
+    return GADGET_FAIL;
+  }
+
+  // The readout is placed so that its center sample lands on nx/2; make sure
+  // the whole readout stays inside one row of the buffer.
+  if (center > (nx>>1) || ((nx>>1) - center) + samples > nx) {
+    GADGET_DEBUG1("Readout does not fit inside the buffer\n");
+    return GADGET_FAIL;
+  }
+
+  if (line >= ny || partition >= nz || slice >= nslices || channels > nchannels) {
+    GADGET_DEBUG1("Acquisition indices exceed the buffer dimensions\n");
+    return GADGET_FAIL;
+  }
+
+  if (m2->getObjectPtr()->get_number_of_elements() < channels*samples) {
+    GADGET_DEBUG1("Acquisition data is smaller than the header indicates\n");
+    return GADGET_FAIL;
+  }
+
+  const size_t channel_length = nx*ny*nz;
+  const size_t data_length = channel_length*nchannels;
+
+  // Position of this readout inside channel 0 of the addressed slice
+  const size_t readout_offset =
+    slice*data_length + partition*nx*ny + line*nx + ((nx>>1) - center);
+
+  //Copy the data for all the channels
+  for (size_t c = 0; c < channels; c++) {
+    memcpy(b + readout_offset + c*channel_length,
+           d + c*samples,
+           sizeof(std::complex<float>)*samples);
+  }
+
+  bool is_last_scan_in_slice = ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+
+  if (is_last_scan_in_slice) {
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 =
+      new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+    cm1->getObjectPtr()->flags = 0;
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >* cm2 =
+      new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
+
+    cm1->cont(cm2);
+
+    std::vector<size_t> img_dims(4);
+    img_dims[0] = nx;
+    img_dims[1] = ny;
+    img_dims[2] = nz;
+    img_dims[3] = nchannels;
+
+    try{cm2->getObjectPtr()->create(&img_dims);}
+    catch (std::runtime_error &err){
+      GADGET_DEBUG_EXCEPTION(err,"Unable to allocate new image array\n");
+      cm1->release();
+      return -1;
+    }
+
+    // Hand the accumulated slice over as the image data
+    memcpy(cm2->getObjectPtr()->get_data_ptr(), b + slice*data_length,
+           sizeof(std::complex<float>)*data_length);
+
+    cm1->getObjectPtr()->matrix_size[0]     = img_dims[0];
+    cm1->getObjectPtr()->matrix_size[1]     = img_dims[1];
+    cm1->getObjectPtr()->matrix_size[2]     = img_dims[2];
+    cm1->getObjectPtr()->field_of_view[0]   = field_of_view_[0];
+    cm1->getObjectPtr()->field_of_view[1]   = field_of_view_[1];
+    cm1->getObjectPtr()->field_of_view[2]   = field_of_view_[2];
+    cm1->getObjectPtr()->channels           = img_dims[3];
+    cm1->getObjectPtr()->slice   = m1->getObjectPtr()->idx.slice;
+
+    // Geometry is taken from the acquisition that closed the slice
+    memcpy(cm1->getObjectPtr()->position,
+           m1->getObjectPtr()->position,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->read_dir,
+           m1->getObjectPtr()->read_dir,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->phase_dir,
+           m1->getObjectPtr()->phase_dir,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->slice_dir,
+           m1->getObjectPtr()->slice_dir,
+           sizeof(float)*3);
+
+    memcpy(cm1->getObjectPtr()->patient_table_position,
+           m1->getObjectPtr()->patient_table_position, sizeof(float)*3);
+
+    cm1->getObjectPtr()->image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+    cm1->getObjectPtr()->image_index = ++image_counter_;
+    cm1->getObjectPtr()->image_series_index = image_series_;
+
+    if (this->next()->putq(cm1) < 0) {
+      // The image was not queued, so it is still ours to release
+      cm1->release();
+      return GADGET_FAIL;
+    }
+  }
+
+  m1->release();
+  return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(AccumulatorGadget)
+}
diff --git a/gadgets/mri_core/AccumulatorGadget.h b/gadgets/mri_core/AccumulatorGadget.h
new file mode 100644
index 0000000..d0fa9dc
--- /dev/null
+++ b/gadgets/mri_core/AccumulatorGadget.h
@@ -0,0 +1,36 @@
+#ifndef ACCUMULATORGADGET_H
+#define ACCUMULATORGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  // Gadget that accumulates acquisitions (header + complex samples) into a
+  // buffer and emits one complex image per completed slice.
+  class EXPORTGADGETSMRICORE AccumulatorGadget : 
+  public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+      
+    public:
+      GADGET_DECLARE(AccumulatorGadget);
+      
+      AccumulatorGadget();
+      ~AccumulatorGadget();
+      
+    protected:
+      // Parses the ISMRMRD XML header for matrix size, field of view and slice count.
+      virtual int process_config(ACE_Message_Block* mb);
+      // Stores one acquisition; passes an image downstream on the last scan in a slice.
+      virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+      
+      // Accumulation buffer, allocated on the first call to process(); owned by the gadget.
+      hoNDArray< std::complex<float> >* buffer_;
+      // Buffer shape: x, y, z from the configuration, then channels and slices.
+      std::vector<size_t> dimensions_;
+      // Field of view (x, y, z) of the encoded space, as read from fieldOfView_mm.
+      std::vector<float> field_of_view_;
+      // Number of slices (maximum slice index + 1, or 1 when no limit is given).
+      size_t slices_;
+      // Running index assigned to emitted images.
+      long long image_counter_;
+      // Value of the "image_series" property, copied into each image header.
+      long long image_series_;
+    };
+}
+#endif //ACCUMULATORGADGET_H
diff --git a/gadgets/mri_core/AcquisitionFinishGadget.cpp b/gadgets/mri_core/AcquisitionFinishGadget.cpp
new file mode 100644
index 0000000..740a625
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionFinishGadget.cpp
@@ -0,0 +1,27 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetMessageInterface.h"
+#include "AcquisitionFinishGadget.h"
+#include "GadgetStreamController.h"
+
+using namespace Gadgetron;
+
+// Prepends a GADGET_MESSAGE_ACQUISITION identifier message to m1 and hands the
+// chain to the stream controller. Returns -1 when no controller is set,
+// otherwise whatever output_ready() returns.
+int AcquisitionFinishGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+				 GadgetContainerMessage< NDArray< std::complex<float> > >* m2)
+{
+  if (controller_) {
+    typedef GadgetContainerMessage<GadgetMessageIdentifier> id_message;
+
+    id_message* envelope = new id_message();
+    envelope->getObjectPtr()->id = GADGET_MESSAGE_ACQUISITION;
+    envelope->cont(m1);
+
+    return controller_->output_ready(envelope);
+  }
+
+  ACE_DEBUG( (LM_DEBUG, ACE_TEXT("Cannot return result to controller, no controller set")) );
+  return -1;
+}
+
+GADGET_FACTORY_DECLARE(AcquisitionFinishGadget)
diff --git a/gadgets/mri_core/AcquisitionFinishGadget.h b/gadgets/mri_core/AcquisitionFinishGadget.h
new file mode 100644
index 0000000..99fccba
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionFinishGadget.h
@@ -0,0 +1,26 @@
+#ifndef ACQUISITIONFINISHGADGET_H
+#define ACQUISITIONFINISHGADGET_H
+
+#include "Gadget.h"
+#include "NDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  // Gadget that returns acquisitions (header + complex samples) to the
+  // stream controller.
+  class EXPORTGADGETSMRICORE AcquisitionFinishGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader, NDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(AcquisitionFinishGadget);
+      
+    protected:
+      // Wraps m1 in a message identifier and passes it to the controller.
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< NDArray< std::complex<float> > >* m2);
+    };
+}
+
+#endif //ACQUISITIONFINISHGADGET_H
diff --git a/gadgets/mri_core/AcquisitionPassthroughGadget.cpp b/gadgets/mri_core/AcquisitionPassthroughGadget.cpp
new file mode 100644
index 0000000..6c91588
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionPassthroughGadget.cpp
@@ -0,0 +1,23 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "AcquisitionPassthroughGadget.h"
+#include "Gadgetron.h"
+namespace Gadgetron{
+// Forwards the acquisition unchanged to the next gadget. Returns 0 on
+// success; if queueing fails, m1 is released, an error is logged and -1 is
+// returned.
+int AcquisitionPassthroughGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  // m2 is chained to m1, so queueing m1 passes both along
+  const bool queued = (this->next()->putq(m1) != -1);
+
+  if (!queued) {
+    m1->release();
+    ACE_ERROR_RETURN( (LM_ERROR,
+		       ACE_TEXT("%p\n"),
+		       ACE_TEXT("AcquisitionPassthroughGadget::process, passing data on to next gadget")),
+		      -1);
+  }
+
+  return 0;
+}
+GADGET_FACTORY_DECLARE(AcquisitionPassthroughGadget)
+}
+
+
diff --git a/gadgets/mri_core/AcquisitionPassthroughGadget.h b/gadgets/mri_core/AcquisitionPassthroughGadget.h
new file mode 100644
index 0000000..fe56836
--- /dev/null
+++ b/gadgets/mri_core/AcquisitionPassthroughGadget.h
@@ -0,0 +1,24 @@
+#ifndef ACQUISITIONPASSTHROUGHGADGET_H
+#define ACQUISITIONPASSTHROUGHGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  // Gadget that passes acquisitions (header + complex samples) on to the
+  // next gadget without modifying them.
+  class EXPORTGADGETSMRICORE AcquisitionPassthroughGadget : 
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(AcquisitionPassthroughGadget);
+      
+    protected:
+      // Queues m1 (with m2 chained to it) on the next gadget.
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+    };
+}
+#endif //ACQUISITIONPASSTHROUGHGADGET_H
diff --git a/gadgets/mri_core/AutoScaleGadget.cpp b/gadgets/mri_core/AutoScaleGadget.cpp
new file mode 100644
index 0000000..d77d82a
--- /dev/null
+++ b/gadgets/mri_core/AutoScaleGadget.cpp
@@ -0,0 +1,74 @@
+/*
+ * AutoScaleGadget.cpp
+ *
+ *  Created on: Dec 19, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "AutoScaleGadget.h"
+
+namespace Gadgetron{
+
+// Defaults: 100 histogram bins, a scale of 1 and a target maximum of 2048.
+AutoScaleGadget::AutoScaleGadget()
+	: histogram_bins_(100), current_scale_(1.0), max_value_(2048)
+{}
+
+// The gadget owns no raw resources, so there is nothing to clean up.
+AutoScaleGadget::~AutoScaleGadget()
+{
+}
+
+/**
+ * Rescales magnitude images in place so that an estimate of their 99th
+ * percentile maps to max_value_, then passes the image to the next gadget.
+ *
+ * The percentile is estimated from a histogram with histogram_bins_ bins
+ * spanning [0, max pixel value]. Images of any other type, and magnitude
+ * images that are empty or have no positive pixel, are passed on unchanged
+ * (scaling the latter would divide by zero and produce NaN/Inf pixels).
+ *
+ * m1: image header message, m2: the float pixel data chained to it.
+ *
+ * Returns GADGET_OK, or GADGET_FAIL if the image cannot be queued on the
+ * next gadget.
+ */
+int AutoScaleGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<hoNDArray<float> > *m2)
+{
+	if (m1->getObjectPtr()->image_type == ISMRMRD::TYPE_MAGNITUDE) { //Only scale magnitude images for now
+		float* d = m2->getObjectPtr()->get_data_ptr();
+		const size_t num_elements = m2->getObjectPtr()->get_number_of_elements();
+
+		float max = 0.0f;
+		for (size_t i = 0; i < num_elements; i++) {
+			if (d[i] > max) max = d[i];
+		}
+
+		if (num_elements > 0 && histogram_bins_ > 0 && max > 0.0f) {
+			// Resize if needed and reset all bins to zero
+			histogram_.assign(histogram_bins_, size_t(0));
+
+			for (size_t i = 0; i < num_elements; i++) {
+				const float position = (d[i]/max)*histogram_bins_;
+
+				// Non-positive (and NaN) values go to the lowest bin, values at or
+				// above the maximum to the highest; converting an out-of-range
+				// float to size_t would be undefined.
+				size_t bin = 0;
+				if (position > 0.0f) {
+					bin = (position >= histogram_bins_) ? histogram_bins_-1
+					                                    : static_cast<size_t>(position);
+				}
+				histogram_[bin]++;
+			}
+
+			//Find 99th percentile
+			size_t cumsum = 0;
+			size_t counter = 0;
+			while (counter < histogram_bins_ && cumsum < (0.99*num_elements)) {
+				cumsum += histogram_[counter++];
+			}
+			max = (counter+1)*(max/histogram_bins_);
+
+			current_scale_ = max_value_/max;
+
+			for (size_t i = 0; i < num_elements; i++) {
+				d[i] *= current_scale_;
+			}
+		}
+	}
+
+	if (this->next()->putq(m1) < 0) {
+		GADGET_DEBUG1("Failed to pass on data to next Gadget\n");
+		return GADGET_FAIL;
+	}
+
+	return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(AutoScaleGadget)
+
+}
diff --git a/gadgets/mri_core/AutoScaleGadget.h b/gadgets/mri_core/AutoScaleGadget.h
new file mode 100644
index 0000000..2bbda2a
--- /dev/null
+++ b/gadgets/mri_core/AutoScaleGadget.h
@@ -0,0 +1,32 @@
+#ifndef AUTOSCALEGADGET_H_
+#define AUTOSCALEGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+
+namespace Gadgetron{
+
+  // Gadget that rescales float magnitude images based on a histogram
+  // estimate of their 99th percentile.
+  class EXPORTGADGETSMRICORE AutoScaleGadget:
+    public Gadget2<ISMRMRD::ImageHeader,hoNDArray< float > >
+  {
+  public:
+    GADGET_DECLARE(AutoScaleGadget);
+
+    AutoScaleGadget();
+    virtual ~AutoScaleGadget();
+
+  protected:
+    // Scales the pixel data of magnitude images in place and forwards m1.
+    virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			GadgetContainerMessage< hoNDArray< float > >* m2);
+
+    // Number of histogram bins (set to 100 by the constructor).
+    unsigned int histogram_bins_;
+    // Per-bin pixel counts, rebuilt for every magnitude image.
+    std::vector<size_t> histogram_;
+    // Scale factor applied to the most recently scaled image.
+    float current_scale_;
+    // Value the estimated percentile is mapped to (set to 2048 by the constructor).
+    float max_value_;
+  };
+}
+
+#endif /* AUTOSCALEGADGET_H_ */
diff --git a/gadgets/mri_core/CMakeLists.txt b/gadgets/mri_core/CMakeLists.txt
new file mode 100644
index 0000000..09336be
--- /dev/null
+++ b/gadgets/mri_core/CMakeLists.txt
@@ -0,0 +1,115 @@
+# On Windows, define the symbol that marks this as the build of the
+# mri_core library itself.
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_MRICORE__)
+ENDIF (WIN32)
+
+# Required third-party packages
+find_package(Ismrmrd REQUIRED)
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+
+if (MKL_FOUND)
+    # This is a fix for the bug in SVD when MKL is multi-threaded
+    MESSAGE("MKL Found, enabling MKL for mri_core gadgets.")
+    add_definitions(-DHAVE_MKL)
+    # These are needed to get the linking to work properly when
+    # MKL is installed, but Armadillo is NOT using it.
+    list(APPEND EXTRA_MKL_LIBRARIES mkl_core)
+    list(APPEND EXTRA_MKL_LIBRARIES mkl_intel_thread)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+
+# Header search paths for the toolboxes and external libraries used here
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+  ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+  ${ARMADILLO_INCLUDE_DIRS}
+  ${MKL_INCLUDE_DIR}
+)
+
+# The Blas/Lapack optimized gadgets are only built when Armadillo is
+# available; otherwise a notice is printed and they are left out.
+if (ARMADILLO_FOUND)
+  list(APPEND OPTIMIZED_GADGETS NoiseAdjustGadget.cpp)
+  list(APPEND OPTIMIZED_GADGETS PCACoilGadget.cpp)
+  list(APPEND OPTIMIZED_GADGET_HEADERS NoiseAdjustGadget.h)
+  list(APPEND OPTIMIZED_GADGET_HEADERS PCACoilGadget.h)
+  list(APPEND OPTIMIZED_GADGET_LIBS cpucore_math ${ARMADILLO_LIBRARIES})
+else (ARMADILLO_FOUND)
+  MESSAGE("Armadillo not found, i.e. not compiling Blas/Lapack optimized MRI Gadgets")
+endif (ARMADILLO_FOUND)
+
+# Shared library containing the core MRI gadgets. The optimized gadget lists
+# are empty unless they were filled in by the Armadillo check.
+add_library(gadgetron_mricore SHARED 
+  GadgetMRIHeaders.h 
+  AcquisitionPassthroughGadget.h AcquisitionPassthroughGadget.cpp
+  AcquisitionFinishGadget.h AcquisitionFinishGadget.cpp 
+  AccumulatorGadget.h AccumulatorGadget.cpp
+  FFTGadget.h FFTGadget.cpp
+  ImageFinishGadget.h ImageFinishGadget.cpp
+  CropAndCombineGadget.h CropAndCombineGadget.cpp
+  ImageWriterGadget.h ImageWriterGadget.cpp
+  MRIImageWriter.h MRIImageWriter.cpp
+  NoiseAdjustGadget_unoptimized.h NoiseAdjustGadget_unoptimized.cpp
+  ExtractGadget.h ExtractGadget.cpp
+  FloatToUShortGadget.h FloatToUShortGadget.cpp
+  RemoveROOversamplingGadget.h RemoveROOversamplingGadget.cpp
+  CoilReductionGadget.h CoilReductionGadget.cpp
+  AutoScaleGadget.h AutoScaleGadget.cpp
+  FlowPhaseSubtractionGadget.h FlowPhaseSubtractionGadget.cpp
+  GadgetIsmrmrdReadWrite.h GadgetIsmrmrdReadWrite.cpp
+  PhysioInterpolationGadget.h PhysioInterpolationGadget.cpp
+  IsmrmrdDumpGadget.h IsmrmrdDumpGadget.cpp
+  PartialFourierAdjustROGadget.h PartialFourierAdjustROGadget.cpp
+  MaxwellCorrectionGadget.h MaxwellCorrectionGadget.cpp
+  CplxDumpGadget.h CplxDumpGadget.cpp
+  ${OPTIMIZED_GADGETS}
+  ${OPTIMIZED_GADGET_HEADERS}
+  ${ISMRMRD_XSD_SOURCE}
+  )
+
+# Libraries the gadgets link against; the ACE library is chosen per
+# build configuration (optimized vs. debug).
+target_link_libraries(gadgetron_mricore 
+  cpucore
+  ${ISMRMRD_LIBRARIES} 
+  ${FFTW3_LIBRARIES} 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+  ${BOOST_LIBRARIES}
+  ${XERCESC_LIBRARIES} 
+  ${OPTIMIZED_GADGET_LIBS}
+  ${MKL_LIBRARIES}
+  ${EXTRA_MKL_LIBRARIES}
+  )
+
+# Public headers of the mri_core gadgets (each header listed once)
+install(FILES 
+  AccumulatorGadget.h
+  AcquisitionFinishGadget.h
+  AcquisitionPassthroughGadget.h
+  CropAndCombineGadget.h
+  ExtractGadget.h
+  FloatToUShortGadget.h
+  FFTGadget.h
+  GadgetMRIHeaders.h
+  ImageFinishGadget.h
+  ImageWriterGadget.h
+  MRIImageWriter.h
+  MaxwellCorrectionGadget.h
+  NoiseAdjustGadget.h
+  RemoveROOversamplingGadget.h
+  CoilReductionGadget.h
+  AutoScaleGadget.h
+  FlowPhaseSubtractionGadget.h
+  GadgetIsmrmrdReadWrite.h
+  PhysioInterpolationGadget.h
+  IsmrmrdDumpGadget.h
+  CplxDumpGadget.h
+  PartialFourierAdjustROGadget.h
+  gadgetron_mricore_export.h
+  ${OPTIMIZED_GADGET_HEADERS}
+  DESTINATION include)
+
+# The default stream configurations are only installed when Armadillo was found
+if (ARMADILLO_FOUND)
+  install(FILES default.xml default_short.xml default_optimized.xml DESTINATION config)
+endif (ARMADILLO_FOUND)
+
+install(FILES ismrmrd_dump.xml DESTINATION config)
+
+# The library itself and the ISMRMRD schema
+install(TARGETS gadgetron_mricore DESTINATION lib)
+install(FILES ${ISMRMRD_SCHEMA_DIR}/ismrmrd.xsd DESTINATION schema)
diff --git a/gadgets/mri_core/CoilReductionGadget.cpp b/gadgets/mri_core/CoilReductionGadget.cpp
new file mode 100644
index 0000000..2b84313
--- /dev/null
+++ b/gadgets/mri_core/CoilReductionGadget.cpp
@@ -0,0 +1,123 @@
+/*
+* CoilReductionGadget.cpp
+*
+*  Created on: Dec 5, 2011
+*      Author: hansenms
+*/
+
+#include "CoilReductionGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+
+namespace Gadgetron{
+
+    CoilReductionGadget::CoilReductionGadget() { // empty: coil_mask_, coils_in_ and coils_out_ are filled in by process_config()
+    }
+    // The destructor body is empty as well.
+    CoilReductionGadget::~CoilReductionGadget() {
+    }
+
+    int CoilReductionGadget::process_config(ACE_Message_Block *mb)
+    {
+        // Builds coil_mask_ (one 0/1 entry per input coil) from the ISMRMRD header in mb and the
+        // "coil_mask" / "coils_out" parameters. Returns GADGET_OK, or GADGET_FAIL for an invalid "coils_out".
+        boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+        // Input coil count: receiverChannels when the header provides it, 128 otherwise. Both optional
+        // elements are tested with present() before get() is called on them.
+        coils_in_ = 128;
+        if (cfg->acquisitionSystemInformation().present() &&
+            cfg->acquisitionSystemInformation().get().receiverChannels().present()) {
+            coils_in_ = cfg->acquisitionSystemInformation().get().receiverChannels().get();
+        }
+
+        boost::shared_ptr<std::string> coil_mask = this->get_string_value("coil_mask");
+        coil_mask_.clear(); // start from an empty mask even if configuration runs more than once
+        if (coil_mask->compare(std::string("")) == 0) {
+            // No explicit mask: keep the first "coils_out" coils.
+            int coils_out = this->get_int_value("coils_out");
+            if (coils_out <= 0) {
+                GADGET_DEBUG2("Invalid number of output coils %d\n", coils_out);
+                return GADGET_FAIL;
+            }
+            coil_mask_ = std::vector<unsigned short>(coils_out,1);
+        } else {
+            // Explicit mask: space-separated integers, one per coil position; empty tokens are skipped.
+            std::vector<std::string> chm;
+            boost::split(chm, *coil_mask, boost::is_any_of(" "));
+            for (size_t i = 0; i < chm.size(); i++) {
+                std::string ch = boost::algorithm::trim_copy(chm[i]);
+                if (ch.size() > 0) {
+                    coil_mask_.push_back(ACE_OS::atoi(ch.c_str()) != 0 ? 1 : 0); // any non-zero value keeps the coil
+                }
+            }
+        }
+
+        // Zero-pad or truncate so that the mask has exactly coils_in_ entries.
+        coil_mask_.resize(coils_in_, 0);
+
+        // Number of coils that remain after masking.
+        coils_out_ = 0;
+        for (size_t i = 0; i < coil_mask_.size(); i++) {
+            if (coil_mask_[i]) coils_out_++;
+        }
+
+        GADGET_DEBUG2("Coil reduction from %d to %d\n", coils_in_, coils_out_);
+
+        return GADGET_OK;
+    }
+
+
+    int CoilReductionGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1, GadgetContainerMessage<hoNDArray<std::complex<float> > > *m2)
+    {
+        // Copies the channels enabled in coil_mask_ from m2 into a new (samples x coils_out_) array,
+        // puts that array behind the header m1 in place of m2 and passes m1 to the next gadget.
+        // Returns GADGET_OK on success and GADGET_FAIL on allocation, mask or queue errors.
+        std::vector<size_t> dims_out(2);
+        dims_out[0] = m1->getObjectPtr()->number_of_samples;
+        dims_out[1] = coils_out_;
+
+        GadgetContainerMessage< hoNDArray<std::complex<float> > >* m3 =
+            new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+        try{ m3->getObjectPtr()->create(&dims_out);}
+        catch (std::runtime_error &err){
+            GADGET_DEBUG_EXCEPTION(err,"Unable to create storage for reduced dataset size\n");
+            m3->release(); // m3 is not attached to any message yet, so it must be freed here
+            return GADGET_FAIL;
+        }
+
+        std::complex<float>* s = m2->getObjectPtr()->get_data_ptr();
+        std::complex<float>* d = m3->getObjectPtr()->get_data_ptr();
+        size_t samples =  m1->getObjectPtr()->number_of_samples;
+        size_t coils_copied = 0;
+        for (size_t c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+            if (c >= coil_mask_.size()) { // '>=': index coil_mask_.size() is already out of range
+                GADGET_DEBUG1("Fatal error, too many coils for coil mask\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+            if (coil_mask_[c]) {
+                memcpy(d+coils_copied*samples,s+c*samples,sizeof(std::complex<float>)*samples);
+                coils_copied++;
+            }
+        }
+
+        m1->cont(m3);
+        //In case trajectories are attached
+        m3->cont(m2->cont());
+        m2->cont(0); // unlink first so that releasing m2 leaves the forwarded continuation alone
+        m2->release();
+
+        m1->getObjectPtr()->active_channels = coils_out_;
+        if( this->next()->putq(m1) < 0 ){
+            GADGET_DEBUG1("Failed to put message on queue\n");
+            return GADGET_FAIL;
+        }
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(CoilReductionGadget)
+}
diff --git a/gadgets/mri_core/CoilReductionGadget.h b/gadgets/mri_core/CoilReductionGadget.h
new file mode 100644
index 0000000..3654ab8
--- /dev/null
+++ b/gadgets/mri_core/CoilReductionGadget.h
@@ -0,0 +1,32 @@
+#ifndef COILREDUCTIONGADGET_H_
+#define COILREDUCTIONGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+class EXPORTGADGETSMRICORE CoilReductionGadget : // Gadget2 stage for (AcquisitionHeader, complex<float> array) messages
+  public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(CoilReductionGadget);
+      // Construction and destruction.
+      CoilReductionGadget();
+      virtual ~CoilReductionGadget();
+      // process_config() receives the configuration message block, process() one header/data pair.
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+      // State used for selecting coils.
+    protected:
+      std::vector<unsigned short> coil_mask_; // per-coil flags; process() copies a coil when its entry is non-zero
+      unsigned int coils_in_;  // number of entries coil_mask_ is sized to in process_config()
+      unsigned int coils_out_; // number of non-zero entries in coil_mask_
+    };
+}
+#endif /* COILREDUCTIONGADGET_H_ */
diff --git a/gadgets/mri_core/CplxDumpGadget.cpp b/gadgets/mri_core/CplxDumpGadget.cpp
new file mode 100644
index 0000000..0ddcd7e
--- /dev/null
+++ b/gadgets/mri_core/CplxDumpGadget.cpp
@@ -0,0 +1,139 @@
+#include "CplxDumpGadget.h"
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_utils.h"
+#include "Gadgetron.h"
+
+namespace Gadgetron{
+
+  CplxDumpGadget::CplxDumpGadget() // configures the internal buffer queue and a default "filename" parameter
+    : Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >()
+    , buffer_(ACE_Message_Queue_Base::DEFAULT_HWM * 10000, ACE_Message_Queue_Base::DEFAULT_LWM * 10000) // 10000x the ACE default marks
+  {
+    set_parameter(std::string("filename").c_str(), "profiles.cplx");
+  }
+  // The destructor body is empty.
+  CplxDumpGadget::~CplxDumpGadget() {}
+
+  int CplxDumpGadget::process_config(ACE_Message_Block* mb) // mb is not read here
+  {
+    filename_ = *get_string_value("filename"); // remember the output file name; close() writes to it
+    return GADGET_OK;
+  }
+
+  int CplxDumpGadget::close(unsigned long flags) {
+    // Collects every buffered profile into one array (profile dimensions plus a trailing readout
+    // dimension), permutes it with order (0,2,1) and writes it to filename_.
+    // Returns GADGET_OK (also when nothing was buffered) or GADGET_FAIL on queue errors.
+    GADGET_DEBUG1("CplxDumpGadget::close...\n");
+    GADGET_DEBUG2("Number of items on Q: %d\n", buffer_.message_count());
+
+    Gadget::close(flags); // base-class close; its result is not propagated
+    unsigned int readouts_buffered = buffer_.message_count();
+
+    if( readouts_buffered == 0 )
+      return GADGET_OK;
+
+    // Get the array size from the dimensions of the first buffer entry
+    //
+    ACE_Message_Block* mbq;
+    if (buffer_.dequeue_head(mbq) < 0) {
+      GADGET_DEBUG1("Message dequeue failed\n");
+      return GADGET_FAIL;
+    }
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+    if (!daq) {
+      GADGET_DEBUG1("Unable to interpret data on message queue\n");
+      mbq->release(); // the block has already left the queue, so it has to be freed here
+      return GADGET_FAIL;
+    }
+
+    hoNDArray< std::complex<float> > *entry = daq->getObjectPtr();
+    std::vector<size_t> dims_profile = *entry->get_dimensions();
+    std::vector<size_t> dims = dims_profile;
+    dims.push_back(readouts_buffered);
+
+    // Allocate array for result
+    //
+    hoNDArray< std::complex<float> > result( &dims );
+
+    // And copy over the first profile
+    //
+
+    {
+      hoNDArray< std::complex<float> > tmp( &dims_profile, result.get_data_ptr() );
+      tmp = *entry;
+    }
+
+    mbq->release();
+
+    // Copy the remaining profiles to the array
+    //
+
+    for (unsigned int i = 1; i < readouts_buffered; i++) {
+
+      if (buffer_.dequeue_head(mbq) < 0) {
+        GADGET_DEBUG1("Message dequeue failed\n");
+        return GADGET_FAIL;
+      }
+
+      daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+      if (!daq) {
+        GADGET_DEBUG1("Unable to interpret data on message queue\n");
+        mbq->release(); // same ownership reasoning as for the first entry
+        return GADGET_FAIL;
+      }
+
+      entry = daq->getObjectPtr();
+      hoNDArray< std::complex<float> > tmp( &dims_profile, result.get_data_ptr()+i*entry->get_number_of_elements() );
+      tmp = *entry;
+      mbq->release();
+    }
+
+    // Reshape to get the coil dimension as the last
+    // (the three-entry order implies two-dimensional profiles -- NOTE(review): confirm)
+
+    std::vector<size_t> order; order.push_back(0); order.push_back(2); order.push_back(1);
+    result = *permute( &result, &order);
+
+    // Write out the result
+    //
+
+    write_nd_array< std::complex<float> >( &result, filename_.c_str() );
+
+    return GADGET_OK;
+  }
+  
+  int CplxDumpGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    // Keeps a private copy of every non-noise profile in buffer_ and passes the incoming
+    // message on to the next gadget. Noise measurements are released here and go no further.
+    typedef GadgetContainerMessage< hoNDArray< std::complex<float> > > DataMessage;
+
+    if (ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags)) {
+      // Noise should have been consumed by the noise adjust, but just in case...
+      m1->release();
+      return GADGET_OK;
+    }
+
+    DataMessage* snapshot = new DataMessage;
+    *snapshot->getObjectPtr() = *m2->getObjectPtr();
+
+    const bool buffered = !(buffer_.enqueue_tail(snapshot) < 0);
+    if (!buffered) {
+      GADGET_DEBUG1("Failed to add profile to buffer\n");
+      snapshot->release();
+      return GADGET_FAIL;
+    }
+
+    if (this->next()->putq(m1) < 0) {
+      GADGET_DEBUG1("Unable to put data on queue\n");
+      return GADGET_FAIL;
+    }
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(CplxDumpGadget)
+}
diff --git a/gadgets/mri_core/CplxDumpGadget.h b/gadgets/mri_core/CplxDumpGadget.h
new file mode 100644
index 0000000..1b37b46
--- /dev/null
+++ b/gadgets/mri_core/CplxDumpGadget.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <ismrmrd_hdf5.h>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE CplxDumpGadget : // buffers complex profiles and writes them to one file when closed
+  public Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(CplxDumpGadget);
+      // The constructor sets the default "filename" parameter ("profiles.cplx").
+      CplxDumpGadget();
+      ~CplxDumpGadget();
+
+    protected:
+      virtual int process_config(ACE_Message_Block* mb); // reads the "filename" parameter
+      // Stores a copy of each non-noise profile and forwards the message.
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+      // Assembles the buffered profiles and writes them to filename_.
+      virtual int close(unsigned long flags); //All the work is done here in this Gadget
+
+    private:
+      std::string filename_;                   // output path, taken from the "filename" parameter
+      ACE_Message_Queue<ACE_MT_SYNCH> buffer_; // copies of the profiles seen by process()
+    };
+}
diff --git a/gadgets/mri_core/CropAndCombineGadget.cpp b/gadgets/mri_core/CropAndCombineGadget.cpp
new file mode 100644
index 0000000..0b446fd
--- /dev/null
+++ b/gadgets/mri_core/CropAndCombineGadget.cpp
@@ -0,0 +1,70 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "CropAndCombineGadget.h"
+
+namespace Gadgetron{
+int CropAndCombineGadget::
+process( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	 GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  // Crops the first dimension to its central half and combines all channels into a single
+  // complex image. Returns the result of putq() on the next gadget, or -1 if allocation fails.
+  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 = 
+    new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+  std::vector<size_t> new_dimensions(3);
+  new_dimensions[0] = m2->getObjectPtr()->get_size(0)>>1;
+  new_dimensions[1] = m2->getObjectPtr()->get_size(1);
+  new_dimensions[2] = m2->getObjectPtr()->get_size(2);
+
+  try{m3->getObjectPtr()->create(&new_dimensions);}
+  catch (std::runtime_error &err){
+  	GADGET_DEBUG_EXCEPTION(err,"CropAndCombineGadget, failed to allocate new array\n");
+    m3->release(); // not attached to m1 yet, so free it before bailing out
+    return -1;
+  }
+
+  int dimx     = m3->getObjectPtr()->get_size(0);
+  int dimx_old = m2->getObjectPtr()->get_size(0);
+  int dimy = m3->getObjectPtr()->get_size(1);
+  int dimz = m3->getObjectPtr()->get_size(2);
+
+  int channels = m2->getObjectPtr()->get_size(3);
+
+  std::complex<float>* d1 = m2->getObjectPtr()->get_data_ptr();
+  std::complex<float>* d2 = m3->getObjectPtr()->get_data_ptr();
+
+  size_t img_block_old = dimx_old*dimy*dimz; // elements per channel in the uncropped input
+
+  for (int z = 0; z < dimz; z++) {
+    for (int y = 0; y < dimy; y++) {
+      for (int x = 0; x < dimx; x++) {
+	float mag = 0;
+	float phase = 0;
+	size_t offset_1 = z*dimy*dimx_old+y*dimx_old+x+((dimx_old-dimx)>>1); // input index, shifted into the central window
+	size_t offset_2 = z*dimy*dimx+y*dimx+x;
+	for (int c = 0; c < channels; c++) {
+	  float mag_tmp = norm(d1[offset_1 + c*img_block_old]);
+	  phase += mag_tmp*arg(d1[offset_1 + c*img_block_old]); // NOTE(review): weighted phase sum is not divided by the total weight -- confirm intended
+	  mag += mag_tmp;
+	}
+
+	d2[offset_2] = std::polar(std::sqrt(mag),phase);
+      }
+    }
+  }
+
+  //Now add the new array to the outgoing message
+  m1->cont(m3);
+  m2->release();
+
+  //Modify header to match
+  m1->getObjectPtr()->matrix_size[0] = m1->getObjectPtr()->matrix_size[0]>>1;
+  m1->getObjectPtr()->channels = 1;
+
+  m1->getObjectPtr()->field_of_view[0] = m1->getObjectPtr()->field_of_view[0]/2;
+
+  return this->next()->putq(m1);
+}
+
+GADGET_FACTORY_DECLARE(CropAndCombineGadget)
+}
diff --git a/gadgets/mri_core/CropAndCombineGadget.h b/gadgets/mri_core/CropAndCombineGadget.h
new file mode 100644
index 0000000..dda1ed6
--- /dev/null
+++ b/gadgets/mri_core/CropAndCombineGadget.h
@@ -0,0 +1,25 @@
+#ifndef CROPANDCOMBINEGADGET_H
+#define CROPANDCOMBINEGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  class EXPORTGADGETSMRICORE CropAndCombineGadget : // halves dimension 0 and merges the channels into one image
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(CropAndCombineGadget);
+      // process() replaces the data behind m1 with the combined image and forwards m1.
+    protected:
+      virtual int process( GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);     
+    };
+}
+
+#endif //CROPANDCOMBINEGADGET_H
diff --git a/gadgets/mri_core/ExtractGadget.cpp b/gadgets/mri_core/ExtractGadget.cpp
new file mode 100644
index 0000000..0418466
--- /dev/null
+++ b/gadgets/mri_core/ExtractGadget.cpp
@@ -0,0 +1,117 @@
+/*
+ * ExtractGadget.cpp
+ *
+ *  Created on: Nov 8, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ExtractGadget.h"
+
+
+namespace Gadgetron{
+ExtractGadget::ExtractGadget()
+: extract_mask_(GADGET_EXTRACT_MAGNITUDE) // magnitude only, until process() reads a valid "extract_mask"
+{
+
+}
+// The destructor body is empty.
+ExtractGadget::~ExtractGadget()
+{
+
+}
+
+int ExtractGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<hoNDArray<std::complex<float> > > *m2)
+{
+	// For every bit set in extract_mask_ (magnitude, real, imag, phase) a float image is derived from
+	// the complex input and sent downstream with a copy of the header; m1 itself is released at the end.
+	int em = this->get_int_value("extract_mask");
+	if (em > 0) {
+		if (em < GADGET_EXTRACT_MAX ) {
+			extract_mask_ = static_cast<unsigned short>(em);
+		}
+	}
+
+	for (size_t m = GADGET_EXTRACT_MAGNITUDE; m < GADGET_EXTRACT_MAX; m = m<<1) {
+		if (extract_mask_ & m) {
+			GadgetContainerMessage<ISMRMRD::ImageHeader>* cm1 =
+					new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+			//Copy the header
+			*cm1->getObjectPtr() = *m1->getObjectPtr();
+
+			GadgetContainerMessage<hoNDArray< float > > *cm2 =
+					new GadgetContainerMessage<hoNDArray< float > >();
+
+			boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+			try{cm2->getObjectPtr()->create(dims.get());}
+			catch (std::runtime_error &err){
+				GADGET_DEBUG_EXCEPTION(err,"Unable to create unsigned short storage in Extract Magnitude Gadget");
+				cm1->release(); cm2->release(); // not linked or queued yet: free both new messages
+				return GADGET_FAIL;
+			}
+
+			std::complex<float>* src = m2->getObjectPtr()->get_data_ptr();
+			float* dst = cm2->getObjectPtr()->get_data_ptr();
+			float pix_val;
+			for (unsigned long i = 0; i < cm2->getObjectPtr()->get_number_of_elements(); i++) {
+				switch (m) {
+				case GADGET_EXTRACT_MAGNITUDE:
+					pix_val = abs(src[i]);
+					break;
+				case GADGET_EXTRACT_REAL:
+					pix_val = real(src[i]);
+					break;
+				case GADGET_EXTRACT_IMAG:
+					pix_val = imag(src[i]);
+					break;
+				case GADGET_EXTRACT_PHASE:
+					pix_val = arg(src[i]);
+					break;
+				default:
+					GADGET_DEBUG2("Unexpected extract mask %d, bailing out\n", static_cast<int>(m));
+					cm1->release(); cm2->release(); // same cleanup as for the allocation failure
+					return GADGET_FAIL;
+				}
+				dst[i] = pix_val;
+			}
+
+			cm1->cont(cm2);
+			cm1->getObjectPtr()->image_data_type = ISMRMRD::DATA_FLOAT;//GADGET_IMAGE_REAL_FLOAT;
+
+			switch (m) {
+			case GADGET_EXTRACT_MAGNITUDE:
+				cm1->getObjectPtr()->image_type = ISMRMRD::TYPE_MAGNITUDE;//GADGET_IMAGE_MAGNITUDE;
+				break;
+			case GADGET_EXTRACT_REAL:
+				cm1->getObjectPtr()->image_type = ISMRMRD::TYPE_REAL;
+				cm1->getObjectPtr()->image_series_index += 1000; //Ensure that this will go in a different series
+				break;
+			case GADGET_EXTRACT_IMAG:
+				cm1->getObjectPtr()->image_type = ISMRMRD::TYPE_IMAG;
+				cm1->getObjectPtr()->image_series_index += 2000; //Ensure that this will go in a different series
+				break;
+			case GADGET_EXTRACT_PHASE:
+				cm1->getObjectPtr()->image_type = ISMRMRD::TYPE_PHASE;
+				cm1->getObjectPtr()->image_series_index += 3000; //Ensure that this will go in a different series
+				break;
+			default:
+				GADGET_DEBUG2("Unexpected extract mask %d, bailing out\n", static_cast<int>(m));
+				break;
+			}
+
+			if (this->next()->putq(cm1) == -1) {
+				cm1->release(); // putq failed, so cm1 (with cm2 chained behind it) still has to be freed here
+				m1->release(); // NOTE(review): confirm the caller does not release m1 again after GADGET_FAIL
+				GADGET_DEBUG1("Unable to put extracted images on next gadgets queue");
+				return GADGET_FAIL;
+			}
+		}
+	}
+
+	m1->release(); //We have copied all the data in this case
+	return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(ExtractGadget)
+}
diff --git a/gadgets/mri_core/ExtractGadget.h b/gadgets/mri_core/ExtractGadget.h
new file mode 100644
index 0000000..35e046d
--- /dev/null
+++ b/gadgets/mri_core/ExtractGadget.h
@@ -0,0 +1,62 @@
+#ifndef EXTRACTGADGET_H_
+#define EXTRACTGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+#define MAX_UNSIGNED_SHORT_IMAGE_VALUE
+
+//Extract flags
+#define GADGET_EXTRACT_NONE                   (0)      //0
+#define GADGET_EXTRACT_MAGNITUDE              (1 << 0) //1
+#define GADGET_EXTRACT_REAL                   (1 << 1) //2
+#define GADGET_EXTRACT_IMAG                   (1 << 2) //4
+#define GADGET_EXTRACT_PHASE                  (1 << 3) //8
+#define GADGET_EXTRACT_MAX                    (1 << 4) //16
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE ExtractGadget: // emits one float image per bit set in extract_mask_
+  public Gadget2<ISMRMRD::ImageHeader,hoNDArray< std::complex<float> > >
+    {
+
+    public:
+      GADGET_DECLARE(ExtractGadget);
+
+      ExtractGadget();
+      virtual ~ExtractGadget();
+      // Replaces the whole mask; the GADGET_EXTRACT_* values are single-bit flags.
+      void set_extract_mask(unsigned short mask) {
+	extract_mask_ = mask;
+      }
+      // The four queries below report whether the matching GADGET_EXTRACT_* bit is set in the mask.
+      bool extract_magnitude() {
+	return (extract_mask_ & GADGET_EXTRACT_MAGNITUDE);
+      }
+
+      bool extract_real() {
+	return (extract_mask_ & GADGET_EXTRACT_REAL);
+      }
+
+      bool extract_imag() {
+	return (extract_mask_ & GADGET_EXTRACT_IMAG);
+      }
+
+      bool extract_phase() {
+	return (extract_mask_ & GADGET_EXTRACT_PHASE);
+      }
+
+    protected:
+      virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+      // Bitwise combination of GADGET_EXTRACT_* flags selecting the images to produce.
+      unsigned short extract_mask_;
+    };
+}
+
+#endif /* EXTRACTGADGET_H_ */
diff --git a/gadgets/mri_core/FFTGadget.cpp b/gadgets/mri_core/FFTGadget.cpp
new file mode 100644
index 0000000..cc06e6b
--- /dev/null
+++ b/gadgets/mri_core/FFTGadget.cpp
@@ -0,0 +1,22 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "FFTGadget.h"
+#include "hoNDFFT.h"
+
+namespace Gadgetron{
+
+  int FFTGadget::process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    // Runs hoNDFFT's ifft on the array held by m2 along dimensions 0, 1 and 2 in turn,
+    // then hands m1 to the next gadget.
+    hoNDArray< std::complex<float> >* image = m2->getObjectPtr();
+    for (int dim = 0; dim < 3; dim++) {
+      hoNDFFT<float>::instance()->ifft(image, dim);
+    }
+    // GADGET_FAIL when the next gadget's queue rejects the message, GADGET_OK otherwise.
+    const bool queued = !(this->next()->putq(m1) < 0);
+    return queued ? GADGET_OK : GADGET_FAIL;
+  }
+  
+  GADGET_FACTORY_DECLARE(FFTGadget)
+}
diff --git a/gadgets/mri_core/FFTGadget.h b/gadgets/mri_core/FFTGadget.h
new file mode 100644
index 0000000..6031ba4
--- /dev/null
+++ b/gadgets/mri_core/FFTGadget.h
@@ -0,0 +1,24 @@
+#ifndef FFTGADGET_H
+#define FFTGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE FFTGadget : // Gadget2 stage for (ImageHeader, complex<float> array) messages
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(FFTGadget)
+	// process() is the only member function declared in this class.
+	protected:
+      virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+			   GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);      
+    };
+}
+#endif //FFTGADGET_H
diff --git a/gadgets/mri_core/FloatToUShortGadget.cpp b/gadgets/mri_core/FloatToUShortGadget.cpp
new file mode 100644
index 0000000..512650f
--- /dev/null
+++ b/gadgets/mri_core/FloatToUShortGadget.cpp
@@ -0,0 +1,83 @@
+/*
+ * FloatToUShortGadget.cpp
+ *
+ *  Created on: Nov 26, 2011
+ *      Author: hansenms
+ */
+
+#include "GadgetIsmrmrdReadWrite.h"
+#include "FloatToUShortGadget.h"
+namespace Gadgetron{
+FloatToUShortGadget::FloatToUShortGadget()
+{
+}
+
+// Constructor (above) and destructor (below) both have empty bodies.
+
+FloatToUShortGadget::~FloatToUShortGadget()
+{
+}
+
+
+
+int FloatToUShortGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<hoNDArray<float> > *m2)
+{
+	// Converts the float image in m2 to ACE_UINT16 (scaling chosen by image_type, upper clamp 4095), swaps it in behind m1 and forwards m1.
+	GadgetContainerMessage<hoNDArray< ACE_UINT16 > > *cm2 =
+			new GadgetContainerMessage<hoNDArray< ACE_UINT16 > >();
+
+	boost::shared_ptr< std::vector<size_t> > dims = m2->getObjectPtr()->get_dimensions();
+	try {cm2->getObjectPtr()->create(dims);}
+	catch (std::runtime_error &err){
+		GADGET_DEBUG_EXCEPTION(err,"Unable to create unsigned short storage in FloatToUShortGadget");
+		cm2->release(); // cm2 is not attached to m1 yet, so free it here
+		return GADGET_FAIL;
+	}
+
+	float* src = m2->getObjectPtr()->get_data_ptr();
+	ACE_UINT16* dst = cm2->getObjectPtr()->get_data_ptr();
+
+	for (unsigned long i = 0; i < cm2->getObjectPtr()->get_number_of_elements(); i++) {
+		float pix_val = src[i];
+		switch (m1->getObjectPtr()->image_type) {
+		case ISMRMRD::TYPE_MAGNITUDE:
+			pix_val = std::abs(pix_val);
+			if (pix_val > 4095) pix_val = 4095;
+			break;
+		case ISMRMRD::TYPE_REAL:
+		case ISMRMRD::TYPE_IMAG:
+			pix_val = pix_val + 2048; // zero maps to 2048
+			if (pix_val < 0) pix_val = 0;
+			if (pix_val > 4095) pix_val = 4095;
+			break;
+		case ISMRMRD::TYPE_PHASE:
+			pix_val *= 2048.0/3.14159265; // scale so that pi corresponds to 2048 before the offset
+			pix_val += 2048;
+			if (pix_val < 0) pix_val = 0;
+			if (pix_val > 4095) pix_val = 4095;
+			break;
+		default:
+			GADGET_DEBUG2("Unknown image type %d, bailing out\n",m1->getObjectPtr()->image_type);
+			m1->release(); // NOTE(review): confirm the caller does not release m1 again after GADGET_FAIL
+			cm2->release();
+			return GADGET_FAIL;
+		}
+		dst[i] = static_cast<unsigned short>(pix_val);
+	}
+
+	m1->cont(cm2);
+	m2->release();
+	m1->getObjectPtr()->image_data_type = ISMRMRD::DATA_UNSIGNED_SHORT;
+
+	if (this->next()->putq(m1) == -1) {
+		m1->release(); // NOTE(review): same double-release question as in the default branch above
+		GADGET_DEBUG1("Unable to put unsigned short magnitude image on next gadgets queue");
+		return GADGET_FAIL;
+	}
+
+
+	return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(FloatToUShortGadget)
+}
diff --git a/gadgets/mri_core/FloatToUShortGadget.h b/gadgets/mri_core/FloatToUShortGadget.h
new file mode 100644
index 0000000..c1900ad
--- /dev/null
+++ b/gadgets/mri_core/FloatToUShortGadget.h
@@ -0,0 +1,35 @@
+#ifndef FLOATTOUSHORTGADGET_H_
+#define FLOATTOUSHORTGADGET_H_
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+
+namespace Gadgetron{
+  
+  /**
+   * This Gadget converts float values to unsigned short int format.
+   *
+   * How the conversion is done will depend on the image type:
+   * Magnitude images: Values above 4095 will be clamped.
+   * Real or Imag: Values below -2048 and above 2047 will be clamped. Zero will be 2048.
+   * Phase: -pi will be 0, +pi will be 4095.
+   *
+   */
+  class EXPORTGADGETSMRICORE FloatToUShortGadget: // consumes (ImageHeader, float array) messages
+  public Gadget2<ISMRMRD::ImageHeader,hoNDArray< float > >
+    {
+    public:
+      GADGET_DECLARE(FloatToUShortGadget);
+      FloatToUShortGadget();
+      virtual ~FloatToUShortGadget();
+      // process() swaps the float array behind m1 for an ACE_UINT16 array and forwards m1.
+    protected:
+      virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< float > >* m2);      
+    };
+}
+
+#endif /* FLOATTOUSHORTGADGET_H_ */
diff --git a/gadgets/mri_core/FlowPhaseSubtractionGadget.cpp b/gadgets/mri_core/FlowPhaseSubtractionGadget.cpp
new file mode 100644
index 0000000..90ab8bd
--- /dev/null
+++ b/gadgets/mri_core/FlowPhaseSubtractionGadget.cpp
@@ -0,0 +1,150 @@
+#include "FlowPhaseSubtractionGadget.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  FlowPhaseSubtractionGadget::FlowPhaseSubtractionGadget() {} // sets_ and buffer_ are assigned in process_config()
+
+  FlowPhaseSubtractionGadget::~FlowPhaseSubtractionGadget() {}
+
+  int FlowPhaseSubtractionGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Reads the number of sets from the encoding limits of the single encoding space and allocates
+    // one message queue per set. Returns GADGET_FAIL unless there is one encoding space and at most 2 sets.
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size()));
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    // A missing "set" limit means a single set.
+    sets_ = e_limits.set().present() ? e_limits.set().get().maximum() + 1 : 1;
+
+    if (sets_ > 2) {
+      GADGET_DEBUG1("Phase subtraction only implemented for two sets for now\n");
+      GADGET_DEBUG2("Number of sets detected: %d, bailing out.\n", sets_);
+      return GADGET_FAIL;
+    }
+
+    buffer_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[sets_]); 
+
+    size_t bsize = sizeof(GadgetContainerMessage< GadgetContainerMessage<ISMRMRD::ImageHeader> >)*10000; // used for both water marks below
+
+    for( size_t i=0; i<sets_; i++ ){
+      buffer_[i].high_water_mark(bsize);
+      buffer_[i].low_water_mark(bsize);
+    }
+
+    return GADGET_OK;
+  }
+
+  int FlowPhaseSubtractionGadget::
+  process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    // Buffers images per set; once both sets hold an image, the pair is merged (mean magnitude,
+    // phase of set 1 minus phase of set 0) into the set-1 image, which is forwarded as set 0.
+    // We need two sets to make a phase subtraction
+    if (sets_ < 2) {
+      if (this->next()->putq(m1) < 0) {
+	return GADGET_FAIL;
+      }
+      return GADGET_OK;
+    }
+
+    size_t set = m1->getObjectPtr()->set;
+    if (set >= sets_) { // buffer_ was allocated with sets_ queues only
+      GADGET_DEBUG2("Set index %d is out of range\n", static_cast<int>(set));
+      return GADGET_FAIL;
+    }
+    // Enqueue until we have images from both sets
+    //
+    if( buffer_[set].enqueue_tail(m1) < 0 ){
+      GADGET_DEBUG1("Message enqueue failed\n");
+      return GADGET_FAIL;
+    };
+
+    // Phase subtract 
+    //
+    while( buffer_[0].message_count()>0 && buffer_[1].message_count()>0 ) {
+      ACE_Message_Block *mbq1, *mbq2;
+      if( buffer_[0].dequeue_head(mbq1) < 0 || buffer_[1].dequeue_head(mbq2) < 0 ) {
+	GADGET_DEBUG1("Message dequeue failed\n");
+	if( buffer_[set].message_count() > 0 ) 
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	return GADGET_FAIL;
+      }
+	
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *pm1 = 
+	AsContainerMessage<ISMRMRD::ImageHeader>(mbq1);
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cpm1 = 
+	AsContainerMessage<hoNDArray< std::complex<float> > >(mbq1->cont());
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *pm2 = 
+	AsContainerMessage<ISMRMRD::ImageHeader>(mbq2);
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cpm2 = 
+	AsContainerMessage<hoNDArray< std::complex<float> > >(mbq2->cont());
+	
+      // Some validity checks
+      // NOTE(review): the four casts above are not checked for null -- confirm they cannot fail here
+
+      if( pm1->getObjectPtr()->image_index != pm2->getObjectPtr()->image_index ) {
+	GADGET_DEBUG2("Mismatch in image indices detected (%d, %d). Bailing out.\n", 
+		      pm1->getObjectPtr()->image_index, pm2->getObjectPtr()->image_index);
+	pm1->release();
+	if( buffer_[set].message_count() > 0 ){
+	  pm2->release();		
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	}
+	return GADGET_FAIL;
+      }
+      
+      if (cpm1->getObjectPtr()->get_number_of_elements() != cpm2->getObjectPtr()->get_number_of_elements()) {
+	GADGET_DEBUG1("Mismatch in number of elements detected. Bailing out.\n");
+	pm1->release();
+	if( buffer_[set].message_count() > 0 ){
+	  pm2->release();
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	}
+	return GADGET_FAIL;
+      }
+
+      std::complex<float> *p1 = cpm1->getObjectPtr()->get_data_ptr();
+      std::complex<float> *p2 = cpm2->getObjectPtr()->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long i = 0; i < (long)cpm2->getObjectPtr()->get_number_of_elements(); i++ ) { // count equals cpm1's (checked above)
+	std::complex<float> tmp = std::polar((std::abs(p1[i])+std::abs(p2[i]))/2.0f, std::arg(p2[i])-std::arg(p1[i]));
+	p2[i] = tmp;
+      }
+      
+      pm1->release();	
+      pm2->getObjectPtr()->set = 0;
+
+      if (this->next()->putq(pm2) < 0) {
+	if( buffer_[set].message_count() > 0 ) {
+	  pm2->release();
+	  buffer_[set].dequeue_tail(mbq1); // or m1 will be attempted deleted twice
+	}
+	return GADGET_FAIL;
+      }
+    }
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(FlowPhaseSubtractionGadget)
+}
diff --git a/gadgets/mri_core/FlowPhaseSubtractionGadget.h b/gadgets/mri_core/FlowPhaseSubtractionGadget.h
new file mode 100644
index 0000000..9d1c1fa
--- /dev/null
+++ b/gadgets/mri_core/FlowPhaseSubtractionGadget.h
@@ -0,0 +1,38 @@
+#ifndef FlowPhaseSubtractionGadget_H
+#define FlowPhaseSubtractionGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+  
+    class EXPORTGADGETSMRICORE FlowPhaseSubtractionGadget : // phase-subtracts image pairs coming from two sets
+        public Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+
+    public:
+        GADGET_DECLARE(FlowPhaseSubtractionGadget);
+
+        FlowPhaseSubtractionGadget();
+        virtual ~FlowPhaseSubtractionGadget();
+
+    protected:
+        virtual int process_config(ACE_Message_Block* mb); // determines sets_ and allocates buffer_
+        // Queues the image by set and emits a combined image once both sets have one.
+        virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+
+    private:
+        unsigned int sets_; // number of sets, from the header's encoding limits (1 if absent)
+	boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > buffer_; // one queue per set
+    };
+}
+
+#endif //FlowPhaseSubtractionGadget_H
diff --git a/gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp b/gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp
new file mode 100644
index 0000000..ae8d80c
--- /dev/null
+++ b/gadgets/mri_core/GadgetIsmrmrdReadWrite.cpp
@@ -0,0 +1,6 @@
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron{
+
+GADGETRON_READER_FACTORY_DECLARE(GadgetIsmrmrdAcquisitionMessageReader)
+}
diff --git a/gadgets/mri_core/GadgetIsmrmrdReadWrite.h b/gadgets/mri_core/GadgetIsmrmrdReadWrite.h
new file mode 100644
index 0000000..87eb3d6
--- /dev/null
+++ b/gadgets/mri_core/GadgetIsmrmrdReadWrite.h
@@ -0,0 +1,202 @@
+#ifndef GADGETISMRMRDREADWRITE_H
+#define GADGETISMRMRDREADWRITE_H
+
+#include "Gadgetron.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetMessageInterface.h"
+#include "hoNDArray.h"
+#include "url_encode.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#ifndef EXCLUDE_ISMRMRD_XSD
+#include <ismrmrd.hxx>
+#endif
+
+#include <ace/SOCK_Stream.h>
+#include <ace/Task.h>
+#include <complex>
+
+namespace Gadgetron{
+
+    class EXPORTGADGETSMRICORE GadgetIsmrmrdAcquisitionMessageWriter : public GadgetMessageWriter
+    {
+        // Writes an ISMRMRD::Acquisition to a socket as: message id, acquisition header, trajectory (if any), samples (if any).
+    public:
+        virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb) // returns 0 on success, -1 on a bad message or send failure
+        {
+            GadgetContainerMessage<ISMRMRD::Acquisition>* acqmb =
+                dynamic_cast< GadgetContainerMessage<ISMRMRD::Acquisition>* >(mb);
+            // A null result means mb is not a GadgetContainerMessage<ISMRMRD::Acquisition>.
+            if (!acqmb) {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), GadgetAcquisitionMessageWriter, invalid acquisition message objects")) );
+                return -1;
+            }
+
+            ssize_t send_cnt = 0;
+
+            GadgetMessageIdentifier id;
+            id.id = GADGET_MESSAGE_ISMRMRD_ACQUISITION;
+            // The identifier is sent first, followed by the fixed-size acquisition header.
+            if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+                ACE_DEBUG ((LM_ERROR,
+                    ACE_TEXT ("(%P|%t) Unable to send acquisition message identifier\n")));
+
+                return -1;
+            }
+
+            if ((send_cnt = sock->send_n (&acqmb->getObjectPtr()->getHead(), sizeof(ISMRMRD::AcquisitionHeader))) <= 0) {
+                ACE_DEBUG ((LM_ERROR,
+                    ACE_TEXT ("(%P|%t) Unable to send acquisition header\n")));
+
+                return -1;
+            }
+            // Payload sizes come from the header: trajectory floats, and samples sent as two floats each.
+            unsigned long trajectory_elements = acqmb->getObjectPtr()->getHead().trajectory_dimensions*acqmb->getObjectPtr()->getHead().number_of_samples;
+            unsigned long data_elements = acqmb->getObjectPtr()->getHead().active_channels*acqmb->getObjectPtr()->getHead().number_of_samples;
+            // Empty trajectory or sample blocks are skipped rather than sent as zero-length writes.
+            if (trajectory_elements) {
+                if ((send_cnt = sock->send_n (&acqmb->getObjectPtr()->getTraj()[0], sizeof(float)*trajectory_elements)) <= 0) {
+                    ACE_DEBUG ((LM_ERROR,
+                        ACE_TEXT ("(%P|%t) Unable to send acquisition trajectory elements\n")));
+
+                    return -1;
+                }
+            }
+
+            if (data_elements) {
+                if ((send_cnt = sock->send_n (&acqmb->getObjectPtr()->getData()[0], 2*sizeof(float)*data_elements)) <= 0) {
+                    ACE_DEBUG ((LM_ERROR,
+                        ACE_TEXT ("(%P|%t) Unable to send acquisition data elements\n")));
+
+                    return -1;
+                }
+            }
+
+            return 0;
+        }
+    };
+
+    /**
+    Default implementation of GadgetMessageReader for IsmrmrdAcquisition messages
+    */
+    class EXPORTGADGETSMRICORE GadgetIsmrmrdAcquisitionMessageReader : public GadgetMessageReader
+    {
+        // Reads one acquisition from a socket into a chain of messages: header -> samples [-> trajectory].
+    public:
+        GADGETRON_READER_DECLARE(GadgetIsmrmrdAcquisitionMessageReader);
+
+        virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) // returns the head of the chain, or 0 on any read/allocation failure
+        {
+            // m1 holds the fixed-size acquisition header, m2 the complex sample array.
+            GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1 =
+                new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+
+            GadgetContainerMessage<hoNDArray< std::complex<float> > >* m2 =
+                new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+            // m2 is chained behind m1; every error path below releases only m1.
+            m1->cont(m2);
+
+            ssize_t recv_count = 0;
+
+            if ((recv_count = stream->recv_n(m1->getObjectPtr(), sizeof(ISMRMRD::AcquisitionHeader))) <= 0) {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetIsmrmrdAcquisitionMessageReader, failed to read ISMRMRDACQ Header\n")) );
+                m1->release();
+                return 0;
+            }
+            // When the header announces trajectory dimensions, the trajectory block is read before the samples.
+            if (m1->getObjectPtr()->trajectory_dimensions) {
+                GadgetContainerMessage<hoNDArray< float > >* m3 =
+                    new GadgetContainerMessage< hoNDArray< float > >();
+                // The trajectory message is appended after the sample message.
+                m2->cont(m3);
+
+                std::vector<size_t> tdims;
+                tdims.push_back(m1->getObjectPtr()->trajectory_dimensions);
+                tdims.push_back(m1->getObjectPtr()->number_of_samples);
+
+                try { m3->getObjectPtr()->create(&tdims);}
+                catch (std::runtime_error &err){
+                    GADGET_DEBUG_EXCEPTION(err,"(%P|%t) Allocate trajectory data\n");
+                    m1->release();
+
+                    return 0;
+                }
+
+                if ((recv_count =
+                    stream->recv_n
+                    (m3->getObjectPtr()->get_data_ptr(),
+                    sizeof(float)*tdims[0]*tdims[1])) <= 0) {
+
+                        ACE_DEBUG ((LM_ERROR,
+                            ACE_TEXT ("(%P|%t) Unable to read trajectory data\n")));
+
+                        m1->release();
+
+                        return 0;
+                }
+
+            }
+            // The sample array is sized number_of_samples x active_channels complex floats.
+            std::vector<size_t> adims;
+            adims.push_back(m1->getObjectPtr()->number_of_samples);
+            adims.push_back(m1->getObjectPtr()->active_channels);
+
+            try{ m2->getObjectPtr()->create(&adims); }
+            catch (std::runtime_error &err ){
+                GADGET_DEBUG_EXCEPTION(err,"(%P|%t) Allocate sample data\n")
+                    m1->release();
+
+                return 0;
+            }
+
+            if ((recv_count =
+                stream->recv_n
+                (m2->getObjectPtr()->get_data_ptr(),
+                sizeof(std::complex<float>)*adims[0]*adims[1])) <= 0) {
+
+                    ACE_DEBUG ((LM_ERROR,
+                        ACE_TEXT ("(%P|%t) Unable to read Acq data\n")));
+
+                    m1->release();
+
+                    return 0;
+            }
+
+            return m1;
+        }
+
+    };
+
+#ifndef EXCLUDE_ISMRMRD_XSD
+    // Parses an ISMRMRD XML header, pointing the parser at $GADGETRON_HOME/schema/ismrmrd.xsd as the schema location.
+    // Returns an empty pointer when GADGETRON_HOME is unset or the XML cannot be parsed.
+    inline boost::shared_ptr<ISMRMRD::ismrmrdHeader> parseIsmrmrdXMLHeader(std::string xml) {
+        boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg;
+
+        // getenv yields NULL for an unset variable; feeding that to a "%s" conversion is undefined behaviour.
+        const char * gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+        if (!gadgetron_home) {
+            GADGET_DEBUG1("GADGETRON_HOME is not set, unable to locate the ISMRMRD schema\n");
+            return cfg;
+        }
+
+        // Assemble the path in a std::string: a fixed 4096-char buffer could overflow or be left unterminated by strncpy.
+        std::string schema_file_name = std::string(gadgetron_home) + "/schema/ismrmrd.xsd";
+        schema_file_name = url_encode(schema_file_name);
+
+        xml_schema::properties props;
+        props.schema_location ("http://www.ismrm.org/ISMRMRD", schema_file_name);
+
+        std::istringstream str_stream(xml, std::stringstream::in);
+        try {
+            cfg = boost::shared_ptr<ISMRMRD::ismrmrdHeader>(ISMRMRD::ismrmrdHeader_ (str_stream,0,props));
+        }  catch (const xml_schema::exception& e) {
+            GADGET_DEBUG2("Failed to parse XML Parameters: %s\n", e.what());
+        }
+        return cfg;
+    }
+#endif
+}
+#endif //GADGETISMRMRDREADWRITE_H
diff --git a/gadgets/mri_core/GadgetMRIHeaders.h b/gadgets/mri_core/GadgetMRIHeaders.h
new file mode 100644
index 0000000..f235df7
--- /dev/null
+++ b/gadgets/mri_core/GadgetMRIHeaders.h
@@ -0,0 +1,128 @@
+#ifndef GADGETMRIHEADERS_H
+#define GADGETMRIHEADERS_H
+
+#include <ace/Basic_Types.h>
+
+//Data flags
+/*
+#define GADGET_FLAG_ACQ_END                   (1 << 0)
+#define GADGET_FLAG_LAST_ACQ_IN_SLICE         (1 << 1)
+#define GADGET_FLAG_LAST_ACQ_IN_MEAS          (1 << 2)
+#define GADGET_FLAG_LAST_ACQ_IN_CONCAT        (1 << 3)
+#define GADGET_FLAG_FIRST_ACQ_IN_SLICE        (1 << 4)
+#define GADGET_FLAG_FIRST_ACQ_IN_MEAS         (1 << 5)
+#define GADGET_FLAG_FIRST_ACQ_IN_CONCAT       (1 << 6)
+#define GADGET_FLAG_IS_NOISE_SCAN             (1 << 7)
+#define GADGET_FLAG_IS_PATREF_SCAN            (1 << 8)
+#define GADGET_FLAG_IS_PATREFANDIMA_SCAN      (1 << 9)
+
+#define GADGET_FLAG_LAST_IMAGE                (1 << 0)
+
+enum GadgetImageFormats {
+	GADGET_IMAGE_COMPLEX_FLOAT = 0,
+	GADGET_IMAGE_REAL_FLOAT,
+	GADGET_IMAGE_REAL_UNSIGNED_SHORT
+};
+
+enum GadgetImageTypes
+{
+	GADGET_IMAGE_MAGNITUDE = 0,
+	GADGET_IMAGE_PHASE,
+	GADGET_IMAGE_REAL,
+	GADGET_IMAGE_IMAG
+};
+*/
+
+namespace Gadgetron{
+
+enum GadgetMessageID { // identifiers that the writers in this directory send ahead of each message payload
+  GADGET_MESSAGE_EXT_ID_MIN                = 1000, // presumably the lower bound of this id range -- TODO confirm
+  GADGET_MESSAGE_ACQUISITION               = 1001, /**< DEPRECATED */
+  GADGET_MESSAGE_NEW_MEASUREMENT           = 1002, /**< DEPRECATED */
+  GADGET_MESSAGE_END_OF_SCAN               = 1003, /**< DEPRECATED */
+  GADGET_MESSAGE_IMAGE_CPLX_FLOAT          = 1004, /**< DEPRECATED */
+  GADGET_MESSAGE_IMAGE_REAL_FLOAT          = 1005, /**< DEPRECATED */
+  GADGET_MESSAGE_IMAGE_REAL_USHORT         = 1006, /**< DEPRECATED */
+  GADGET_MESSAGE_EMPTY                     = 1007, /**< DEPRECATED */
+  GADGET_MESSAGE_ISMRMRD_ACQUISITION       = 1008,
+  GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT  = 1009,
+  GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT  = 1010,
+  GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT = 1011,
+  GADGET_MESSAGE_DICOM                     = 1012,
+  GADGET_MESSAGE_CLOUD_JOB                 = 1013,
+  GADGET_MESSAGE_GADGETCLOUD_JOB           = 1014,
+  GADGET_MESSAGE_EXT_ID_MAX                = 4096 // presumably the upper bound of this id range -- TODO confirm
+};
+
+
+/*
+struct ISMRMRD::ImageHeader
+{
+  ACE_UINT32     flags;
+  ACE_UINT16     matrix_size[3];
+  ACE_UINT16     channels;
+  float          position[3];
+  float          quaternion[4];
+  float			 table_position;
+  ACE_UINT16     slice;
+  ACE_UINT16     contrast;
+  ACE_UINT16     set;
+  ACE_UINT16     phase;
+  ACE_UINT16     average;
+  ACE_UINT16     repetition;
+  ACE_UINT32     time_stamp;
+  ACE_UINT32     pmu_time_stamp;
+  ACE_UINT16     image_format;
+  ACE_UINT16     image_type;
+  ACE_UINT16     image_index;
+  ACE_UINT16	 image_series_index;
+
+  ACE_UINT16 get_matrix_size(unsigned int index) {
+    if (index < 3) {
+      return matrix_size[index];
+    } else {
+      return 0;
+    }
+  }
+
+  void set_matrix_size(unsigned int index, ACE_UINT16 size) {
+    if (index < 3) {
+      matrix_size[index] = size;
+    }
+  }
+
+  float get_position(unsigned int index) {
+    if (index < 3) {
+      return position[index];
+    } else {
+      return 0.0f;
+    }
+  }
+
+  void set_position(unsigned int index, float pos)
+  {
+    if (index < 3) {
+      position[index] = pos;
+    }
+  }
+
+  float get_quaternion(unsigned int index) {
+    if (index < 4) {
+      return quaternion[index];
+    } else {
+      return 0.0f;
+    }
+  }
+
+  void set_quaternion(unsigned int index, float quar)
+  {
+    if (index < 4) {
+      quaternion[index] = quar;
+    }
+  }
+
+}; 
+*/
+}
+
+#endif  //GADGETMRIHEADERS_H
diff --git a/gadgets/mri_core/ImageFinishGadget.cpp b/gadgets/mri_core/ImageFinishGadget.cpp
new file mode 100644
index 0000000..c5b1752
--- /dev/null
+++ b/gadgets/mri_core/ImageFinishGadget.cpp
@@ -0,0 +1,51 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ImageFinishGadget.h"
+namespace Gadgetron{
+// Prepends a GadgetMessageIdentifier (chosen from sizeof(T)) to the image message m1 and hands the chain to the controller.
+template <typename T>
+int ImageFinishGadget<T>
+::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< T > >* m2)
+{
+  if (!this->controller_) {
+    ACE_DEBUG( (LM_DEBUG, 
+		ACE_TEXT("Cannot return result to controller, no controller set")) );
+    return -1;
+  }
+
+  GadgetContainerMessage<GadgetMessageIdentifier>* mb =
+    new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+  switch (sizeof(T)) {
+  case 2: //Unsigned short
+	  mb->getObjectPtr()->id = GADGET_MESSAGE_IMAGE_REAL_USHORT;
+	  break;
+  case 4: //Float
+	  mb->getObjectPtr()->id = GADGET_MESSAGE_IMAGE_REAL_FLOAT;
+	  break;
+  case 8: //Complex float
+	  mb->getObjectPtr()->id = GADGET_MESSAGE_IMAGE_CPLX_FLOAT;
+	  break;
+  default:
+	  // sizeof yields a size_t; cast it so the argument really matches the %d conversion.
+	  GADGET_DEBUG2("Wrong data size detected: %d\n", static_cast<int>(sizeof(T)));
+	  mb->release();
+	  m1->release();
+	  return GADGET_FAIL;
+  }
+
+  mb->cont(m1); // the identifier becomes the head of the chain, the image message follows it
+  int ret =  this->controller_->output_ready(mb);
+  if (ret < 0) {
+	  GADGET_DEBUG1("Failed to return message to controller\n");
+	  return GADGET_FAIL;
+  }
+
+  return GADGET_OK;
+}
+
+//Declare factories for the various template instances
+GADGET_FACTORY_DECLARE(ImageFinishGadgetFLOAT)
+GADGET_FACTORY_DECLARE(ImageFinishGadgetUSHORT)
+GADGET_FACTORY_DECLARE(ImageFinishGadgetCPLX)
+}
diff --git a/gadgets/mri_core/ImageFinishGadget.h b/gadgets/mri_core/ImageFinishGadget.h
new file mode 100644
index 0000000..da86e96
--- /dev/null
+++ b/gadgets/mri_core/ImageFinishGadget.h
@@ -0,0 +1,45 @@
+#ifndef IMAGEFINISHGADGET_H
+#define IMAGEFINISHGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "GadgetStreamController.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  template <typename T> class EXPORTGADGETSMRICORE ImageFinishGadget : // T is the element type of the image array
+  public Gadget2<ISMRMRD::ImageHeader,hoNDArray< T > >
+  {
+  protected:
+    virtual int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1, // m1: image header
+			GadgetContainerMessage< hoNDArray< T > >* m2); // m2: image data of element type T
+  };
+  
+  class EXPORTGADGETSMRICORE ImageFinishGadgetUSHORT : // ImageFinishGadget instantiated for ACE_UINT16 images
+  public ImageFinishGadget<ACE_UINT16>
+  {
+  public:
+    GADGET_DECLARE(ImageFinishGadgetUSHORT);
+  };
+
+  class EXPORTGADGETSMRICORE ImageFinishGadgetFLOAT : // ImageFinishGadget instantiated for float images
+  public ImageFinishGadget<float>
+  {
+  public:
+    GADGET_DECLARE(ImageFinishGadgetFLOAT);
+  };
+
+  class EXPORTGADGETSMRICORE ImageFinishGadgetCPLX : // ImageFinishGadget instantiated for std::complex<float> images
+  public ImageFinishGadget< std::complex<float> >
+  {
+  public:
+    GADGET_DECLARE(ImageFinishGadgetCPLX);
+  };
+}
+
+#endif //IMAGEFINISHGADGET_H
diff --git a/gadgets/mri_core/ImageWriterGadget.cpp b/gadgets/mri_core/ImageWriterGadget.cpp
new file mode 100644
index 0000000..1f3ecde
--- /dev/null
+++ b/gadgets/mri_core/ImageWriterGadget.cpp
@@ -0,0 +1,52 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "ImageWriterGadget.h"
+
+#include <fstream>
+namespace Gadgetron{
+// Writes the image array to "out_<call number>.<ext>" as: int ndim, ndim int sizes, raw elements; then forwards m1.
+template<typename T>
+int ImageWriterGadget<T> ::
+process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1,
+	 GadgetContainerMessage< hoNDArray< T > >* m2)
+{
+    GADGET_DEBUG1("Writing image\n");
+
+    // The file extension follows the element size; any unrecognised size is treated like complex float.
+    const char* extension = "cplx";
+    switch (sizeof(T)) {
+     case (4): //Real floats
+ 		extension = "real";
+ 		break;
+     case (2): //Unsigned short
+ 		extension = "short";
+ 		break;
+     default: break; //Complex float (8) and everything else keep "cplx"
+     }
+    char filename[1024];
+    sprintf(filename, "out_%05d.%s", (int)this->calls_, extension);
+
+    std::ofstream outfile(filename, std::ios::out|std::ios::binary);
+    if (!outfile.is_open()) {
+      // A failed open used to go unnoticed: every write below was a silent no-op and the image was never saved.
+      GADGET_DEBUG2("Unable to open %s for writing, image not saved\n", filename);
+    } else {
+      int ndim = m2->getObjectPtr()->get_number_of_dimensions();
+      std::vector<int> dims(ndim); // owns the size table, so no manual delete[] is needed
+      size_t elements = 1;
+      for (int d = 0; d < ndim; d++) {
+        dims[d] = m2->getObjectPtr()->get_size(d);
+        elements *= dims[d];
+      }
+      outfile.write((char*)&ndim,sizeof(int));
+      if (ndim > 0) outfile.write((char*)&dims[0],sizeof(int)*ndim);
+      outfile.write((char*)m2->getObjectPtr()->get_data_ptr(),sizeof(T)*elements);
+      outfile.close();
+    }
+    this->calls_++;
+    return this->next()->putq(m1); // best effort: the image is forwarded even when the dump could not be written
+}
+
+GADGET_FACTORY_DECLARE(ImageWriterGadgetUSHORT)
+GADGET_FACTORY_DECLARE(ImageWriterGadgetFLOAT)
+GADGET_FACTORY_DECLARE(ImageWriterGadgetCPLX)
+}
diff --git a/gadgets/mri_core/ImageWriterGadget.h b/gadgets/mri_core/ImageWriterGadget.h
new file mode 100644
index 0000000..9ee52eb
--- /dev/null
+++ b/gadgets/mri_core/ImageWriterGadget.h
@@ -0,0 +1,50 @@
+#ifndef IMAGEWRITERGADGET_H
+#define IMAGEWRITERGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+  
+  template <typename T> class ImageWriterGadget : // T is the element type of the image array that gets written to disk
+  public Gadget2<ISMRMRD::ImageHeader, hoNDArray< T > >
+  {
+    public:
+      
+    ImageWriterGadget()
+      : calls_(0)
+	{}
+      
+    protected:
+      virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader>* m1, // m1: image header
+			   GadgetContainerMessage< hoNDArray< T > >* m2); // m2: image data
+      
+      long calls_; // incremented once per process() call; the .cpp uses it to number the output files
+  };
+  
+  class EXPORTGADGETSMRICORE ImageWriterGadgetUSHORT : // ImageWriterGadget instantiated for ACE_UINT16 images
+  public ImageWriterGadget<ACE_UINT16>
+  {
+  public:
+    GADGET_DECLARE(ImageWriterGadgetUSHORT)
+  };
+
+  class EXPORTGADGETSMRICORE ImageWriterGadgetFLOAT : // ImageWriterGadget instantiated for float images
+  public ImageWriterGadget<float>
+  {
+  public:
+    GADGET_DECLARE(ImageWriterGadgetFLOAT)
+  };
+
+  class EXPORTGADGETSMRICORE ImageWriterGadgetCPLX : // ImageWriterGadget instantiated for std::complex<float> images
+  public ImageWriterGadget< std::complex<float> >
+  {
+  public:
+    GADGET_DECLARE(ImageWriterGadgetCPLX)
+  };
+}
+#endif //IMAGEWRITERGADGET_H
diff --git a/gadgets/mri_core/IsmrmrdDumpGadget.cpp b/gadgets/mri_core/IsmrmrdDumpGadget.cpp
new file mode 100644
index 0000000..7d55225
--- /dev/null
+++ b/gadgets/mri_core/IsmrmrdDumpGadget.cpp
@@ -0,0 +1,134 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "IsmrmrdDumpGadget.h"
+#include "Gadgetron.h"
+namespace Gadgetron{
+
+
+  // Formats the current local time as "YYYYMMDD-HHMMSS".
+  std::string get_date_time_string()
+  {
+    time_t now;
+    time ( &now );
+    struct tm * local = localtime ( &now );
+
+    // The fill character persists once set; the field width has to be requested again before every two-digit value.
+    std::stringstream formatted;
+    formatted << std::setfill('0');
+    formatted << local->tm_year+1900;
+    formatted << std::setw(2) << local->tm_mon+1;
+    formatted << std::setw(2) << local->tm_mday;
+    formatted << "-";
+    formatted << std::setw(2) << local->tm_hour;
+    formatted << std::setw(2) << local->tm_min;
+    formatted << std::setw(2) << local->tm_sec;
+
+
+    return formatted.str();
+  }
+
+  IsmrmrdDumpGadget::IsmrmrdDumpGadget() // sets member defaults and registers the two parameters process_config() reads back
+    : Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >()
+    , file_prefix_("ISMRMRD_DUMP")
+    , ismrmrd_file_name_("ISMRMRD_DUMP.h5") //This will be reset during configuration
+    , append_timestamp_(true)
+  {
+    set_parameter("file_prefix","ISMRMRD_DUMP",false); // NOTE(review): meaning of the third argument is not visible here -- confirm in Gadget.h
+    set_parameter("append_timestamp","1",false);
+
+  }
+
+int IsmrmrdDumpGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  // Copies the acquisition (header, samples, optional trajectory) into the dataset, then forwards the messages unchanged.
+
+  ISMRMRD::Acquisition ismrmrd_acq;
+
+  ismrmrd_acq.setHead(*m1->getObjectPtr());
+  
+  std::valarray<float> d(reinterpret_cast<float*>(m2->getObjectPtr()->get_data_ptr()), // complex samples viewed as pairs of floats
+			 m2->getObjectPtr()->get_number_of_elements()*2);
+
+  ismrmrd_acq.setData(d);
+
+  if (m2->cont()) {
+    //Write trajectory
+    if (ismrmrd_acq.getTrajectoryDimensions() == 0) {
+      GADGET_DEBUG1("Malformed dataset. Trajectory attached but trajectory dimensions == 0\n");
+      return GADGET_FAIL;
+    }
+    
+    GadgetContainerMessage< hoNDArray<float> >* m3 = AsContainerMessage< hoNDArray<float> >(m2->cont());
+
+    if (!m3) {
+      GADGET_DEBUG1("Error casting trajectory data package");
+      return GADGET_FAIL;
+    } 
+
+    std::valarray<float> t(m3->getObjectPtr()->get_data_ptr(),
+			   m3->getObjectPtr()->get_number_of_elements());
+    
+    ismrmrd_acq.setTraj(t);
+
+  } else {
+    if (ismrmrd_acq.getTrajectoryDimensions() != 0) {
+      GADGET_DEBUG1("Malformed dataset. Trajectory dimensions not zero but no trajectory attached\n");
+      return GADGET_FAIL;
+    }
+  }
+
+  // The lock object is scoped to this block so it is held only while the acquisition is appended.
+  {
+    ISMRMRD::HDF5Exclusive lock;
+    if (ismrmrd_dataset_->appendAcquisition(&ismrmrd_acq) < 0) {
+      GADGET_DEBUG1("Error appending ISMRMRD Dataset\n");
+      return GADGET_FAIL;
+    }
+  }
+
+
+  //It is enough to put the first one, since they are linked
+  if (this->next()->putq(m1) == -1) {
+    m1->release();
+    ACE_ERROR_RETURN( (LM_ERROR,
+		       ACE_TEXT("%p\n"),
+		       ACE_TEXT("IsmrmrdDumpGadget::process, passing data on to next gadget")),
+		      -1);
+  }
+
+  return 0;
+}
+
+int IsmrmrdDumpGadget
+::process_config(ACE_Message_Block* mb)
+{
+  // Reads the gadget parameters, opens the output dataset and stores the XML configuration in it as the header.
+  file_prefix_ = *(get_string_value("file_prefix").get());
+  append_timestamp_ = (get_int_value("append_timestamp") > 0);
+
+  //Generate filename
+  if (append_timestamp_) {
+    ismrmrd_file_name_ = file_prefix_ + std::string("_") + get_date_time_string() + std::string(".h5");
+  } else {
+    ismrmrd_file_name_ = file_prefix_ + std::string(".h5");
+  }
+  
+  // The lock is declared before the dataset is created and stays in scope until the function returns.
+  ISMRMRD::HDF5Exclusive lock; //This will ensure threadsafe access to HDF5
+  ismrmrd_dataset_ = boost::shared_ptr<ISMRMRD::IsmrmrdDataset>(new ISMRMRD::IsmrmrdDataset(ismrmrd_file_name_.c_str(), "dataset"));
+  // mb's read pointer is treated as a NUL-terminated XML string.
+  std::string xml_config(mb->rd_ptr());
+
+  if (ismrmrd_dataset_->writeHeader(xml_config) < 0 ) {
+    GADGET_DEBUG1("Failed to write XML header to HDF file\n");
+    return GADGET_FAIL;
+  }
+ 
+  return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(IsmrmrdDumpGadget)
+}
+
+
diff --git a/gadgets/mri_core/IsmrmrdDumpGadget.h b/gadgets/mri_core/IsmrmrdDumpGadget.h
new file mode 100644
index 0000000..26ea993
--- /dev/null
+++ b/gadgets/mri_core/IsmrmrdDumpGadget.h
@@ -0,0 +1,36 @@
+#ifndef ISMRMRDDUMPGADGET_H
+#define ISMRMRDDUMPGADGET_H
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <ismrmrd_hdf5.h>
+
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE IsmrmrdDumpGadget : // gadget that appends every acquisition it receives to an ISMRMRD dataset file
+  public Gadgetron::Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(IsmrmrdDumpGadget);
+
+      IsmrmrdDumpGadget();
+
+    protected:
+      virtual int process_config(ACE_Message_Block* mb); // opens the dataset and writes the XML header
+
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1, // m1: acquisition header
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2); // m2: complex samples
+
+    private:
+      std::string file_prefix_;       // "file_prefix" parameter, default "ISMRMRD_DUMP"
+      std::string ismrmrd_file_name_; // full output file name, rebuilt in process_config()
+      boost::shared_ptr<ISMRMRD::IsmrmrdDataset>  ismrmrd_dataset_; // created in process_config()
+      bool append_timestamp_;         // when true, a date/time string is appended to the file name
+    };
+}
+#endif //ISMRMRDDUMPGADGET_H
diff --git a/gadgets/mri_core/MRIImageWriter.cpp b/gadgets/mri_core/MRIImageWriter.cpp
new file mode 100644
index 0000000..c684ce0
--- /dev/null
+++ b/gadgets/mri_core/MRIImageWriter.cpp
@@ -0,0 +1,89 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "MRIImageWriter.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+// Sends an image over the socket as: message id (chosen from sizeof(T)), ISMRMRD::ImageHeader, raw pixel data. Returns 0 or -1.
+template <typename T>
+int MRIImageWriter<T>::write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb)
+{
+	GadgetContainerMessage<ISMRMRD::ImageHeader>* imagemb =
+			AsContainerMessage<ISMRMRD::ImageHeader>(mb);
+
+	if (!imagemb) {
+		ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), MRIImageWriter::write, invalid image message objects, 1\n")) );
+		return -1;
+	}
+
+	GadgetContainerMessage< hoNDArray< T > >* datamb =
+			AsContainerMessage< hoNDArray< T > >(imagemb->cont());
+
+	if (!datamb) {
+		ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), MRIImageWriter::write, invalid image message objects\n")) );
+		return -1;
+	}
+
+	ssize_t send_cnt = 0;
+	GadgetMessageIdentifier id;
+	switch (sizeof(T)) {
+	case 2: //Unsigned short
+		id.id = GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT;
+		break;
+	case 4: //Float
+		id.id = GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT;
+		break;
+	case 8: //Complex float
+		id.id = GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT;
+		break;
+	default:
+		ACE_DEBUG ((LM_ERROR,
+				ACE_TEXT ("(%P|%t) MRIImageWriter Wrong data size detected\n")));
+		return GADGET_FAIL;
+	}
+
+	//Check that the header agrees with the array size before sending; the first factor is widened so the product is not evaluated in int (which could overflow):
+	unsigned long expected_elements = static_cast<unsigned long>(imagemb->getObjectPtr()->matrix_size[0])*
+			imagemb->getObjectPtr()->matrix_size[1] *  imagemb->getObjectPtr()->matrix_size[2];
+
+	if (expected_elements !=  datamb->getObjectPtr()->get_number_of_elements()) {
+		// Every argument paired with %d is cast to int so that its type really matches the conversion.
+		GADGET_DEBUG2("Number of header elements %d is inconsistent with number of elements in NDArray %d\n",static_cast<int>(expected_elements), static_cast<int>(datamb->getObjectPtr()->get_number_of_elements()));
+		GADGET_DEBUG2("Header dimensions: %d, %d, %d\n",imagemb->getObjectPtr()->matrix_size[0],imagemb->getObjectPtr()->matrix_size[1],imagemb->getObjectPtr()->matrix_size[2]);
+		GADGET_DEBUG2("Number of array dimensions: %d:\n", static_cast<int>(datamb->getObjectPtr()->get_number_of_dimensions()));
+		for (size_t i = 0; i < datamb->getObjectPtr()->get_number_of_dimensions(); i++) {
+			GADGET_DEBUG2("Dimensions %d: %d\n", static_cast<int>(i), static_cast<int>(datamb->getObjectPtr()->get_size(i)));
+		}
+		return -1;
+	}
+	if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+		ACE_DEBUG ((LM_ERROR,
+				ACE_TEXT ("(%P|%t) Unable to send image message identifier\n")));
+
+		return -1;
+	}
+
+	if ((send_cnt = sock->send_n (imagemb->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0) {
+		ACE_DEBUG ((LM_ERROR,
+				ACE_TEXT ("(%P|%t) Unable to send image header\n")));
+
+		return -1;
+	}
+
+	if ((send_cnt = sock->send_n (datamb->getObjectPtr()->get_data_ptr(), sizeof(T)*datamb->getObjectPtr()->get_number_of_elements())) <= 0) {
+		ACE_DEBUG ((LM_ERROR,
+				ACE_TEXT ("(%P|%t) Unable to send image data\n")));
+
+		return -1;
+	}
+
+	return 0;
+}
+
+GADGETRON_WRITER_FACTORY_DECLARE(MRIImageWriterFLOAT)
+GADGETRON_WRITER_FACTORY_DECLARE(MRIImageWriterUSHORT)
+GADGETRON_WRITER_FACTORY_DECLARE(MRIImageWriterCPLX)
+}
diff --git a/gadgets/mri_core/MRIImageWriter.h b/gadgets/mri_core/MRIImageWriter.h
new file mode 100644
index 0000000..ef13221
--- /dev/null
+++ b/gadgets/mri_core/MRIImageWriter.h
@@ -0,0 +1,37 @@
+#ifndef MRIIMAGEWRITER_H
+#define MRIIMAGEWRITER_H
+
+#include "GadgetMessageInterface.h"
+#include "GadgetMRIHeaders.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  template<typename T> class MRIImageWriter : public GadgetMessageWriter // T is the element type of the image array being sent
+  {
+  public:
+    virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb); // defined in MRIImageWriter.cpp; returns 0 on success
+  };
+
+  class EXPORTGADGETSMRICORE MRIImageWriterUSHORT : public MRIImageWriter<ACE_UINT16> // writer for ACE_UINT16 images
+  {
+  public:
+    GADGETRON_WRITER_DECLARE(GadgetMessageWriterUSHORT); // NOTE(review): argument differs from the class name -- confirm the macro ignores it
+  };
+
+  class EXPORTGADGETSMRICORE MRIImageWriterFLOAT : public MRIImageWriter<float> // writer for float images
+  {
+  public:
+    GADGETRON_WRITER_DECLARE(GadgetMessageWriterFLOAT); // NOTE(review): argument differs from the class name -- confirm the macro ignores it
+  };
+
+  class EXPORTGADGETSMRICORE MRIImageWriterCPLX : public MRIImageWriter< std::complex<float> > // writer for complex float images
+  {
+  public:
+    GADGETRON_WRITER_DECLARE(GadgetMessageWriterCPLX); // NOTE(review): argument differs from the class name -- confirm the macro ignores it
+  };
+}
+#endif
diff --git a/gadgets/mri_core/MaxwellCorrectionGadget.cpp b/gadgets/mri_core/MaxwellCorrectionGadget.cpp
new file mode 100644
index 0000000..86bb75e
--- /dev/null
+++ b/gadgets/mri_core/MaxwellCorrectionGadget.cpp
@@ -0,0 +1,144 @@
+#include "MaxwellCorrectionGadget.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetronTimer.h"
+#include "Spline.h"
+
+#include <numeric>
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  #ifdef M_PI
+    #undef M_PI
+  #endif // M_PI
+  #define M_PI 3.14159265358979323846
+
+  MaxwellCorrectionGadget::MaxwellCorrectionGadget()
+    : maxwell_coefficients_(4,0)            // four coefficients, all zero until process_config() fills them in
+    , maxwell_coefficients_present_(false)  // initializers listed in member declaration order
+  {
+  }
+
+  MaxwellCorrectionGadget::~MaxwellCorrectionGadget() {} // empty body: no explicit cleanup is performed here
+
+  int MaxwellCorrectionGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Start parsing the ISMRMRD XML header
+    // and pick the four "MaxwellCoefficient_<n>" entries out of its userParameterDouble list.
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    if( cfg.get() == 0x0 ){
+      GADGET_DEBUG1("Unable to parse Ismrmrd header\n");
+      return GADGET_FAIL;
+    }
+
+    if (cfg->userParameters().present()) {
+      for (ISMRMRD::userParameters::userParameterDouble_iterator 
+	     i (cfg->userParameters().get().userParameterDouble().begin ()); i != cfg->userParameters().get().userParameterDouble().end(); ++i) {
+	if (std::strcmp(i->name().c_str(),"MaxwellCoefficient_0") == 0) {
+	  maxwell_coefficients_[0] = i->value();
+	} else if (std::strcmp(i->name().c_str(),"MaxwellCoefficient_1") == 0) {
+	  maxwell_coefficients_[1] = i->value();
+	} else if (std::strcmp(i->name().c_str(),"MaxwellCoefficient_2") == 0) {
+	  maxwell_coefficients_[2] = i->value();
+	} else if (std::strcmp(i->name().c_str(),"MaxwellCoefficient_3") == 0) {
+	  maxwell_coefficients_[3] = i->value();
+	} else {
+	  GADGET_DEBUG2("WARNING: unused user parameter parameter %s found\n", i->name().c_str());
+	}
+      }
+    } else {
+      GADGET_DEBUG1("MaxwellCorrection coefficients are supposed to be in the UserParameters. No user parameter section found\n");
+      return GADGET_OK;
+    }
+    // NOTE(review): the flag is set whenever a userParameters section exists, even if none of the four coefficients was found in it.
+    maxwell_coefficients_present_ = true;
+
+    GADGET_DEBUG2("Maxwell Coefficients: %f, %f, %f, %f\n", maxwell_coefficients_[0], maxwell_coefficients_[1], maxwell_coefficients_[2], maxwell_coefficients_[3]);
+
+    return GADGET_OK;
+  }
+
+  // Computes a per-voxel Maxwell phase term from the configured coefficients, then forwards the image to the next gadget.
+  int MaxwellCorrectionGadget::
+  process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    if (maxwell_coefficients_present_) {
+      //GADGET_DEBUG1("Got coefficients\n");
+
+      int Nx = m2->getObjectPtr()->get_size(0);
+      int Ny = m2->getObjectPtr()->get_size(1);
+      int Nz = m2->getObjectPtr()->get_size(2);
+
+      float dx = m1->getObjectPtr()->field_of_view[0] / Nx;
+      float dy = m1->getObjectPtr()->field_of_view[1] / Ny;
+      float dz = m1->getObjectPtr()->field_of_view[2] / Nz;
+
+      /*
+      GADGET_DEBUG2("Nx = %d, Ny = %d, Nz = %d\n", Nx, Ny, Nz);
+      GADGET_DEBUG2("dx = %f, dy = %f, dz = %f\n", dx, dy, dz);
+      GADGET_DEBUG2("img_pos_x = %f, img_pos_y = %f, img_pos_z = %f\n", m1->getObjectPtr()->position[0], m1->getObjectPtr()->position[1], m1->getObjectPtr()->position[2]);
+      */
+
+      std::vector<float> dR(3,0);
+      std::vector<float> dP(3,0);
+      std::vector<float> dS(3,0);
+      std::vector<float> p(3,0);
+
+      for (int z = 0; z < Nz; z++) {
+	for (int y = 0; y < Ny; y++) {
+	  for (int x = 0; x < Nx; x++) {
+	   
+	    dR[0] = (x-Nx/2+0.5) * dx * m1->getObjectPtr()->read_dir[0];
+	    dR[1] = (x-Nx/2+0.5) * dx * m1->getObjectPtr()->read_dir[1];
+	    dR[2] = (x-Nx/2+0.5) * dx * m1->getObjectPtr()->read_dir[2];
+	    
+	    dP[0] = (y-Ny/2+0.5) * dy * m1->getObjectPtr()->phase_dir[0];
+	    dP[1] = (y-Ny/2+0.5) * dy * m1->getObjectPtr()->phase_dir[1];
+	    dP[2] = (y-Ny/2+0.5) * dy * m1->getObjectPtr()->phase_dir[2];
+	    
+	    if (Nz > 1) {
+	      dS[0] = (z-Nz/2+0.5) * dz * m1->getObjectPtr()->slice_dir[0];
+	      dS[1] = (z-Nz/2+0.5) * dz * m1->getObjectPtr()->slice_dir[1];
+	      dS[2] = (z-Nz/2+0.5) * dz * m1->getObjectPtr()->slice_dir[2];
+	    }
+	    // Voxel position = image position + offsets along the phase, read and slice directions, component by component.
+	    p[0] = m1->getObjectPtr()->position[0] + dP[0] + dR[0] + dS[0];
+	    p[1] = m1->getObjectPtr()->position[1] + dP[1] + dR[1] + dS[1];
+	    p[2] = m1->getObjectPtr()->position[2] + dP[2] + dR[2] + dS[2]; // was dP[1]: the third component must use the third phase offset
+
+	    //Scale by 1/1000 (millimetres to metres, assuming the header stores positions in mm -- TODO confirm)
+	    p[0] = p[0]/1000.0;
+	    p[1] = p[1]/1000.0;
+	    p[2] = p[2]/1000.0;
+
+	    float delta_phi = maxwell_coefficients_[0]*p[2]*p[2] +
+	      maxwell_coefficients_[1]*(p[0]*p[0] + p[1]*p[1]) + 
+	      maxwell_coefficients_[2]*p[0]*p[2] + 
+	      maxwell_coefficients_[3]*p[1]*p[2];
+
+	    long index = z*Ny*Nx+y*Nx+x;
+	    std::complex<float>* data_ptr = m2->getObjectPtr()->get_data_ptr();
+
+	    std::complex<float> correction = std::polar(1.0f,static_cast<float>(2*M_PI*delta_phi));
+	    // NOTE(review): the correction is computed but its application below is commented out, so the image is forwarded unmodified.
+	    //data_ptr[index] *= correction;
+	  }
+	}
+      }
+    }
+
+    if (this->next()->putq(m1) < 0) {
+      GADGET_DEBUG1("Unable to put data on next Gadgets Q\n");
+      return GADGET_FAIL;
+    }
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(MaxwellCorrectionGadget)
+}
diff --git a/gadgets/mri_core/MaxwellCorrectionGadget.h b/gadgets/mri_core/MaxwellCorrectionGadget.h
new file mode 100644
index 0000000..931eea2
--- /dev/null
+++ b/gadgets/mri_core/MaxwellCorrectionGadget.h
@@ -0,0 +1,35 @@
+#ifndef MaxwellCorrectionGadget_H
+#define MaxwellCorrectionGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{  
+
+    /// Gadget that receives image messages: an ISMRMRD::ImageHeader followed by a
+    /// complex<float> pixel array.
+    /// NOTE(review): the name suggests a Maxwell (concomitant field) phase correction;
+    /// the behaviour is defined in MaxwellCorrectionGadget.cpp - confirm there.
+    class EXPORTGADGETSMRICORE MaxwellCorrectionGadget :
+        public Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+
+    public:
+        GADGET_DECLARE(MaxwellCorrectionGadget);
+        MaxwellCorrectionGadget();
+        virtual ~MaxwellCorrectionGadget();
+
+
+    protected:
+        /// Configuration hook; receives the raw configuration message block.
+        virtual int process_config(ACE_Message_Block* mb);
+        /// Per-image hook.
+        /// @param m1 image header message
+        /// @param m2 complex pixel data belonging to m1
+        virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+	
+    private:
+	// Coefficients of the phase polynomial evaluated per pixel in process().
+	// NOTE(review): std::vector is used here but <vector> is not included directly
+	// by this header; it presumably arrives through the project headers - confirm.
+	std::vector<double> maxwell_coefficients_;
+	// presumably true once the coefficients have been read from the configuration - verify in the .cpp
+	bool maxwell_coefficients_present_;
+    };
+}
+
+#endif //MaxwellCorrectionGadget_H
diff --git a/gadgets/mri_core/NoiseAdjustGadget.cpp b/gadgets/mri_core/NoiseAdjustGadget.cpp
new file mode 100644
index 0000000..c9c6762
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget.cpp
@@ -0,0 +1,130 @@
+#include "NoiseAdjustGadget.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_elemwise.h"
+
+namespace Gadgetron{
+
+  // Give every scalar member a defined value, listed in the order the members are
+  // declared in the class, so that process() never reads an indeterminate float even
+  // when it runs before process_config() or before the first noise readout.
+  // receiver_noise_bandwidth_ starts at 1.0, the same fallback process_config() uses.
+  NoiseAdjustGadget::NoiseAdjustGadget()
+  : noise_decorrelation_calculated_(false)
+  , number_of_noise_samples_(0)
+  , noise_dwell_time_us_(0.0f)
+  , acquisition_dwell_time_us_(0.0f)
+  , noise_bw_scale_factor_(1.0f)
+  , receiver_noise_bandwidth_(1.0f)
+  , is_configured_(false)
+  {
+  }
+
+  // Reads the relative receiver noise bandwidth from the ISMRMRD XML header.
+  // @param mb  message block whose read pointer addresses the XML header text
+  // @return GADGET_OK
+  int NoiseAdjustGadget::process_config(ACE_Message_Block* mb)
+  {
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    // Both acquisitionSystemInformation and relativeReceiverNoiseBandwidth are optional
+    // elements. Check each one before dereferencing it with get(); fall back to 1.0
+    // when either is missing.
+    receiver_noise_bandwidth_ = 1.0f;
+    if (cfg->acquisitionSystemInformation().present() &&
+        cfg->acquisitionSystemInformation().get().relativeReceiverNoiseBandwidth().present()) {
+      receiver_noise_bandwidth_ =
+        cfg->acquisitionSystemInformation().get().relativeReceiverNoiseBandwidth().get();
+    }
+
+    return GADGET_OK;
+  }
+
+  // Handles one readout.
+  //
+  // Noise readouts (ACQ_IS_NOISE_MEASUREMENT set) are accumulated into the
+  // channels x channels covariance matrix and then released; they are not forwarded.
+  // All other readouts are multiplied by the noise decorrelation matrix (computed
+  // once, on the first non-noise readout after noise has been seen) and passed on.
+  //
+  // @param m1  acquisition header (head of the message chain)
+  // @param m2  complex samples, indexed as data[channel*samples + sample]
+  // @return GADGET_OK on success, GADGET_FAIL / -1 on allocation or queueing errors
+  int NoiseAdjustGadget
+  ::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    unsigned int channels = m1->getObjectPtr()->active_channels;
+    unsigned int samples = m1->getObjectPtr()->number_of_samples;
+
+    if (is_noise) {
+      noise_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+
+      //If noise covariance matrix is not allocated (or has the wrong size), start over
+      if (noise_covariance_matrix_.get_number_of_elements() != channels*channels) {
+        std::vector<size_t> dims(2, channels);
+
+        try{noise_covariance_matrix_.create(&dims);}
+        catch (std::runtime_error& err) {
+          GADGET_DEBUG_EXCEPTION(err, "Unable to allocate storage for noise covariance matrix\n" );
+          return GADGET_FAIL;
+        }
+        noise_covariance_matrix_.fill(std::complex<double>(0.0,0.0));
+        number_of_noise_samples_ = 0;
+      }
+
+      std::complex<double>* cc_ptr = noise_covariance_matrix_.get_data_ptr();
+      std::complex<float>* data_ptr = m2->getObjectPtr()->get_data_ptr();
+
+      //Accumulate the outer product of the channel vector for every sample
+      for (unsigned int s = 0; s < samples; s++) {
+        for (unsigned int i = 0; i < channels; i++) {
+          for (unsigned int j = 0; j < channels; j++) {
+            cc_ptr[i*channels + j] += (data_ptr[i * samples + s] * conj(data_ptr[j * samples + s]));
+          }
+        }
+        number_of_noise_samples_++;
+      }
+
+      //Noise readouts are consumed here and never put on the next queue, so this
+      //gadget has to release them; m2 is linked to m1 and goes with it.
+      m1->release();
+      return GADGET_OK;
+    }
+
+    acquisition_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+    if (!is_configured_) {
+      //Without both dwell times no bandwidth scaling can be derived
+      if ((noise_dwell_time_us_ == 0.0f) || (acquisition_dwell_time_us_ == 0.0f)) {
+        noise_bw_scale_factor_ = 1.0f;
+      } else {
+        noise_bw_scale_factor_ = std::sqrt(2*acquisition_dwell_time_us_/noise_dwell_time_us_*receiver_noise_bandwidth_);
+      }
+
+      GADGET_DEBUG2("Noise dwell time: %f\n", noise_dwell_time_us_);
+      GADGET_DEBUG2("Acquisition dwell time: %f\n", acquisition_dwell_time_us_);
+      GADGET_DEBUG2("receiver_noise_bandwidth: %f\n", receiver_noise_bandwidth_);
+      GADGET_DEBUG2("noise_bw_scale_factor: %f\n", noise_bw_scale_factor_);
+      is_configured_ = true;
+    }
+
+    if (number_of_noise_samples_ > 0) {
+      if (!noise_decorrelation_calculated_) {
+        GADGET_DEBUG1("Calculating noise decorrelation\n");
+
+        std::vector<size_t> dims(2, channels);
+        try{noise_covariance_matrixf_.create(&dims);}
+        catch (std::runtime_error& err){
+          GADGET_DEBUG_EXCEPTION(err,"Unable to allocate storage for noise covariance matrix (float)\n");
+          return GADGET_FAIL;
+        }
+
+        // Armadillo can best do its template magic when we concatenate all the operations...
+        // 1. scale for number of samples
+        // 2. Cholesky decomposition
+        // 3. Invert lower triangular
+        // 4. Scale for noise BW
+
+        arma::cx_mat noise_cov = as_arma_matrix(&noise_covariance_matrix_);
+        arma::cx_fmat noise_covf = as_arma_matrix(&noise_covariance_matrixf_);
+
+        {
+          noise_covf = arma::conv_to<arma::cx_fmat>::from
+            (noise_bw_scale_factor_*arma::inv(arma::trimatu(arma::chol(noise_cov/number_of_noise_samples_))));
+        }
+
+        noise_decorrelation_calculated_ = true;
+      }
+
+      if (noise_decorrelation_calculated_) {
+        //Apply the decorrelation matrix to the readout in place
+        arma::cx_fmat noise_covf = as_arma_matrix(&noise_covariance_matrixf_);
+        arma::cx_fmat am2 = as_arma_matrix(m2->getObjectPtr());
+        am2 = am2*arma::trimatu(noise_covf);
+      }
+    }
+
+    //It is enough to put the first one, since they are linked
+    if (this->next()->putq(m1) == -1) {
+      ACE_ERROR_RETURN( (LM_ERROR,
+                         ACE_TEXT("%p\n"),
+                         ACE_TEXT("NoiseAdjustGadget::process, passing data on to next gadget")),
+                        -1);
+    }
+
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(NoiseAdjustGadget)
+  
+} // namespace Gadgetron
diff --git a/gadgets/mri_core/NoiseAdjustGadget.h b/gadgets/mri_core/NoiseAdjustGadget.h
new file mode 100644
index 0000000..ac72125
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron {
+
+    /// Gadget operating on acquisition messages (AcquisitionHeader + complex<float> samples).
+    /// process() accumulates a noise covariance matrix from readouts flagged as noise
+    /// measurements and applies a decorrelation matrix derived from it to the other readouts.
+    class EXPORTGADGETSMRICORE NoiseAdjustGadget :
+        public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+        GADGET_DECLARE(NoiseAdjustGadget);
+
+        // Sample type of the incoming data.
+        typedef std::complex<float> ValueType;
+        // Double precision type used while accumulating the covariance matrix.
+        typedef std::complex<double> PerwhitenerValueType;
+
+        NoiseAdjustGadget();
+
+    protected:
+        // True once the decorrelation matrix has been computed.
+        bool noise_decorrelation_calculated_;
+        // Accumulated (unnormalised) channels x channels noise covariance, double precision.
+        hoNDArray< PerwhitenerValueType > noise_covariance_matrix_;
+        // Single precision decorrelation matrix that is applied to the data.
+        hoNDArray< ValueType > noise_covariance_matrixf_;
+        // Number of noise samples accumulated into noise_covariance_matrix_.
+        unsigned long int number_of_noise_samples_;
+        // sample_time_us of the most recent noise readout.
+        float noise_dwell_time_us_;
+        // sample_time_us of the most recent non-noise readout.
+        float acquisition_dwell_time_us_;
+        // Scale factor folded into the decorrelation matrix (see process()).
+        float noise_bw_scale_factor_;
+        // relativeReceiverNoiseBandwidth from the XML header, 1.0 when absent.
+        float receiver_noise_bandwidth_;
+        // True once noise_bw_scale_factor_ has been determined.
+        bool is_configured_;
+        // NOTE(review): not referenced by process() in NoiseAdjustGadget.cpp - confirm whether still needed.
+        hoNDArray< ValueType > prewhitened_buf_;
+
+        virtual int process_config(ACE_Message_Block* mb);
+        virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+            GadgetContainerMessage< hoNDArray< ValueType > >* m2);
+    };
+}
diff --git a/gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp
new file mode 100644
index 0000000..b0515b4
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.cpp
@@ -0,0 +1,220 @@
+#include "NoiseAdjustGadget_unoptimized.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+/**
+ * In-place factorisation of an n x n complex matrix; the caller in this file uses it
+ * as the "Cholesky decomposition" step on the noise covariance matrix.
+ *
+ * @param a  n*n values addressed as a[row*n + col]; overwritten in place. Only
+ *           entries with col >= row are read or written, the rest is left untouched.
+ * @param n  matrix dimension.
+ *
+ * The imaginary part of each pivot a[k*n+k] is discarded. No check is made that the
+ * real part of a pivot is positive before its square root is taken.
+ */
+void choldc(std::complex<double> *a, int n)
+{
+	int i,j,k;
+
+	for (k= 0; k < n; k++)
+	{
+		// Pivot: replace the diagonal entry by the square root of its real part.
+		a[k*n+k] = std::complex<double>(std::sqrt(real(a[k*n+k])),0.0);
+
+		// Scale the remainder of row k by the pivot.
+		for (i = k+1; i < n; i++)
+		{
+			a[k*n+i] = a[k*n+i]/a[k*n+k];
+		}
+
+		// Subtract the contribution of row k from the trailing rows (col >= row only).
+		for (j = k + 1; j < n; j++)
+		{
+			for (i = j; i < n; i++)
+			{
+				a[j*n+i] -= conj(a[k*n+j])*a[k*n+i];
+			}
+		}
+	}
+}
+
+/**
+ * In-place inversion of the triangular factor produced by choldc(); the caller in
+ * this file uses it as the "Invert lower triangular" step.
+ *
+ * @param a  n*n values addressed as a[row*n + col]; only entries with col >= row are
+ *           read or written.
+ * @param n  matrix dimension.
+ *
+ * Diagonal entries are replaced by the reciprocal of their real part. A zero diagonal
+ * entry is not detected.
+ */
+void inv_L(std::complex<double> *a, int n)
+{
+	int i,j,k;
+
+	std::complex<double> sum;
+
+	for (i = 0; i < n; i++)
+	{
+
+		// Invert the diagonal entry of row i first; it is used by the k == i term below.
+		a[i*n+i] = std::complex<double>(1.0/real(a[i*n+i]),0.0);
+		for (j = i+1; j < n; j++)
+		{
+			sum = std::complex<double>(0.0,0.0);
+			for (k = i; k < j; k++)
+			{
+				sum -= a[k*n+j]*a[i*n+k];
+			}
+			// a[j*n+j] has not been inverted yet at this point (row j is handled later).
+			a[i*n+j] = sum/a[j*n+j];
+		}
+	}
+}
+
+/**
+ * Applies the triangular decorrelation matrix to the data in place.
+ *
+ * For every element i and coil j the new value is
+ *   sum over k <= j of inv_L_psi[k*coils + j] * data[k*elements + i].
+ *
+ * @param data       coils*elements samples, addressed as data[coil*elements + element];
+ *                   overwritten with the result.
+ * @param elements   number of samples per coil.
+ * @param coils      number of coils.
+ * @param inv_L_psi  coils*coils matrix; only entries [k*coils + j] with k <= j are read.
+ * @return false when a required pointer is null, true otherwise (including the case
+ *         where there is nothing to process).
+ */
+bool noise_decorrelation(std::complex<float>* data, int elements, int coils, std::complex<double>* inv_L_psi)
+{
+	if (data == 0 || inv_L_psi == 0)
+	{
+		return false;
+	}
+
+	if (elements <= 0 || coils <= 0)
+	{
+		return true;
+	}
+
+	const size_t n_elements = static_cast<size_t>(elements);
+	const size_t n_coils = static_cast<size_t>(coils);
+
+	/* Scratch space for one element across all coils: the results must not overwrite
+	   the inputs before every coil has been computed. The vector frees itself on every
+	   exit path (operator new[] throws on failure, so a null check would never fire). */
+	std::vector< std::complex<double> > tmp_data(n_coils);
+
+	for (size_t i = 0; i < n_elements; i++)
+	{
+		for (size_t j = 0; j < n_coils; j++)
+		{
+			std::complex<double> acc(0.0,0.0);
+			for (size_t k = 0; k <= j; k++)
+			{
+				acc += inv_L_psi[k*n_coils+j] * static_cast< std::complex<double> >(data[k*n_elements+i]);
+			}
+			tmp_data[j] = acc;
+		}
+
+		for (size_t j = 0; j < n_coils; j++)
+		{
+			data[j*n_elements+i] = tmp_data[j];
+		}
+	}
+
+	return true;
+}
+
+
+
+// Give every scalar member a defined value, in declaration order.
+// noise_dwell_time_us_ in particular is compared against 0.0f in process() and must
+// not be indeterminate when no noise readout has arrived yet.
+NoiseAdjustGadget_unoptimized::NoiseAdjustGadget_unoptimized()
+: noise_decorrelation_calculated_(false)
+, number_of_noise_samples_(0)
+, noise_dwell_time_us_(0.0f)
+, acquisition_dwell_time_us_(0.0f)
+, noise_bw_scale_factor_(1.0f)
+, receiver_noise_bandwidth_(1.0f)
+, is_configured_(false)
+{
+
+}
+
+
+// Reads the relative receiver noise bandwidth from the ISMRMRD XML header.
+// @param mb  message block whose read pointer addresses the XML header text
+// @return GADGET_OK
+int NoiseAdjustGadget_unoptimized::process_config(ACE_Message_Block* mb)
+{
+
+	boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+	// Both acquisitionSystemInformation and relativeReceiverNoiseBandwidth are optional
+	// elements. Check each one before dereferencing it with get(); fall back to 1.0
+	// when either is missing.
+	receiver_noise_bandwidth_ = 1.0f;
+	if (cfg->acquisitionSystemInformation().present() &&
+		cfg->acquisitionSystemInformation().get().relativeReceiverNoiseBandwidth().present()) {
+		receiver_noise_bandwidth_ =
+			cfg->acquisitionSystemInformation().get().relativeReceiverNoiseBandwidth().get();
+	}
+
+	return GADGET_OK;
+}
+
+// Handles one readout.
+//
+// Noise readouts (ACQ_IS_NOISE_MEASUREMENT set) are accumulated into the
+// channels x channels covariance matrix and then released; they are not forwarded.
+// All other readouts are decorrelated with the matrix derived from that covariance
+// (computed once, in place, on the first non-noise readout after noise has been
+// seen) and passed on to the next gadget.
+//
+// @param m1  acquisition header (head of the message chain)
+// @param m2  complex samples, indexed as data[channel*samples + sample]
+// @return GADGET_OK on success, GADGET_FAIL / -1 on allocation or queueing errors
+int NoiseAdjustGadget_unoptimized
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+		GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+	bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+	unsigned int channels = m1->getObjectPtr()->active_channels;
+	unsigned int samples = m1->getObjectPtr()->number_of_samples;
+
+	if (is_noise) {
+		noise_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+		//If noise covariance matrix is not allocated (or has the wrong size), start over
+		if (noise_covariance_matrix_.get_number_of_elements() != channels*channels) {
+			std::vector<size_t> dims(2, channels);
+			try{ noise_covariance_matrix_.create(&dims);}
+			catch (std::runtime_error &err){
+				GADGET_DEBUG_EXCEPTION(err,"Unable to allocate storage for noise covariance matrix\n");
+				return GADGET_FAIL;
+			}
+			noise_covariance_matrix_.fill(std::complex<double>(0.0,0.0));
+
+			number_of_noise_samples_ = 0;
+		}
+
+		std::complex<double>* cc_ptr = noise_covariance_matrix_.get_data_ptr();
+		std::complex<float>* data_ptr = m2->getObjectPtr()->get_data_ptr();
+
+		//Accumulate the outer product of the channel vector for every sample
+		for (unsigned int s = 0; s < samples; s++) {
+			for (unsigned int i = 0; i < channels; i++) {
+				for (unsigned int j = 0; j < channels; j++) {
+					cc_ptr[j*channels + i] += (data_ptr[i * samples + s] * conj(data_ptr[j * samples + s]));
+				}
+			}
+			number_of_noise_samples_++;
+		}
+
+		//Noise readouts are consumed here and never put on the next queue, so this
+		//gadget has to release them; m2 is linked to m1 and goes with it.
+		m1->release();
+		return GADGET_OK;
+	}
+
+	acquisition_dwell_time_us_ = m1->getObjectPtr()->sample_time_us;
+	if (!is_configured_) {
+		//Without both dwell times no bandwidth scaling can be derived
+		if ((noise_dwell_time_us_ == 0.0f) || (acquisition_dwell_time_us_ == 0.0f)) {
+			noise_bw_scale_factor_ = 1.0f;
+		} else {
+			noise_bw_scale_factor_ = std::sqrt(2*acquisition_dwell_time_us_/noise_dwell_time_us_*receiver_noise_bandwidth_);
+		}
+
+		GADGET_DEBUG2("Noise dwell time: %f\n", noise_dwell_time_us_);
+		GADGET_DEBUG2("Acquisition dwell time: %f\n", acquisition_dwell_time_us_);
+		GADGET_DEBUG2("receiver_noise_bandwidth: %f\n", receiver_noise_bandwidth_);
+		GADGET_DEBUG2("noise_bw_scale_factor: %f\n", noise_bw_scale_factor_);
+		is_configured_ = true;
+	}
+
+	if (number_of_noise_samples_ > 0) {
+		if (!noise_decorrelation_calculated_) {
+			GADGET_DEBUG1("Calculating noise decorrelation\n");
+			//1. scale for number of samples
+			std::complex<double>* cc_ptr = noise_covariance_matrix_.get_data_ptr();
+			for (unsigned int i = 0; i < channels*channels; i++) {
+				cc_ptr[i] /= number_of_noise_samples_;
+			}
+
+			//write_nd_array(&noise_covariance_matrix_, "CC.cplx");
+
+			//2. Cholesky decomposition
+			choldc(cc_ptr, channels);
+
+			//write_nd_array(&noise_covariance_matrix_, "CC_chol.cplx");
+
+			//3. Invert lower triangular
+			inv_L(cc_ptr, channels);
+
+			//write_nd_array(&noise_covariance_matrix_, "CC_chol_inv_L.cplx");
+
+			//4. Scale for noise BW
+			for (unsigned int i = 0; i < channels*channels; i++) {
+				cc_ptr[i] *= noise_bw_scale_factor_;
+			}
+
+			noise_decorrelation_calculated_ = true;
+		}
+
+		if (noise_decorrelation_calculated_) {
+			//Noise decorrelate
+			if (!noise_decorrelation(m2->getObjectPtr()->get_data_ptr(), samples, channels, noise_covariance_matrix_.get_data_ptr())) {
+				GADGET_DEBUG1("Noise Decorrelation Failed\n");
+				return GADGET_FAIL;
+			}
+		}
+	}
+
+	//It is enough to put the first one, since they are linked
+	if (this->next()->putq(m1) == -1) {
+		ACE_ERROR_RETURN( (LM_ERROR,
+				ACE_TEXT("%p\n"),
+				ACE_TEXT("NoiseAdjustGadget_unoptimized::process, passing data on to next gadget")),
+				-1);
+	}
+
+	return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(NoiseAdjustGadget_unoptimized)
+}
diff --git a/gadgets/mri_core/NoiseAdjustGadget_unoptimized.h b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.h
new file mode 100644
index 0000000..3bf340e
--- /dev/null
+++ b/gadgets/mri_core/NoiseAdjustGadget_unoptimized.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  /// Variant of the noise adjust gadget that uses the hand-written routines in
+  /// NoiseAdjustGadget_unoptimized.cpp (choldc, inv_L, noise_decorrelation).
+  /// Operates on acquisition messages (AcquisitionHeader + complex<float> samples).
+  class EXPORTGADGETSMRICORE NoiseAdjustGadget_unoptimized : 
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(NoiseAdjustGadget_unoptimized);
+  
+      NoiseAdjustGadget_unoptimized();
+
+    protected:
+      // True once the decorrelation matrix has been computed.
+      bool noise_decorrelation_calculated_;
+      // Accumulated noise covariance; process() later overwrites it in place with the
+      // decorrelation matrix.
+      hoNDArray< std::complex<double> > noise_covariance_matrix_;
+      // Number of noise samples accumulated so far.
+      unsigned long int number_of_noise_samples_;
+      // sample_time_us of the most recent noise readout.
+      float noise_dwell_time_us_;
+      // sample_time_us of the most recent non-noise readout.
+      float acquisition_dwell_time_us_;
+      // Scale factor applied to the decorrelation matrix.
+      float noise_bw_scale_factor_;
+      // relativeReceiverNoiseBandwidth from the XML header, 1.0 when absent.
+      float receiver_noise_bandwidth_;
+      // True once noise_bw_scale_factor_ has been determined.
+      bool is_configured_;
+
+      virtual int process_config(ACE_Message_Block* mb);
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+      
+    };
+}
diff --git a/gadgets/mri_core/PCACoilGadget.cpp b/gadgets/mri_core/PCACoilGadget.cpp
new file mode 100644
index 0000000..cb36eea
--- /dev/null
+++ b/gadgets/mri_core/PCACoilGadget.cpp
@@ -0,0 +1,220 @@
+/*
+* PCACoilGadget.cpp
+*
+*  Created on: Dec 13, 2011
+*      Author: Michael S. Hansen
+*/
+
+#include "PCACoilGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "hoArmadillo.h"
+#include "hoNDArray_elemwise.h"
+
+#ifdef HAVE_MKL
+#include "mkl_service.h"
+#endif
+
+namespace Gadgetron {
+
+    // Defaults: buffer at most 100 profiles per location and take 16 samples per
+    // profile for the PCA estimate.
+    PCACoilGadget::PCACoilGadget()
+        : max_buffered_profiles_(100)
+        , samples_to_use_(16)
+    {
+#ifdef HAVE_MKL
+        // The MKL SVD is buggy in multi-threaded mode, so restrict MKL to a
+        // single thread for this gadget.
+        mkl_set_num_threads(1);
+#endif
+    }
+
+    // Frees the coefficient arrays owned by pca_coefficients_.
+    PCACoilGadget::~PCACoilGadget()
+    {
+        typedef std::map<int, hoNDArray<std::complex<float> >* >::iterator coeff_iterator;
+
+        for (coeff_iterator entry = pca_coefficients_.begin(); entry != pca_coefficients_.end(); ++entry) {
+            // delete on a null pointer is a no-op, so no explicit check is required
+            delete entry->second;
+            entry->second = 0;
+        }
+    }
+
+    // No configuration is read from the header; the message block is ignored.
+    int PCACoilGadget::process_config(ACE_Message_Block *mb)
+    {
+        return GADGET_OK;
+    }
+
+    // Handles one readout. Each location (slice index) goes through two phases:
+    //
+    //  1. Buffering: readouts are stored in buffer_[location] until the last scan in
+    //     the slice arrives or max_buffered_profiles_ is reached. Then a channels x
+    //     channels coefficient matrix is computed with an SVD of the mean-subtracted
+    //     buffered samples, buffering is switched off, and every buffered readout is
+    //     fed through process() again.
+    //  2. Pass-through: the readout is multiplied by the coefficient matrix into a
+    //     newly allocated array that replaces m2 in the message chain, and the chain
+    //     is put on the next gadget's queue.
+    //
+    // @param m1  acquisition header (head of the message chain)
+    // @param m2  complex samples, indexed as d[channel*samples_per_profile + sample]
+    // @return GADGET_OK on success, GADGET_FAIL on allocation, SVD or queueing errors
+    int PCACoilGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1, GadgetContainerMessage<hoNDArray<std::complex<float> > > *m2)
+    {
+        std::map<int, bool>::iterator it;
+        int location = m1->getObjectPtr()->idx.slice;
+        bool is_last_scan_in_slice = (ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags));
+        int samples_per_profile = m1->getObjectPtr()->number_of_samples;
+        int channels = m1->getObjectPtr()->active_channels;
+
+        it = buffering_mode_.find(location);
+
+        bool is_buffering = true;
+        //Do we have an entry for this location
+        if (it != buffering_mode_.end()) {
+            is_buffering = it->second;
+        } else {
+            //else make an entry. We will always start in buffering mode for a given location.
+            buffering_mode_[location] = is_buffering;
+        }
+
+        if (is_buffering) {
+            //Keep the head of the chain; the data block stays attached through cont()
+            buffer_[location].push_back(m1);
+            int profiles_available = buffer_[location].size();
+
+            //Are we ready for calculating PCA
+            if (is_last_scan_in_slice || (profiles_available >= max_buffered_profiles_)) {
+
+                //GADGET_DEBUG2("Calculating PCA coefficients with %d profiles for %d coils\n", profiles_available, channels);
+                //Use at most samples_to_use_ samples from each profile
+                int samples_to_use = samples_per_profile > samples_to_use_ ? samples_to_use_ : samples_per_profile;
+
+                //For some sequences there is so little data, we should just use it all.
+                if (profiles_available < 16) {
+                    samples_to_use = samples_per_profile;
+                }
+
+                int total_samples = samples_to_use*profiles_available;
+
+                //A holds one column per sample: A[channel + sample*channels]
+                std::vector<size_t> dims(2);
+                dims[0] = channels;dims[1] = total_samples;
+
+                hoNDArray< std::complex<float> > A;
+                try{ A.create(&dims); }
+                catch (std::runtime_error & err){
+                    GADGET_DEBUG1("Unable to create array for PCA calculation\n");
+                    return GADGET_FAIL;
+                }
+
+                std::complex<float>* A_ptr = A.get_data_ptr();
+                size_t sample_counter = 0;
+
+                //Centre the sample window on center_sample when there is room before it.
+                //The offset is taken from the current readout (m1) and reused for every
+                //buffered profile.
+                size_t data_offset = 0;
+                if (m1->getObjectPtr()->center_sample >= (samples_to_use>>1)) {
+                    data_offset = m1->getObjectPtr()->center_sample - (samples_to_use>>1);
+                }
+
+                //GADGET_DEBUG2("Data offset = %d\n", data_offset);
+
+                //Per-channel sum of the copied samples (divided by total_samples below)
+                hoNDArray<std::complex<float> > means;
+                std::vector<size_t> means_dims; means_dims.push_back(channels);
+
+                try{means.create(&means_dims);}
+                catch (std::runtime_error& err){
+                    GADGET_DEBUG1("Unable to create temporary stoorage for mean values\n");
+                    return GADGET_FAIL;
+                }
+
+                means.fill(std::complex<float>(0.0f,0.0f));
+
+                std::complex<float>* means_ptr = means.get_data_ptr();
+                for (size_t p = 0; p < profiles_available; p++) {
+                    GadgetContainerMessage<hoNDArray<std::complex<float> > >* m_tmp =
+                        AsContainerMessage<hoNDArray< std::complex<float> > >(buffer_[location][p]->cont());
+
+                    if (!m_tmp) {
+                        GADGET_DEBUG2("Fatal error, unable to recover data from data buffer (%d,%d)\n", p, profiles_available);
+                        return GADGET_FAIL;
+                    }
+
+                    std::complex<float>* d = m_tmp->getObjectPtr()->get_data_ptr();
+
+                    for (unsigned s = 0; s < samples_to_use; s++) {
+                        for (size_t c = 0; c < channels; c++) {
+                            //NOTE(review): an earlier comment here said the conjugate of the data is used so
+                            //that the output VT of the SVD is the PCA coefficient matrix, but the samples are
+                            //copied as they are (no conj() is applied) - confirm which is intended.
+                            A_ptr[c + sample_counter*channels] = d[c*samples_per_profile + data_offset + s];
+                            means_ptr[c] += d[c*samples_per_profile + data_offset + s];
+                        }
+
+                        sample_counter++;
+                        //GADGET_DEBUG2("Sample counter = %d/%d\n", sample_counter, total_samples);
+                    }
+                }
+
+                //Subtract off mean
+                for (size_t c = 0; c < channels; c++) {
+                    for (size_t s = 0; s < total_samples; s++) {
+                        A_ptr[c + s*channels] -=  means_ptr[c]/std::complex<float>(total_samples,0);
+                    }
+                }
+
+                //Collected data for temp matrix, now let's calculate SVD coefficients
+
+                //The coefficient array is owned by pca_coefficients_ and freed in the destructor
+                std::vector<size_t> VT_dims;
+                VT_dims.push_back(channels);
+                VT_dims.push_back(channels);
+                pca_coefficients_[location] = new hoNDArray< std::complex<float> >;
+                hoNDArray< std::complex<float> >* VT = pca_coefficients_[location];
+
+                try {VT->create(&VT_dims);}
+                catch (std::runtime_error& err){
+                    GADGET_DEBUG_EXCEPTION(err,"Failed to create array for VT\n");
+                    return GADGET_FAIL;
+                }
+
+                //presumably Vm aliases the storage of VT so the SVD result lands in pca_coefficients_ - verify as_arma_matrix
+                arma::cx_fmat Am = as_arma_matrix(&A);
+                arma::cx_fmat Vm = as_arma_matrix(VT);
+                arma::cx_fmat Um;
+                arma::fvec Sv;
+
+                if( !arma::svd_econ(Um,Sv,Vm,Am.st(),'r') ){
+                    GADGET_DEBUG1("Failed to compute SVD\n");
+                    return GADGET_FAIL;
+                }
+
+                //Switch off buffering for this slice
+                buffering_mode_[location] = false;
+
+                //Now we should pump all the profiles that we have buffered back through the system
+                for (size_t p = 0; p < profiles_available; p++) {
+                    ACE_Message_Block* mb = buffer_[location][p];
+                    if (inherited::process(mb) != GADGET_OK) {
+                        GADGET_DEBUG1("Failed to reprocess buffered data\n");
+                        return GADGET_FAIL;
+                    }
+                }
+                //Remove references in this buffer
+                buffer_[location].clear();
+            }
+        } else {
+            //GADGET_DEBUG1("Not buffering anymore\n");
+            //m3 receives the transformed data and takes the place of m2 in the chain
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 =
+                new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+
+            try{m3->getObjectPtr()->create(m2->getObjectPtr()->get_dimensions().get()); }
+            catch (std::runtime_error& err){
+                GADGET_DEBUG_EXCEPTION(err,"Unable to create storage for PCA coils\n");
+                m3->release();
+                return GADGET_FAIL;
+            }
+
+            //When no coefficients exist for this location, m3 is forwarded without being filled
+            if (pca_coefficients_[location] != 0) {	
+                arma::cx_fmat am3 = as_arma_matrix(m3->getObjectPtr());
+                arma::cx_fmat am2 = as_arma_matrix(m2->getObjectPtr());
+                arma::cx_fmat aPca = as_arma_matrix(pca_coefficients_[location]);
+                am3 = am2*aPca;
+            }
+
+            m1->cont(m3);
+
+            //In case there are trajectories attached. 
+            //Move whatever followed m2 behind m3, then detach it so releasing m2 frees only m2
+            m3->cont(m2->cont());
+            m2->cont(0);
+
+            m2->release();
+
+            if (this->next()->putq(m1) < 0) {
+                GADGET_DEBUG1("Unable to put message on Q");
+                return GADGET_FAIL;
+            }
+        }
+        return GADGET_OK;
+    }
+
+    GADGET_FACTORY_DECLARE(PCACoilGadget)
+}
diff --git a/gadgets/mri_core/PCACoilGadget.h b/gadgets/mri_core/PCACoilGadget.h
new file mode 100644
index 0000000..63fcd62
--- /dev/null
+++ b/gadgets/mri_core/PCACoilGadget.h
@@ -0,0 +1,44 @@
+#ifndef PCACOILGADGET_H_
+#define PCACOILGADGET_H_
+
+#include "gadgetron_mricore_export.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+
+#include <complex>
+#include <map>
+
+namespace Gadgetron {
+
+  /// Gadget that computes, per location (slice index), a channels x channels coefficient
+  /// matrix from buffered readouts via an SVD and multiplies subsequent readouts by it.
+  class EXPORTGADGETSMRICORE PCACoilGadget :
+    public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+  {
+    typedef Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > > inherited;
+  public:
+    GADGET_DECLARE(PCACoilGadget);
+
+    PCACoilGadget();
+    virtual ~PCACoilGadget();
+
+  protected:
+    virtual int process_config(ACE_Message_Block* mb);
+    virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+  private:
+    //Map containing buffers, one for each location
+    std::map< int, std::vector< ACE_Message_Block* > > buffer_;
+
+    //Keep track of whether we are buffering for a particular location
+    std::map< int, bool> buffering_mode_;
+
+    //Map for storing PCA coefficients for each location
+    //(the arrays are allocated in process() and deleted in the destructor)
+    std::map<int, hoNDArray<std::complex<float> >* > pca_coefficients_;
+
+    //Number of buffered profiles at which the coefficients are computed even if the
+    //last scan in the slice has not arrived (set to 100 in the constructor)
+    int max_buffered_profiles_;
+    //Number of samples taken from each profile for the estimate (set to 16 in the constructor)
+    int samples_to_use_;
+  };
+}
+
+#endif /* PCACOILGADGET_H_ */
diff --git a/gadgets/mri_core/PartialFourierAdjustROGadget.cpp b/gadgets/mri_core/PartialFourierAdjustROGadget.cpp
new file mode 100644
index 0000000..8d6d53b
--- /dev/null
+++ b/gadgets/mri_core/PartialFourierAdjustROGadget.cpp
@@ -0,0 +1,145 @@
+
+#include "PartialFourierAdjustROGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+
+namespace Gadgetron
+{
+
+// maxRO_ starts at 0 and receives its real value in process_config().
+PartialFourierAdjustROGadget::PartialFourierAdjustROGadget() : maxRO_(0)
+{
+
+}
+
+// Reads the readout length of the encoded space from the ISMRMRD XML header.
+// @param mb  message block whose read pointer addresses the XML header text
+// @return GADGET_OK, or GADGET_FAIL unless the header has exactly one encoding space
+int PartialFourierAdjustROGadget::process_config(ACE_Message_Block* mb)
+{
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1)
+    {
+        // size() yields a size_t; convert explicitly so the argument matches %d
+        GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size()));
+        GADGET_DEBUG1("This simple partial fourier gadget only supports one encoding space\n");
+        return GADGET_FAIL;
+    }
+
+    // Only the x extent of the encoded matrix is needed; it is the readout length
+    // that shorter readouts are padded to in process().
+    maxRO_ = (*e_seq.begin()).encodedSpace().matrixSize().x();
+
+    GADGET_MSG("max RO : " << maxRO_);
+
+    return GADGET_OK;
+}
+
+// Decides on which side of a readout zeros have to be added.
+// Returns 0 when the centre column sits exactly in the middle (2*centre == samples),
+// 1 (pre zeros) when 2*centre < samples and 2 (post zeros) when 2*centre > samples.
+int addPrePostZeros(size_t centre_column, size_t samples)
+{
+    const size_t doubled_centre = 2*centre_column;
+
+    if ( doubled_centre < samples )
+    {
+        return 1;   // pre zeros
+    }
+
+    if ( doubled_centre > samples )
+    {
+        return 2;   // post zeros
+    }
+
+    return 0;       // no zeros
+}
+
+// Handles one readout.
+//
+// Noise readouts are forwarded untouched. For any other readout that is shorter than
+// maxRO_ and whose centre column is not in the middle, the samples are copied into a
+// new zero-filled array of length maxRO_ per channel (zeros in front or behind, as
+// decided by addPrePostZeros), the new array replaces m2 in the message chain and
+// number_of_samples in the header is updated. The chain is then passed on.
+//
+// @param m1  acquisition header (head of the message chain)
+// @param m2  complex samples, indexed as data[channel*samples + sample]
+// @return GADGET_OK on success, GADGET_FAIL / -1 on allocation or queueing errors
+int PartialFourierAdjustROGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+
+    const bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+
+    if (!is_noise)
+    {
+        const size_t channels = m1->getObjectPtr()->active_channels;
+        const size_t samples = m1->getObjectPtr()->number_of_samples;
+        const size_t centre_column = m1->getObjectPtr()->center_sample;
+
+        // adjust the center echo
+        const int az = addPrePostZeros(centre_column, samples);
+
+        if ( az != 0 && samples < maxRO_ )
+        {
+            // operator new throws on failure, so the pointer needs no null check
+            GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+            std::vector<size_t> data_out_dims = *m2->getObjectPtr()->get_dimensions();
+            data_out_dims[0] = maxRO_;
+            try
+            {
+                m3->getObjectPtr()->create(&data_out_dims);
+            }
+            catch(...)
+            {
+                GADGET_DEBUG1("Unable to create new data array for downsampled data\n");
+                m3->release(); // not yet part of the chain, so it must be freed here
+                return GADGET_FAIL;
+            }
+            m3->getObjectPtr()->fill(0);
+
+            std::complex<float>* pM3 = m3->getObjectPtr()->get_data_ptr();
+            std::complex<float>* pM2 = m2->getObjectPtr()->get_data_ptr();
+
+            // pre zeros (az == 1): data goes to the end of each padded line;
+            // post zeros (az == 2): data stays at the start of each padded line
+            const size_t dst_offset = (az == 1) ? (maxRO_ - samples) : 0;
+            for ( size_t c=0; c<channels; c++ )
+            {
+                memcpy(pM3+c*maxRO_+dst_offset, pM2+c*samples, sizeof( std::complex<float> )*samples);
+            }
+
+            m2->release(); //We are done with this data
+
+            m1->cont(m3);
+            m1->getObjectPtr()->number_of_samples = data_out_dims[0];
+        }
+    }
+
+    // Noise and imaging readouts are forwarded the same way
+    if (this->next()->putq(m1) == -1)
+    {
+        ACE_ERROR_RETURN( (LM_ERROR,
+                ACE_TEXT("%p\n"),
+                ACE_TEXT("PartialFourierAdjustROGadget::process, passing data on to next gadget")),
+                -1);
+    }
+
+    return GADGET_OK;
+}
+
+GADGET_FACTORY_DECLARE(PartialFourierAdjustROGadget)
+
+}
diff --git a/gadgets/mri_core/PartialFourierAdjustROGadget.h b/gadgets/mri_core/PartialFourierAdjustROGadget.h
new file mode 100644
index 0000000..7477fd6
--- /dev/null
+++ b/gadgets/mri_core/PartialFourierAdjustROGadget.h
@@ -0,0 +1,32 @@
+
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include "gadgetron_mricore_export.h"
+
+namespace Gadgetron
+{
+
+/// Gadget that re-centres partial Fourier readouts.
+/// For every incoming readout that is not a noise scan: if the readout is shorter than
+/// the encoded readout length and its echo centre is not in the middle, the data is
+/// zero padded so that the centre of the echo sits at the centre of the outgoing 1D array.
+class EXPORTGADGETSMRICORE PartialFourierAdjustROGadget : public Gadgetron::Gadget2<ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+{
+public:
+
+    GADGET_DECLARE(PartialFourierAdjustROGadget);
+
+    PartialFourierAdjustROGadget();
+
+protected:
+
+    /// Reads maxRO_ from the XML header; fails unless there is exactly one encoding space.
+    virtual int process_config(ACE_Message_Block* mb);
+    /// Pads the readout when required and passes the message chain on.
+    virtual int process(Gadgetron::GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+        Gadgetron::GadgetContainerMessage< Gadgetron::hoNDArray< std::complex<float> > >* m2);
+
+    /// Readout length of the encoded space (matrixSize().x()); padded readouts get this length.
+    unsigned int maxRO_;
+};
+
+}
diff --git a/gadgets/mri_core/PhysioInterpolationGadget.cpp b/gadgets/mri_core/PhysioInterpolationGadget.cpp
new file mode 100644
index 0000000..d93de2a
--- /dev/null
+++ b/gadgets/mri_core/PhysioInterpolationGadget.cpp
@@ -0,0 +1,259 @@
+#include "PhysioInterpolationGadget.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "GadgetronTimer.h"
+#include "Spline.h"
+
+#include <numeric>
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  // Defaults: time stamp index 0, mode 0, 30 phases; buffer_ uses 10x the default ACE water marks.
+  PhysioInterpolationGadget::PhysioInterpolationGadget()
+    : phys_time_index_(0), phases_to_reconstruct_(30), mode_(0) // mode_ used to be left uninitialised
+    , buffer_(ACE_Message_Queue_Base::DEFAULT_HWM * 10, ACE_Message_Queue_Base::DEFAULT_LWM * 10)
+  {
+    set_parameter("physiology_time_index", "0");
+    set_parameter("mode", "0");
+    set_parameter("phases", "30");
+  }
+
+  PhysioInterpolationGadget::~PhysioInterpolationGadget() {}
+
+  // Copies the "physiology_time_index", "phases" and "mode" gadget parameters into their members.
+  int PhysioInterpolationGadget::process_config(ACE_Message_Block* mb) {
+    phys_time_index_       = static_cast<unsigned short>(get_int_value("physiology_time_index"));
+    phases_to_reconstruct_ = static_cast<unsigned short>(get_int_value("phases"));
+    mode_                  = static_cast<unsigned short>(get_int_value("mode"));
+    return GADGET_OK;
+  }
+
+  // Does all the work of this gadget once the stream is closed, using the images buffered by
+  // process(): (1) a physiology time stamp that drops marks the start of a new cycle, (2) every
+  // time stamp is converted to a relative time "cycle number + fraction of that cycle", (3) each
+  // pixel is spline-interpolated onto phases_to_reconstruct_ evenly spaced phases per complete
+  // cycle and the new images are passed downstream. Returns the result of Gadget::close() on
+  // success or when nothing was buffered, GADGET_FAIL on inconsistent or insufficient data.
+  int PhysioInterpolationGadget::close(unsigned long flags) {
+
+    int ret = Gadget::close(flags);
+
+    GADGET_DEBUG1("PhysioInterpolationGadget::close...\n");
+    GADGET_DEBUG2("Number of items on Q: %d\n", static_cast<int>(buffer_.message_count()));
+
+    if (time_stamps_.size() != buffer_.message_count()) {
+      GADGET_DEBUG1("Inconsistent number of messages and time stamps\n");
+      buffer_.flush();
+      return GADGET_FAIL;
+    }
+
+    //Split the time stamps into cycles and collect the frame-to-frame intervals inside the cycles
+    float previous = -100.0;
+    float sum_int  = 0.0;
+    std::vector<float> intervals;
+    float int_count = 0.0;
+    std::vector<size_t> cycle_starts;
+    for (size_t i = 0; i < time_stamps_.size(); i++) {
+      if (time_stamps_[i] < previous) {
+        cycle_starts.push_back(i); //Time stamp dropped: frame i opens a new cycle
+      } else if (i > 0 ) {
+        sum_int += time_stamps_[i]-time_stamps_[i-1];
+        intervals.push_back(time_stamps_[i]-time_stamps_[i-1]);
+        int_count += 1.0;
+      }
+      previous = time_stamps_[i];
+    }
+
+    //Everything below indexes intervals, cycle_starts[0] and cycle_lengths, so we need at least
+    //one frame interval and two cycle starts (i.e. one complete cycle) to continue safely.
+    if (intervals.empty() || cycle_starts.size() < 2) {
+      buffer_.flush();
+      if (time_stamps_.empty()) return ret; //Nothing was buffered, nothing to interpolate
+      GADGET_DEBUG1("Not enough physiological cycles to interpolate, bailing out\n");
+      return GADGET_FAIL;
+    }
+
+    std::sort(intervals.begin(),intervals.end());
+
+    float mean_interval = sum_int/int_count;
+    float median_interval = intervals[(intervals.size()>>1)];
+
+    //cycle_lengths[k] is the length of the cycle that ends right before cycle_starts[k+1]
+    std::vector<float> cycle_lengths;
+    for (size_t i = 1; i < cycle_starts.size(); i++) {
+      float clength = time_stamps_[cycle_starts[i]-1] + median_interval - time_stamps_[cycle_starts[i]];
+      cycle_lengths.push_back(clength);
+    }
+
+    //Sort a copy for the median: cycle_lengths itself is indexed by cycle number further down
+    std::vector<float> sorted_lengths(cycle_lengths);
+    std::sort(sorted_lengths.begin(),sorted_lengths.end());
+    float mean_cycle_length = std::accumulate(sorted_lengths.begin(), sorted_lengths.end(), 0.0)/sorted_lengths.size();
+    float median_cycle_length = sorted_lengths[(sorted_lengths.size()>>1)];
+
+    GADGET_DEBUG2("We have %d full cyles, first one starting at %d\n", static_cast<int>(cycle_starts.size()-1), static_cast<int>(cycle_starts[0]));
+    GADGET_DEBUG2("Mean/Median frame width %f/%f\n", mean_interval,median_interval);
+    GADGET_DEBUG2("Mean/Median cycle_length %f/%f\n", mean_cycle_length,median_cycle_length);
+
+    //Correct the first cycle assuming it is of median length:
+    float first_cycle_offset = (median_cycle_length-median_interval)+time_stamps_[cycle_starts[0]]-time_stamps_[cycle_starts[0]-1];
+    for (size_t i = 0; i < cycle_starts[0]; i++) {
+      time_stamps_[i] += first_cycle_offset;
+    }
+
+    //Calculate relative time stamps
+    size_t current_cycle = 0;
+    std::vector<float> relative_cycle_time;
+
+    //Make sure we have cycle lengths for all the cycles we have covered
+    cycle_lengths.insert(cycle_lengths.begin(),median_cycle_length);
+    cycle_lengths.push_back(median_cycle_length);
+
+    for (size_t i = 0; i < time_stamps_.size(); i++) {
+      //Check the bound first: current_cycle equals cycle_starts.size() once the last start is passed
+      if ((current_cycle < cycle_starts.size()) && (i >= cycle_starts[current_cycle])) {
+        current_cycle++;
+      }
+      relative_cycle_time.push_back(time_stamps_[i]/cycle_lengths[current_cycle] + current_cycle);
+    }
+
+    //Make a temporary list of all the data pointers from the Q
+    std::vector< ISMRMRD::ImageHeader* > hptrs;
+    std::vector< hoNDArray< std::complex<float> > * > aptrs;
+
+    ACE_Message_Queue<ACE_MT_SYNCH>::ITERATOR it(buffer_);
+    for (ACE_Message_Block* entry = 0;
+         it.next (entry) != 0;
+         it.advance ())
+      {
+        GadgetContainerMessage< ISMRMRD::ImageHeader >* tmpm1 =
+          AsContainerMessage< ISMRMRD::ImageHeader >(entry);
+
+        GadgetContainerMessage< hoNDArray< std::complex<float> > > * tmpm2 =
+          AsContainerMessage< hoNDArray< std::complex<float> >  >(entry->cont());
+
+        if (!tmpm1 || !tmpm2) {
+          GADGET_DEBUG1("Failed to cast data on Q, bailing out\n");
+          buffer_.flush();
+          return GADGET_FAIL;
+        }
+        hptrs.push_back(tmpm1->getObjectPtr());
+        aptrs.push_back(tmpm2->getObjectPtr());
+      }
+
+    //Let's figure out which time points we would like to interpolate on:
+    ///TODO: Deal with mode 1 and other future modes, we are only implementing mode 0 at the moment
+    //Start at 1.0 (the first detected cycle start) and stop before the last, incomplete cycle
+    float phase_interval = 1.0f/static_cast<float>(phases_to_reconstruct_);
+    float max_time = floor(relative_cycle_time[relative_cycle_time.size()-1]);
+    std::vector<float> recon_cycle_time;
+    for (float t=1.0;t<(max_time-0.001);t+=phase_interval) {
+      recon_cycle_time.push_back(t);
+    }
+
+    //Now we can loop over each pixel and estimate the new frames, but first we have to have somewhere to put the data
+    std::vector< GadgetContainerMessage< ISMRMRD::ImageHeader >* > out_heads;
+    std::vector< GadgetContainerMessage< hoNDArray< std::complex<float> > > * > out_data;
+
+    for (size_t i = 0; i < recon_cycle_time.size(); i++) {
+      GadgetContainerMessage<ISMRMRD::ImageHeader>* tmpm1 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+      GadgetContainerMessage< hoNDArray< std::complex<float> > >* tmpm2 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+
+      tmpm1->cont(tmpm2);
+
+      (*tmpm1->getObjectPtr()) = (*hptrs[0]);
+      tmpm2->getObjectPtr()->create(aptrs[0]->get_dimensions());
+
+      out_heads.push_back(tmpm1);
+      out_data.push_back(tmpm2);
+
+      //The whole part of the relative time is the cycle, the fractional part selects the phase
+      unsigned short recon_cycle = static_cast<unsigned short>(floor(recon_cycle_time[i] + 0.0001));
+      unsigned short recon_phase = static_cast<unsigned short>((recon_cycle_time[i]+0.0001-recon_cycle)/(1.0/static_cast<float>(phases_to_reconstruct_)) + 0.0001);
+
+      tmpm1->getObjectPtr()->physiology_time_stamp[phys_time_index_] = static_cast<unsigned>(floor((recon_cycle_time[i]+0.0001-recon_cycle)*cycle_lengths[recon_cycle]));
+      tmpm1->getObjectPtr()->phase = recon_phase;
+      tmpm1->getObjectPtr()->image_index = recon_phase+1;
+      tmpm1->getObjectPtr()->image_series_index = recon_cycle*10;
+    }
+
+    //Let's interpolate the images
+    unsigned inelem = relative_cycle_time.size();
+    unsigned outelem = recon_cycle_time.size();
+    unsigned imageelem = aptrs[0]->get_number_of_elements();
+
+    {
+
+      GadgetronTimer interptime("Interpolation Time");
+
+    //Each pixel p only touches element p of every image, so the pixel loop can run in parallel
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for (int p = 0; p < (int)imageelem; p++) {
+      std::vector< std::complex<float> > data_in(inelem);
+
+      //Get the input data for this pixel
+      for (size_t i = 0; i < inelem; i++) data_in[i] = aptrs[i]->get_data_ptr()[p];
+
+      //Interpolate the data
+      Spline<float, std::complex<float> > sp(relative_cycle_time, data_in);
+      std::vector<std::complex<float> > data_out = sp[recon_cycle_time];
+
+      //Copy it to the images
+      for (size_t i = 0; i < outelem; i++) out_data[i]->getObjectPtr()->get_data_ptr()[p] = data_out[i];
+    }
+
+    }
+
+    //Send out the images
+    for (size_t i = 0; i < out_heads.size(); i++) {
+      if (this->next()->putq(out_heads[i]) < 0) {
+        GADGET_DEBUG1("Unable to put data on next Gadgets Q\n");
+        //The rejected image and all images not yet sent are still ours: release them
+        for (size_t j = i; j < out_heads.size(); j++) out_heads[j]->release();
+        buffer_.flush();
+        return GADGET_FAIL;
+      }
+    }
+
+    //We can get rid of the buffered data now
+    buffer_.flush();
+
+    return ret;
+  }
+
+  // Stores a copy of every incoming image (header + pixel data) in buffer_ for close(),
+  // records its physiology time stamp, and forwards the original message to the next gadget.
+  int PhysioInterpolationGadget::
+  process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+  {
+    typedef GadgetContainerMessage<ISMRMRD::ImageHeader> HeaderMessage;
+    typedef GadgetContainerMessage< hoNDArray< std::complex<float> > > DataMessage;
+
+    HeaderMessage* header_copy = new HeaderMessage;
+    DataMessage* data_copy = new DataMessage;
+    *(header_copy->getObjectPtr()) = *(m1->getObjectPtr());
+    *(data_copy->getObjectPtr()) = *(m2->getObjectPtr());
+    header_copy->cont(data_copy);
+
+    const bool buffered = !(buffer_.enqueue_tail(header_copy) < 0);
+    if (!buffered) {
+      GADGET_DEBUG1("Failed to add image to buffer\n");
+      header_copy->release(); // not enqueued, so drop our copy
+      return GADGET_FAIL;
+    }
+    time_stamps_.push_back(m1->getObjectPtr()->physiology_time_stamp[phys_time_index_]);
+
+    if (this->next()->putq(m1) >= 0) return GADGET_OK;
+
+    GADGET_DEBUG1("Unable to put data on next Gadgets Q\n");
+    return GADGET_FAIL;
+  }
+
+  GADGET_FACTORY_DECLARE(PhysioInterpolationGadget)
+}
diff --git a/gadgets/mri_core/PhysioInterpolationGadget.h b/gadgets/mri_core/PhysioInterpolationGadget.h
new file mode 100644
index 0000000..38c5972
--- /dev/null
+++ b/gadgets/mri_core/PhysioInterpolationGadget.h
@@ -0,0 +1,44 @@
+#ifndef PhysioInterpolationGadget_H
+#define PhysioInterpolationGadget_H
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{  
+
+    class EXPORTGADGETSMRICORE PhysioInterpolationGadget :
+        public Gadget2< ISMRMRD::ImageHeader, hoNDArray< std::complex<float> > >
+    {
+    //Buffers every incoming image with its physiology time stamp; close() interpolates the buffered series
+    public:
+        GADGET_DECLARE(PhysioInterpolationGadget);
+
+        PhysioInterpolationGadget();
+        virtual ~PhysioInterpolationGadget();
+        //Number of phases to reconstruct ("phases" gadget parameter, 30 by default)
+	inline unsigned short get_number_of_phases() { return phases_to_reconstruct_; }
+
+    protected:
+        virtual int process_config(ACE_Message_Block* mb);
+        //Receives one image: m1 is the image header, m2 the complex pixel data
+        virtual int process(GadgetContainerMessage< ISMRMRD::ImageHeader >* m1,
+            GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+	
+	virtual int close(unsigned long flags); //All the work is done here in this Gadget
+        //Settings read from the gadget parameters in process_config()
+	unsigned short phys_time_index_;
+	unsigned short phases_to_reconstruct_;
+	unsigned short mode_; //0=separate series for each complete RR,
+	                      //1=First complete RR interval only	
+    private:
+	ACE_Message_Queue<ACE_MT_SYNCH> buffer_; //Copies of the incoming images, kept until close()
+	std::vector<float> time_stamps_;         //One physiology time stamp per buffered image
+    };
+}
+
+#endif //PhysioInterpolationGadget_H
diff --git a/gadgets/mri_core/RemoveROOversamplingGadget.cpp b/gadgets/mri_core/RemoveROOversamplingGadget.cpp
new file mode 100644
index 0000000..0f860ad
--- /dev/null
+++ b/gadgets/mri_core/RemoveROOversamplingGadget.cpp
@@ -0,0 +1,59 @@
+#include "GadgetIsmrmrdReadWrite.h"
+#include "RemoveROOversamplingGadget.h"
+#include "Gadgetron.h"
+#include "hoNDFFT.h"
+
+namespace Gadgetron{
+
+// Halves the readout length of one acquisition: inverse FFT along the readout (dimension 0), keep
+// the central half of every readout line, FFT back, then pass the header on with the new data
+// array attached. Returns GADGET_OK, or GADGET_FAIL if allocation or the hand-off fails.
+int RemoveROOversamplingGadget
+::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m3
+    = new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+  std::vector<size_t> data_out_dims = *m2->getObjectPtr()->get_dimensions();
+  data_out_dims[0] = data_out_dims[0]/2;
+
+  try{ m3->getObjectPtr()->create(&data_out_dims);}
+  catch (std::runtime_error &err){
+    GADGET_DEBUG_EXCEPTION(err,"Unable to create new data array for downsampled data\n");
+    m3->release(); //Not attached to m1 yet, so nobody else would free it
+    return GADGET_FAIL;
+  }
+
+  hoNDFFT<float>::instance()->ifft(m2->getObjectPtr(),0);
+
+  std::complex<float>* data_in  = m2->getObjectPtr()->get_data_ptr();
+  std::complex<float>* data_out = m3->getObjectPtr()->get_data_ptr();
+
+  //A one-dimensional array has no dimension 1; treat it as a single readout line
+  const size_t lines = (data_out_dims.size() > 1) ? data_out_dims[1] : 1;
+  for (size_t c = 0; c < lines; c++) {
+    size_t offset_in = c*m2->getObjectPtr()->get_size(0) +  (m2->getObjectPtr()->get_size(0)-data_out_dims[0])/2;
+    size_t offset_out = c*m3->getObjectPtr()->get_size(0);
+    memcpy(data_out+offset_out,data_in+offset_in,data_out_dims[0]*sizeof(std::complex<float>));
+  }
+
+  hoNDFFT<float>::instance()->fft(m3->getObjectPtr(),0);
+
+  m2->release(); //We are done with this data
+
+  m1->cont(m3);
+  m1->getObjectPtr()->number_of_samples = data_out_dims[0];
+  m1->getObjectPtr()->center_sample /= 2;
+
+  if (this->next()->putq(m1) == -1) {
+    ACE_ERROR_RETURN( (LM_ERROR, ACE_TEXT("%p\n"),
+                       ACE_TEXT("RemoveROOversamplingGadget::process, passing data on to next gadget")),
+                      GADGET_FAIL);
+  }
+  return GADGET_OK;
+}
+
+
+GADGET_FACTORY_DECLARE(RemoveROOversamplingGadget)
+}
diff --git a/gadgets/mri_core/RemoveROOversamplingGadget.h b/gadgets/mri_core/RemoveROOversamplingGadget.h
new file mode 100644
index 0000000..103422c
--- /dev/null
+++ b/gadgets/mri_core/RemoveROOversamplingGadget.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "gadgetron_mricore_export.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSMRICORE RemoveROOversamplingGadget :
+  public Gadget2<ISMRMRD::AcquisitionHeader,hoNDArray< std::complex<float> > >
+    {
+    public:
+      GADGET_DECLARE(RemoveROOversamplingGadget);
+      // process(): halves the readout length of each acquisition (implemented in the .cpp file)
+    protected:
+      virtual int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+			  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+    };  
+}
diff --git a/gadgets/mri_core/Spline.h b/gadgets/mri_core/Spline.h
new file mode 100644
index 0000000..3fbc058
--- /dev/null
+++ b/gadgets/mri_core/Spline.h
@@ -0,0 +1,129 @@
+/* "THE BEER-WARE LICENSE" (Revision 42): Devin Lane wrote this file. As long as you retain 
+ * this notice you can do whatever you want with this stuff. If we meet some day, and you
+ * think this stuff is worth it, you can buy me a beer in return. */
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+/** Cubic spline through the samples (x[i], y[i]), stored as one cubic polynomial per interval.
+ * X and Y must have operator +, -, *, /, and Y must be constructible from a scalar. */
+template <typename X, typename Y>
+  class Spline {
+ public:
+  /** An empty, invalid spline */
+  Spline() {}
+    
+  /** Builds the spline; on a size mismatch or fewer than three points it only prints an error and stays empty */
+  Spline(const std::vector<X>& x, const std::vector<Y>& y) {
+    if (x.size() != y.size()) {
+      std::cerr << "X and Y must be the same size " << std::endl;
+      return;
+    }
+        
+    if (x.size() < 3) {
+      std::cerr << "Must have at least three points for interpolation" << std::endl;
+      return;
+    }
+        
+    typedef typename std::vector<X>::difference_type size_type;
+    // difference_type is signed, which lets the backward loop below stop once j drops below zero
+    size_type n = y.size() - 1;
+    // n is the number of intervals; h[i] holds the width of interval i
+    std::vector<Y> b(n), d(n), a(n), c(n+1), l(n+1), u(n+1), z(n+1);
+    std::vector<X> h(n+1);
+
+    l[0] = Y(1);
+    u[0] = Y(0);
+    z[0] = Y(0);
+    h[0] = x[1] - x[0];
+    // forward pass: l, u and z at index i are computed from their values at index i-1
+    for (size_type i = 1; i < n; i++) {
+      h[i] = x[i+1] - x[i];
+      l[i] = Y(2 * (x[i+1] - x[i-1])) - Y(h[i-1]) * u[i-1];
+      u[i] = Y(h[i]) / l[i];
+      a[i] = (Y(3) / Y(h[i])) * (y[i+1] - y[i]) - (Y(3) / Y(h[i-1])) * (y[i] - y[i-1]);
+      z[i] = (a[i] - Y(h[i-1]) * z[i-1]) / l[i];
+    }
+            
+    l[n] = Y(1);
+    z[n] = c[n] = Y(0);
+    // backward pass: c[n] is fixed to zero, then c, b and d follow from index n-1 down to 0
+    for (size_type j = n-1; j >= 0; j--) {
+      c[j] = z[j] - u[j] * c[j+1];
+      b[j] = (y[j+1] - y[j]) / Y(h[j]) - (Y(h[j]) * (c[j+1] + Y(2) * c[j])) / Y(3);
+      d[j] = (c[j+1] - c[j]) / Y(3 * h[j]);
+    }
+    // one Element per interval: start x[i] and the coefficients y[i], b[i], c[i], d[i]
+    for (size_type i = 0; i < n; i++) {
+      mElements.push_back(Element(x[i], y[i], b[i], c[i], d[i]));
+    }        
+  }
+  virtual ~Spline() {}
+  /** Shorthand for interpolate(x) */
+  Y operator[](const X& x) const {
+    return interpolate(x);
+  }
+  /** Evaluates the spline at x; an empty spline returns Y() */
+  Y interpolate(const X&x) const {
+    if (mElements.size() == 0) return Y();
+    // lower_bound finds the first interval starting at or after x; step back to the one before it
+    typename std::vector<element_type>::const_iterator it;
+    it = std::lower_bound(mElements.begin(), mElements.end(), element_type(x));
+    if (it != mElements.begin()) {
+      it--;
+    }   
+            
+    return it->eval(x);
+  }
+  /** Shorthand for interpolate(xx) */
+  std::vector<Y> operator[](const std::vector<X>& xx) const {
+    return interpolate(xx);
+  }
+    
+  /* Evaluate at multiple locations, assuming xx is sorted ascending */
+  std::vector<Y> interpolate(const std::vector<X>& xx) const {
+    if (mElements.size() == 0) return std::vector<Y>(xx.size());
+    // it2 is carried over between queries, so each search resumes where the previous one ended
+    typename std::vector<X>::const_iterator it;
+    typename std::vector<element_type>::const_iterator it2;
+    it2 = mElements.begin();
+    std::vector<Y> ys;
+    for (it = xx.begin(); it != xx.end(); it++) {
+      it2 = std::lower_bound(it2, mElements.end(), element_type(*it));
+      if (it2 != mElements.begin()) {
+	it2--;
+      }
+                
+      ys.push_back(it2->eval(*it));
+    }
+
+    return ys;
+  }
+    
+ protected:
+  /** One spline interval: a cubic polynomial in (xx - x) with coefficients a, b, c, d */
+  class Element {
+  public:
+  Element(X _x) : x(_x) {} // key-only element used for searching; the coefficients are not set
+    Element(X _x, Y _a, Y _b, Y _c, Y _d)
+      : x(_x), a(_a), b(_b), c(_c), d(_d) {}
+    // evaluates a + b*t + c*t^2 + d*t^3 with t = xx - x
+    Y eval(const X& xx) const {
+      X xix(xx - x);
+      return a + b * xix + c * (xix * xix) + d * (xix * xix * xix);
+    }
+    // elements are ordered by their start x; std::lower_bound relies on this comparison
+    bool operator<(const Element& e) const {
+      return x < e.x;
+    }
+    bool operator<(const X& xx) const {
+      return x < xx;
+    }
+        
+    X x;
+    Y a, b, c, d;
+  };
+            
+  typedef Element element_type;
+  std::vector<element_type> mElements;
+};
diff --git a/gadgets/mri_core/default.xml b/gadgets/mri_core/default.xml
new file mode 100644
index 0000000..b0f32a4
--- /dev/null
+++ b/gadgets/mri_core/default.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+  
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/mri_core/default_optimized.xml b/gadgets/mri_core/default_optimized.xml
new file mode 100644
index 0000000..4bf7adc
--- /dev/null
+++ b/gadgets/mri_core/default_optimized.xml
@@ -0,0 +1,117 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>Acc</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AccumulatorGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>FFT</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FFTGadget</classname>
+  </gadget>
+
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>CropCombine</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CropAndCombineGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->
+  
+  <!--
+      <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+      </gadget>
+  -->
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+
+  <gadget>
+    <name>ImageFinishFLOAT</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetFLOAT</classname>
+  </gadget>
+
+  <!--
+      <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+      </gadget>
+  -->
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/mri_core/default_short.xml b/gadgets/mri_core/default_short.xml
new file mode 100644
index 0000000..098c6d3
--- /dev/null
+++ b/gadgets/mri_core/default_short.xml
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+ 
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+ 
+     <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/mri_core/gadgetron_mricore_export.h b/gadgets/mri_core/gadgetron_mricore_export.h
new file mode 100644
index 0000000..68e869e
--- /dev/null
+++ b/gadgets/mri_core/gadgetron_mricore_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_MRICORE_EXPORT_H_
+#define GADGETRON_MRICORE_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_MRICORE__)
+#define EXPORTGADGETSMRICORE __declspec(dllexport)
+#else
+#define EXPORTGADGETSMRICORE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSMRICORE
+#endif
+
+#endif /* GADGETRON_MRICORE_EXPORT_H_ */
diff --git a/gadgets/mri_core/ismrmrd_dump.xml b/gadgets/mri_core/ismrmrd_dump.xml
new file mode 100644
index 0000000..4f79ba6
--- /dev/null
+++ b/gadgets/mri_core/ismrmrd_dump.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>IsmrmrdDump</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>IsmrmrdDumpGadget</classname>
+    <property><name>file_prefix</name><value>ISMRMRD_DUMP</value></property>
+    <property><name>append_timestamp</name><value>1</value></property>
+  </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/octave/CMakeLists.txt b/gadgets/octave/CMakeLists.txt
new file mode 100644
index 0000000..64725e4
--- /dev/null
+++ b/gadgets/octave/CMakeLists.txt
@@ -0,0 +1,43 @@
+find_package(Ismrmrd REQUIRED)
+find_package(Octave REQUIRED)
+
+link_directories(${OCTAVE_LINK_DIRS})
+
+add_library(gadgetron_octavecommunicator SHARED OctaveCommunicator.cpp)
+target_link_libraries(gadgetron_octavecommunicator ${OCTAVE_LIBRARY} optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY})
+
+add_library(GadgetronReturnIsmrmrdAcquisition MODULE GadgetronReturnIsmrmrdAcquisition.cpp)
+SET_TARGET_PROPERTIES(GadgetronReturnIsmrmrdAcquisition PROPERTIES SUFFIX .oct PREFIX "")
+target_link_libraries(GadgetronReturnIsmrmrdAcquisition ${OCTAVE_LIBRARY} gadgetron_octavecommunicator optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY})
+
+add_library(XMLGetXPath MODULE pugixml.cpp XMLGetXPath.cpp)
+SET_TARGET_PROPERTIES(XMLGetXPath PROPERTIES SUFFIX .oct PREFIX "")
+target_link_libraries(XMLGetXPath ${OCTAVE_LIBRARY})
+
+add_library(GadgetronReturnIsmrmrdImage MODULE GadgetronReturnIsmrmrdImage.cpp)
+SET_TARGET_PROPERTIES(GadgetronReturnIsmrmrdImage PROPERTIES SUFFIX .oct PREFIX "")
+target_link_libraries(GadgetronReturnIsmrmrdImage ${OCTAVE_LIBRARY} gadgetron_octavecommunicator optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY})
+
+add_library(gadgetron_octave SHARED OctaveGadget.cpp)
+target_link_libraries(gadgetron_octave ${OCTAVE_LIBRARY} gadgetron_octavecommunicator cpucore)
+	
+install(TARGETS gadgetron_octavecommunicator DESTINATION lib)
+install(TARGETS gadgetron_octave DESTINATION lib)
+install(TARGETS GadgetronReturnIsmrmrdAcquisition DESTINATION octave)
+install(TARGETS GadgetronReturnIsmrmrdImage DESTINATION octave)
+install(TARGETS XMLGetXPath DESTINATION octave)
+
+install(FILES octave/my_gadget_reference.m 
+              octave/my_recon_function.m 
+              octave/my_config_function.m 
+              octave/gadget_reference_downsample_2x.m 
+              octave/downsample_2x.m 
+              octave/configure_downsample_2x.m 
+              octave/ismrm_transform_kspace_to_image.m 
+              octave/ismrm_transform_image_to_kspace.m 
+              octave/gadget_reference_accumulator.m 
+              octave/accumulator.m 
+              octave/configure_accumulator.m 
+        DESTINATION octave)
+
+install(FILES octave.xml DESTINATION config)
diff --git a/gadgets/octave/GadgetronReturnIsmrmrdAcquisition.cpp b/gadgets/octave/GadgetronReturnIsmrmrdAcquisition.cpp
new file mode 100644
index 0000000..d3e2129
--- /dev/null
+++ b/gadgets/octave/GadgetronReturnIsmrmrdAcquisition.cpp
@@ -0,0 +1,136 @@
+#include <octave/oct.h>
+#include <octave/ov-struct.h>
+
+#include "GadgetContainerMessage.h"
+#include "ismrmrd.h"
+#include "hoNDArray.h"
+#include "OctaveCommunicator.h"
+
+using namespace Gadgetron;
+
+DEFUN_DLD (GadgetronReturnIsmrmrdAcquisition, args, nargout,
+           "GadgetronReturnIsmrmrdAcquisition Returns Acquisition to the Gadgetron")
+{
+  // Octave usage: GadgetronReturnIsmrmrdAcquisition(gadget_id, header_struct, data).
+  // Rebuilds an ISMRMRD::AcquisitionHeader from header_struct, copies data into a
+  // complex-float hoNDArray and queues both on the gadget registered as gadget_id.
+  int nargin = args.length ();
+  if (nargin != 3) {
+    print_usage();
+  } else {
+    std::string id(args(0).string_value());
+    Octave_map h(args(1).map_value());
+    FloatComplexNDArray d(args(2).complex_array_value());
+
+    GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1 =
+        new GadgetContainerMessage<ISMRMRD::AcquisitionHeader>();
+
+    ISMRMRD::AcquisitionHeader* head = m1->getObjectPtr();
+    head->version = octave_value(h.contents("version")(0)).uint16_scalar_value();
+    head->flags = octave_value(h.contents("flags")(0)).uint64_scalar_value();
+    head->measurement_uid = octave_value(h.contents("measurement_uid")(0)).uint32_scalar_value();
+    head->scan_counter = octave_value(h.contents("scan_counter")(0)).uint32_scalar_value();
+    head->acquisition_time_stamp = octave_value(h.contents("acquisition_time_stamp")(0)).uint32_scalar_value();
+    head->physiology_time_stamp[0] = octave_value(h.contents("physiology_time_stamp")(0)).uint32_array_value()(0);
+    head->physiology_time_stamp[1] = octave_value(h.contents("physiology_time_stamp")(0)).uint32_array_value()(1);
+    head->physiology_time_stamp[2] = octave_value(h.contents("physiology_time_stamp")(0)).uint32_array_value()(2);
+    head->number_of_samples = octave_value(h.contents("number_of_samples")(0)).uint16_scalar_value();
+    head->available_channels = octave_value(h.contents("available_channels")(0)).uint16_scalar_value();
+    head->active_channels = octave_value(h.contents("active_channels")(0)).uint16_scalar_value();
+    head->channel_mask[0] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(0);
+    head->channel_mask[1] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(1);
+    head->channel_mask[2] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(2);
+    head->channel_mask[3] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(3);
+    head->channel_mask[4] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(4);
+    head->channel_mask[5] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(5);
+    head->channel_mask[6] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(6);
+    head->channel_mask[7] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(7);
+    head->channel_mask[8] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(8);
+    head->channel_mask[9] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(9);
+    head->channel_mask[10] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(10);
+    head->channel_mask[11] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(11);
+    head->channel_mask[12] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(12);
+    head->channel_mask[13] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(13);
+    head->channel_mask[14] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(14);
+    head->channel_mask[15] = octave_value(h.contents("channel_mask")(0)).uint64_array_value()(15);
+    head->discard_pre = octave_value(h.contents("discard_pre")(0)).uint16_scalar_value();
+    head->discard_post = octave_value(h.contents("discard_post")(0)).uint16_scalar_value();
+    head->center_sample = octave_value(h.contents("center_sample")(0)).uint16_scalar_value();
+    head->encoding_space_ref = octave_value(h.contents("encoding_space_ref")(0)).uint16_scalar_value();
+    head->trajectory_dimensions = octave_value(h.contents("trajectory_dimensions")(0)).uint16_scalar_value();
+    head->sample_time_us = octave_value(h.contents("sample_time_us")(0)).float_scalar_value();
+    head->position[0] = octave_value(h.contents("position")(0)).float_array_value()(0);
+    head->position[1] = octave_value(h.contents("position")(0)).float_array_value()(1);
+    head->position[2] = octave_value(h.contents("position")(0)).float_array_value()(2);
+    head->read_dir[0] = octave_value(h.contents("read_dir")(0)).float_array_value()(0);
+    head->read_dir[1] = octave_value(h.contents("read_dir")(0)).float_array_value()(1);
+    head->read_dir[2] = octave_value(h.contents("read_dir")(0)).float_array_value()(2);
+    head->phase_dir[0] = octave_value(h.contents("phase_dir")(0)).float_array_value()(0);
+    head->phase_dir[1] = octave_value(h.contents("phase_dir")(0)).float_array_value()(1);
+    head->phase_dir[2] = octave_value(h.contents("phase_dir")(0)).float_array_value()(2);
+    // slice_dir must come from the "slice_dir" field, not from "read_dir".
+    head->slice_dir[0] = octave_value(h.contents("slice_dir")(0)).float_array_value()(0);
+    head->slice_dir[1] = octave_value(h.contents("slice_dir")(0)).float_array_value()(1);
+    head->slice_dir[2] = octave_value(h.contents("slice_dir")(0)).float_array_value()(2);
+    head->patient_table_position[0] = octave_value(h.contents("patient_table_position")(0)).float_array_value()(0);
+    head->patient_table_position[1] = octave_value(h.contents("patient_table_position")(0)).float_array_value()(1);
+    head->patient_table_position[2] = octave_value(h.contents("patient_table_position")(0)).float_array_value()(2);
+    head->idx.kspace_encode_step_1 = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("kspace_encode_step_1")(0)).uint16_scalar_value();
+    head->idx.kspace_encode_step_2 = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("kspace_encode_step_2")(0)).uint16_scalar_value();
+    head->idx.average              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("average")(0)).uint16_scalar_value();
+    head->idx.slice                = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("slice")(0)).uint16_scalar_value();
+    head->idx.contrast             = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("contrast")(0)).uint16_scalar_value();
+    head->idx.phase                = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("phase")(0)).uint16_scalar_value();
+    head->idx.repetition           = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("repetition")(0)).uint16_scalar_value();
+    head->idx.set                  = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("set")(0)).uint16_scalar_value();
+    head->idx.segment              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("segment")(0)).uint16_scalar_value();
+    head->idx.user[0]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(0);
+    head->idx.user[1]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(1);
+    head->idx.user[2]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(2);
+    head->idx.user[3]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(3);
+    head->idx.user[4]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(4);
+    head->idx.user[5]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(5);
+    head->idx.user[6]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(6);
+    head->idx.user[7]              = octave_value(octave_value(h.contents("idx")(0)).map_value().contents("user")(0)).uint16_array_value()(7);
+    head->user_int[0]              = octave_value(h.contents("user_int")(0)).int32_array_value()(0);
+    head->user_int[1]              = octave_value(h.contents("user_int")(0)).int32_array_value()(1);
+    head->user_int[2]              = octave_value(h.contents("user_int")(0)).int32_array_value()(2);
+    head->user_int[3]              = octave_value(h.contents("user_int")(0)).int32_array_value()(3);
+    head->user_int[4]              = octave_value(h.contents("user_int")(0)).int32_array_value()(4);
+    head->user_int[5]              = octave_value(h.contents("user_int")(0)).int32_array_value()(5);
+    head->user_int[6]              = octave_value(h.contents("user_int")(0)).int32_array_value()(6);
+    head->user_int[7]              = octave_value(h.contents("user_int")(0)).int32_array_value()(7);
+    // user_float holds floats: read it as a float array so fractional parts survive.
+    head->user_float[0]            = octave_value(h.contents("user_float")(0)).float_array_value()(0);
+    head->user_float[1]            = octave_value(h.contents("user_float")(0)).float_array_value()(1);
+    head->user_float[2]            = octave_value(h.contents("user_float")(0)).float_array_value()(2);
+    head->user_float[3]            = octave_value(h.contents("user_float")(0)).float_array_value()(3);
+    head->user_float[4]            = octave_value(h.contents("user_float")(0)).float_array_value()(4);
+    head->user_float[5]            = octave_value(h.contents("user_float")(0)).float_array_value()(5);
+    head->user_float[6]            = octave_value(h.contents("user_float")(0)).float_array_value()(6);
+    head->user_float[7]            = octave_value(h.contents("user_float")(0)).float_array_value()(7);
+
+    GadgetContainerMessage< hoNDArray<std::complex<float> > >* m2 =
+        new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+    std::vector<unsigned int> dims;
+    for (unsigned int i = 0; i < d.dims().length(); i++) {
+      dims.push_back(d.dims()(i));
+    }
+
+    try {
+      m2->getObjectPtr()->create(&dims);
+    } catch (...) {
+      GADGET_DEBUG1("Failed to allocate return array\n");
+      m1->release();
+      m2->release(); // m2 is not chained to m1 yet, so it needs its own release
+      return octave_value_list (); // nothing can be sent without a data buffer
+    }
+
+    memcpy(m2->getObjectPtr()->get_data_ptr(), &d(0), sizeof(float)*2*d.nelem());
+
+    m1->cont(m2); // header message followed by the data message
+    OctaveCommunicator::instance()->message_gadget(id, m1);
+  }
+  return octave_value_list ();
+}
diff --git a/gadgets/octave/GadgetronReturnIsmrmrdImage.cpp b/gadgets/octave/GadgetronReturnIsmrmrdImage.cpp
new file mode 100644
index 0000000..7d8c8b6
--- /dev/null
+++ b/gadgets/octave/GadgetronReturnIsmrmrdImage.cpp
@@ -0,0 +1,108 @@
+#include <octave/oct.h>
+#include <octave/ov-struct.h>
+
+#include "OctaveCommunicator.h"
+#include "ismrmrd.h"
+#include "hoNDArray.h"
+
+using namespace Gadgetron;
+
+DEFUN_DLD (GadgetronReturnIsmrmrdImage, args, nargout,
+           "GadgetronReturnIsmrmrdImage return Image to the Gadgetron")
+{
+  // Octave usage: GadgetronReturnIsmrmrdImage(gadget_id, header_struct, data).
+  // Rebuilds an ISMRMRD::ImageHeader from header_struct, copies data into a
+  // complex-float hoNDArray and queues both on the gadget registered as gadget_id.
+  int nargin = args.length ();
+  if (nargin != 3) {
+    print_usage();
+  } else {
+    std::string id(args(0).string_value());
+    Octave_map h(args(1).map_value());
+    FloatComplexNDArray d(args(2).complex_array_value());
+
+    GadgetContainerMessage<ISMRMRD::ImageHeader>* m1 =
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+    ISMRMRD::ImageHeader* head = m1->getObjectPtr();
+    head->version = octave_value(h.contents("version")(0)).uint16_scalar_value();
+    head->flags = octave_value(h.contents("flags")(0)).uint64_scalar_value();
+    head->measurement_uid = octave_value(h.contents("measurement_uid")(0)).uint32_scalar_value();
+    head->matrix_size[0] = octave_value(h.contents("matrix_size")(0)).uint16_array_value()(0);
+    head->matrix_size[1] = octave_value(h.contents("matrix_size")(0)).uint16_array_value()(1);
+    head->matrix_size[2] = octave_value(h.contents("matrix_size")(0)).uint16_array_value()(2);
+    head->field_of_view[0] = octave_value(h.contents("field_of_view")(0)).float_array_value()(0);
+    head->field_of_view[1] = octave_value(h.contents("field_of_view")(0)).float_array_value()(1);
+    head->field_of_view[2] = octave_value(h.contents("field_of_view")(0)).float_array_value()(2);
+    head->channels = octave_value(h.contents("channels")(0)).uint16_scalar_value();
+    head->position[0] = octave_value(h.contents("position")(0)).float_array_value()(0);
+    head->position[1] = octave_value(h.contents("position")(0)).float_array_value()(1);
+    head->position[2] = octave_value(h.contents("position")(0)).float_array_value()(2);
+    head->read_dir[0] = octave_value(h.contents("read_dir")(0)).float_array_value()(0);
+    head->read_dir[1] = octave_value(h.contents("read_dir")(0)).float_array_value()(1);
+    head->read_dir[2] = octave_value(h.contents("read_dir")(0)).float_array_value()(2);
+    head->phase_dir[0] = octave_value(h.contents("phase_dir")(0)).float_array_value()(0);
+    head->phase_dir[1] = octave_value(h.contents("phase_dir")(0)).float_array_value()(1);
+    head->phase_dir[2] = octave_value(h.contents("phase_dir")(0)).float_array_value()(2);
+    head->slice_dir[0] = octave_value(h.contents("slice_dir")(0)).float_array_value()(0);
+    head->slice_dir[1] = octave_value(h.contents("slice_dir")(0)).float_array_value()(1);
+    head->slice_dir[2] = octave_value(h.contents("slice_dir")(0)).float_array_value()(2);
+    head->patient_table_position[0] = octave_value(h.contents("patient_table_position")(0)).float_array_value()(0);
+    head->patient_table_position[1] = octave_value(h.contents("patient_table_position")(0)).float_array_value()(1);
+    head->patient_table_position[2] = octave_value(h.contents("patient_table_position")(0)).float_array_value()(2);
+    head->average = octave_value(h.contents("average")(0)).uint16_scalar_value();
+    head->slice = octave_value(h.contents("slice")(0)).uint16_scalar_value();
+    head->contrast = octave_value(h.contents("contrast")(0)).uint16_scalar_value();
+    head->phase = octave_value(h.contents("phase")(0)).uint16_scalar_value();
+    head->repetition = octave_value(h.contents("repetition")(0)).uint16_scalar_value();
+    head->set = octave_value(h.contents("set")(0)).uint16_scalar_value();
+    head->acquisition_time_stamp = octave_value(h.contents("acquisition_time_stamp")(0)).uint32_scalar_value();
+    head->physiology_time_stamp[0] = octave_value(h.contents("physiology_time_stamp")(0)).uint32_array_value()(0);
+    head->physiology_time_stamp[1] = octave_value(h.contents("physiology_time_stamp")(0)).uint32_array_value()(1);
+    head->physiology_time_stamp[2] = octave_value(h.contents("physiology_time_stamp")(0)).uint32_array_value()(2);
+    head->image_data_type          = octave_value(h.contents("image_data_type")(0)).uint16_scalar_value();
+    head->image_type               = octave_value(h.contents("image_type")(0)).uint16_scalar_value(); // each type/index field has its own key
+    head->image_index              = octave_value(h.contents("image_index")(0)).uint16_scalar_value();
+    head->image_series_index       = octave_value(h.contents("image_series_index")(0)).uint16_scalar_value();
+    head->user_int[0]              = octave_value(h.contents("user_int")(0)).int32_array_value()(0);
+    head->user_int[1]              = octave_value(h.contents("user_int")(0)).int32_array_value()(1);
+    head->user_int[2]              = octave_value(h.contents("user_int")(0)).int32_array_value()(2);
+    head->user_int[3]              = octave_value(h.contents("user_int")(0)).int32_array_value()(3);
+    head->user_int[4]              = octave_value(h.contents("user_int")(0)).int32_array_value()(4);
+    head->user_int[5]              = octave_value(h.contents("user_int")(0)).int32_array_value()(5);
+    head->user_int[6]              = octave_value(h.contents("user_int")(0)).int32_array_value()(6);
+    head->user_int[7]              = octave_value(h.contents("user_int")(0)).int32_array_value()(7);
+    // user_float holds floats: read it as a float array so fractional parts survive.
+    head->user_float[0]            = octave_value(h.contents("user_float")(0)).float_array_value()(0);
+    head->user_float[1]            = octave_value(h.contents("user_float")(0)).float_array_value()(1);
+    head->user_float[2]            = octave_value(h.contents("user_float")(0)).float_array_value()(2);
+    head->user_float[3]            = octave_value(h.contents("user_float")(0)).float_array_value()(3);
+    head->user_float[4]            = octave_value(h.contents("user_float")(0)).float_array_value()(4);
+    head->user_float[5]            = octave_value(h.contents("user_float")(0)).float_array_value()(5);
+    head->user_float[6]            = octave_value(h.contents("user_float")(0)).float_array_value()(6);
+    head->user_float[7]            = octave_value(h.contents("user_float")(0)).float_array_value()(7);
+
+    GadgetContainerMessage< hoNDArray<std::complex<float> > >* m2 =
+        new GadgetContainerMessage< hoNDArray<std::complex<float> > >();
+
+    std::vector<unsigned int> dims;
+    for (unsigned int i = 0; i < d.dims().length(); i++) {
+      dims.push_back(d.dims()(i));
+    }
+
+    try {
+      m2->getObjectPtr()->create(&dims);
+    } catch (...) {
+      GADGET_DEBUG1("Failed to allocate return array\n");
+      m1->release();
+      m2->release(); // m2 is not chained to m1 yet, so it needs its own release
+      return octave_value_list (); // nothing can be sent without a data buffer
+    }
+
+    memcpy(m2->getObjectPtr()->get_data_ptr(), &d(0), sizeof(float)*2*d.nelem());
+
+    m1->cont(m2); // header message followed by the data message
+    OctaveCommunicator::instance()->message_gadget(id, m1);
+  }
+  return octave_value_list ();
+}
diff --git a/gadgets/octave/OctaveCommunicator.cpp b/gadgets/octave/OctaveCommunicator.cpp
new file mode 100644
index 0000000..c0aa5f2
--- /dev/null
+++ b/gadgets/octave/OctaveCommunicator.cpp
@@ -0,0 +1,68 @@
+#include "OctaveCommunicator.h"
+
+
+#include <iostream>
+
+using namespace Gadgetron;
+
+OctaveCommunicator* OctaveCommunicator::instance()
+{
+  // Returns the shared instance, constructing it the first time it is requested.
+  return (instance_ != NULL) ? instance_ : (instance_ = new OctaveCommunicator());
+}
+
+OctaveCommunicator::OctaveCommunicator()
+  : mutex_("OctaveCommunicatorMutex")
+{
+  // Start the embedded Octave interpreter; argv[0] is only a placeholder name.
+  const char * argvv [] = {"" /* name of program, not relevant */, "--silent"};
+  octave_main (2, (char **) argvv, true /* embedded */);
+  // Add $GADGETRON_HOME/octave to the Octave path. getenv() yields NULL when the
+  // variable is unset and std::string(NULL) is undefined behaviour, so skip then.
+  const char* gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+  if (gadgetron_home != NULL) {
+    const std::string path_name = std::string(gadgetron_home) + std::string("/octave");
+    octave_value_list in = octave_value (path_name.c_str());
+    feval ("addpath", in, 1);
+  }
+}
+// Destructor: the body is empty, so no explicit cleanup is performed here.
+OctaveCommunicator::~OctaveCommunicator()
+{
+}
+
+void OctaveCommunicator::register_gadget(Gadget* g)
+{
+  if (!g) return; // callers may pass a lookup result; a NULL gadget cannot be registered
+  // Stores g under its module name; the guard holds mutex_ until the function exits.
+  ACE_Guard<ACE_Thread_Mutex> guard(mutex_);
+  gadget_map_[g->module()->name()] = g;
+}
+
+bool OctaveCommunicator::message_gadget(std::string gadget, ACE_Message_Block* m)
+{
+  // Queues m on the gadget registered as `gadget`. Returns true only if it was
+  // queued; on every failure m is released here so callers need not clean up.
+  // NOTE(review): mutex_ is not taken here; presumably this runs from Octave code under octave_feval(), which already holds it - confirm before adding a lock.
+  std::map<std::string, Gadget*>::iterator it = gadget_map_.find(gadget);
+  if (it == gadget_map_.end()) {
+    std::cout << "Gadget with ID = " << gadget << " NOT FOUND!" << std::endl;
+    m->release();
+    return false;
+  }
+  if (it->second->putq(m) < 0) {
+    m->release(); // the message was not queued, so it is still ours to free
+    return false;
+  }
+  return true;
+}
+
+octave_value_list OctaveCommunicator::octave_feval (const std::string &name, const octave_value_list &args, int nargout)
+{
+  // Calls the Octave function `name` with `args` and returns its outputs.
+  // mutex_ is held for the whole feval() call; the scoped guard releases it on
+  // every exit path, including when feval() throws.
+  ACE_Guard<ACE_Thread_Mutex> guard(mutex_);
+  return feval(name, args, nargout);
+}
+// Storage for the singleton pointer; NULL until instance() creates the object.
+OctaveCommunicator* OctaveCommunicator::instance_ = NULL;
diff --git a/gadgets/octave/OctaveCommunicator.h b/gadgets/octave/OctaveCommunicator.h
new file mode 100644
index 0000000..2fd2145
--- /dev/null
+++ b/gadgets/octave/OctaveCommunicator.h
@@ -0,0 +1,40 @@
+#ifndef OCTAVECOMMUNICATOR_H
+#define OCTAVECOMMUNICATOR_H
+
+#include <ace/Synch.h>
+#include <ace/Mutex.h>
+
+#include <octave/oct.h>
+#include <octave/octave.h>
+#include <octave/parse.h>
+
+
+#include "Gadget.h"
+
+#include <map>
+#include <string>
+
+#include "gadgetronoctavecommunicator_export.h"
+// Singleton wrapping calls into the embedded Octave interpreter plus a name -> Gadget registry.
+class EXPORTGADGETSOCTAVECOMMUNICATOR OctaveCommunicator
+{
+  // Obtain the object through instance(); constructor and destructor are private.
+ public:
+  static OctaveCommunicator* instance(); 
+  // Gadget registration, message delivery by gadget name, and Octave function calls.
+  void register_gadget(Gadgetron::Gadget* g);
+  bool message_gadget(std::string g, ACE_Message_Block* m);
+  octave_value_list octave_feval (const std::string &name, const octave_value_list &args=octave_value_list(), int nargout=0);
+
+ private:
+  OctaveCommunicator();
+  ~OctaveCommunicator();
+  // Singleton pointer, and the mutex taken by register_gadget() and octave_feval().
+  static OctaveCommunicator* instance_;
+  ACE_Thread_Mutex mutex_;
+  // Registered gadgets, keyed by their module name.
+  std::map<std::string, Gadgetron::Gadget*> gadget_map_;
+};
+
+
+#endif
diff --git a/gadgets/octave/OctaveGadget.cpp b/gadgets/octave/OctaveGadget.cpp
new file mode 100644
index 0000000..ea63574
--- /dev/null
+++ b/gadgets/octave/OctaveGadget.cpp
@@ -0,0 +1,232 @@
+#include "OctaveGadget.h"
+
+namespace Gadgetron {
+
+ int AcquisitionOctaveGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+              GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+ {
+   // Converts the acquisition header (m1) to an Octave struct, copies the samples (m2)
+   // into an Octave array and calls the configured input function with both.
+   // First wait while the downstream queue is full. NOTE(review): the original note (copied from the Python gadget) says this avoids a deadlock when the call emits output - confirm for Octave.
+   while (this->next()->msg_queue()->is_full()) {
+     // Simple busy wait: sleep 10 ms while the downstream Gadget does some work.
+     ACE_Time_Value tv(0,10000);
+     ACE_OS::sleep(tv);
+   }
+
+   Octave_map m;
+   ISMRMRD::AcquisitionHeader h = *m1->getObjectPtr();
+
+   m.assign("version",                    h.version);
+   m.assign("flags",                      h.flags);
+   m.assign("measurement_uid",            h.measurement_uid);
+   m.assign("scan_counter",               h.scan_counter);
+   m.assign("acquisition_time_stamp",     h.acquisition_time_stamp);
+
+   dim_vector d(1); d(0) = 3;
+   uint32NDArray phys_time(d);
+   memcpy(&phys_time(0),h.physiology_time_stamp,sizeof(uint32_t)*3);
+   m.assign("physiology_time_stamp",            octave_value(phys_time));
+
+   m.assign("number_of_samples",                h.number_of_samples);
+   m.assign("available_channels",               h.available_channels);
+   m.assign("active_channels",                  h.active_channels);
+
+   d(0) = 16;
+   uint64NDArray channel_mask(d);
+   memcpy(&channel_mask(0),h.channel_mask,sizeof(uint64_t)*16);
+   m.assign("channel_mask",                     octave_value(channel_mask));
+
+   m.assign("discard_pre",                      h.discard_pre);
+   m.assign("discard_post",                     h.discard_post);
+   m.assign("center_sample",                    h.center_sample);
+   m.assign("encoding_space_ref",               h.encoding_space_ref);
+   m.assign("trajectory_dimensions",            h.trajectory_dimensions);
+   m.assign("sample_time_us",                   h.sample_time_us);
+
+   d(0) = 3;
+   FloatNDArray position(d);
+   memcpy(&position(0),h.position,sizeof(float)*3);
+   m.assign("position",                         octave_value(position));
+
+   d(0) = 3;
+   FloatNDArray read_dir(d);
+   memcpy(&read_dir(0),h.read_dir,sizeof(float)*3);
+   m.assign("read_dir",            octave_value(read_dir));
+
+   d(0) = 3;
+   FloatNDArray phase_dir(d);
+   memcpy(&phase_dir(0),h.phase_dir,sizeof(float)*3);
+   m.assign("phase_dir",            octave_value(phase_dir));
+
+   d(0) = 3;
+   FloatNDArray slice_dir(d);
+   memcpy(&slice_dir(0),h.slice_dir,sizeof(float)*3);
+   m.assign("slice_dir",            octave_value(slice_dir));
+
+   d(0) = 3;
+   FloatNDArray patient_table_position(d);
+   memcpy(&patient_table_position(0),h.patient_table_position,sizeof(float)*3);
+   m.assign("patient_table_position",         octave_value(patient_table_position));
+
+   Octave_map idx;
+   // The encoding counters go into a nested "idx" struct, one field per counter.
+   idx.assign("kspace_encode_step_1",       h.idx.kspace_encode_step_1);
+   idx.assign("kspace_encode_step_2",       h.idx.kspace_encode_step_2);
+   idx.assign("average",                    h.idx.average);
+   idx.assign("slice",                      h.idx.slice);
+   idx.assign("contrast",                   h.idx.contrast);
+   idx.assign("phase",                      h.idx.phase);
+   idx.assign("repetition",                 h.idx.repetition); // each key gets its own counter
+   idx.assign("set",                        h.idx.set);
+   idx.assign("segment",                    h.idx.segment);
+
+   d(0) = 8;
+   uint16NDArray user(d);
+   memcpy(&user(0),h.idx.user,sizeof(uint16_t)*8);
+   idx.assign("user",                    octave_value(user));
+   m.assign("idx",                         octave_value(idx));
+
+   d(0) = 8;
+   int32NDArray user_int(d);
+   memcpy(&user_int(0),h.user_int,sizeof(int32_t)*8);
+   m.assign("user_int",                         octave_value(user_int));
+
+   d(0) = 8;
+   FloatNDArray user_float(d);
+   memcpy(&user_float(0),h.user_float,sizeof(float)*8);
+   m.assign("user_float",                         octave_value(user_float));
+
+   //Make a copy of the data for sending to Octave.
+   const unsigned int ndim = m2->getObjectPtr()->get_number_of_dimensions();
+   dim_vector dims;
+   // A default dim_vector has two entries; grow it before writing any further ones.
+   if (ndim > 2) dims.resize(static_cast<int>(ndim));
+   for (unsigned int i = 0; i < ndim; i++) {
+     dims(i) = m2->getObjectPtr()->get_size(i);
+   }
+   FloatComplexNDArray data(dims);
+   memcpy(data.fortran_vec(),m2->getObjectPtr()->get_data_ptr(),sizeof(float)*2*data.nelem());
+
+   octave_value_list in;
+   in(0) = m;
+   in(1) = data;
+
+   octave_value_list out = OctaveCommunicator::instance()->octave_feval (datafunc_->c_str(), in, 2);
+
+   //We are now done with the data
+   m1->release();
+
+   return GADGET_OK;
+ }
+
+ int ImageOctaveGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+              GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+ {
+   // Converts the image header (m1) to an Octave struct, copies the pixels (m2)
+   // into an Octave array and calls the configured input function with both.
+   // First wait while the downstream queue is full. NOTE(review): the original note (copied from the Python gadget) says this avoids a deadlock when the call emits output - confirm for Octave.
+   while (this->next()->msg_queue()->is_full()) {
+     // Simple busy wait: sleep 10 ms while the downstream Gadget does some work.
+     ACE_Time_Value tv(0,10000);
+     ACE_OS::sleep(tv);
+   }
+
+   Octave_map m;
+   ISMRMRD::ImageHeader h = *m1->getObjectPtr();
+
+   m.assign("version",                    h.version);
+   m.assign("flags",                      h.flags);
+   m.assign("measurement_uid",            h.measurement_uid);
+
+   dim_vector d(1);
+   d(0) = 3;
+   uint16NDArray matrix_size(d);
+   memcpy(&matrix_size(0),h.matrix_size,sizeof(uint16_t)*3);
+   m.assign("matrix_size",            octave_value(matrix_size));
+
+   d(0) = 3;
+   FloatNDArray field_of_view(d);
+   memcpy(&field_of_view(0),h.field_of_view,sizeof(float)*3);
+   m.assign("field_of_view",            octave_value(field_of_view));
+
+   m.assign("channels",                    h.channels);
+
+   d(0) = 3;
+   FloatNDArray position(d);
+   memcpy(&position(0),h.position,sizeof(float)*3);
+   m.assign("position",            octave_value(position));
+
+   d(0) = 3;
+   FloatNDArray read_dir(d);
+   memcpy(&read_dir(0),h.read_dir,sizeof(float)*3);
+   m.assign("read_dir",            octave_value(read_dir));
+
+   d(0) = 3;
+   FloatNDArray phase_dir(d);
+   memcpy(&phase_dir(0),h.phase_dir,sizeof(float)*3);
+   m.assign("phase_dir",            octave_value(phase_dir));
+
+   d(0) = 3;
+   FloatNDArray slice_dir(d);
+   memcpy(&slice_dir(0),h.slice_dir,sizeof(float)*3);
+   m.assign("slice_dir",            octave_value(slice_dir));
+
+   d(0) = 3;
+   FloatNDArray patient_table_position(d);
+   memcpy(&patient_table_position(0),h.patient_table_position,sizeof(float)*3);
+   m.assign("patient_table_position",            octave_value(patient_table_position));
+
+   m.assign("average",                    h.average);
+   m.assign("slice",                    h.slice);
+   m.assign("contrast",                    h.contrast);
+   m.assign("phase",                    h.phase);
+   m.assign("repetition",                    h.repetition);
+   m.assign("set",                    h.set);
+   m.assign("acquisition_time_stamp",     h.acquisition_time_stamp); // GadgetronReturnIsmrmrdImage reads this field back
+   d(0) = 3;
+   uint32NDArray physiology_time_stamp(d);
+   memcpy(&physiology_time_stamp(0),h.physiology_time_stamp,sizeof(uint32_t)*3);
+   m.assign("physiology_time_stamp",            octave_value(physiology_time_stamp));
+
+   m.assign("image_data_type",                    h.image_data_type);
+   m.assign("image_type",                    h.image_type);
+   m.assign("image_index",                    h.image_index);
+   m.assign("image_series_index",            h.image_series_index);
+
+   d(0) = 8;
+   int32NDArray user_int(d);
+   memcpy(&user_int(0),h.user_int,sizeof(int32_t)*8);
+   m.assign("user_int",                         octave_value(user_int));
+
+   d(0) = 8;
+   FloatNDArray user_float(d);
+   memcpy(&user_float(0),h.user_float,sizeof(float)*8);
+   m.assign("user_float",                         octave_value(user_float));
+
+   const unsigned int ndim = m2->getObjectPtr()->get_number_of_dimensions();
+   dim_vector dims;
+   // A default dim_vector has two entries; grow it before writing any further ones.
+   if (ndim > 2) dims.resize(static_cast<int>(ndim));
+   for (unsigned int i = 0; i < ndim; i++) {
+     dims(i) = m2->getObjectPtr()->get_size(i);
+   }
+   FloatComplexNDArray data(dims);
+   memcpy(&data(0),m2->getObjectPtr()->get_data_ptr(),sizeof(float)*2*data.nelem());
+
+   octave_value_list in;
+   in(0) = m; //octave_value (this->next()->module()->name());
+   in(1) = data;
+
+   octave_value_list out = OctaveCommunicator::instance()->octave_feval (datafunc_->c_str(), in, 2);
+
+   m1->release(); // the input messages are no longer needed once Octave has its copy
+
+   return GADGET_OK;
+ }
+
+// Factory boilerplate for both gadgets (GADGET_FACTORY_DECLARE is defined outside this file).
+GADGET_FACTORY_DECLARE(AcquisitionOctaveGadget)
+GADGET_FACTORY_DECLARE(ImageOctaveGadget)
+
+}
diff --git a/gadgets/octave/OctaveGadget.h b/gadgets/octave/OctaveGadget.h
new file mode 100644
index 0000000..647de7b
--- /dev/null
+++ b/gadgets/octave/OctaveGadget.h
@@ -0,0 +1,95 @@
+#pragma once 
+
+#include <octave/oct.h>
+#include <octave/octave.h>
+#include <octave/parse.h>
+#include <octave/ov-struct.h>
+
+#include "gadgetronoctave_export.h"
+#include "Gadget.h"
+#include "Gadgetron.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+
+#include "OctaveCommunicator.h"
+#include "GadgetStreamController.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <complex>
+
+namespace Gadgetron
+{
+// Common base of the Octave gadgets; T is the header type of the first message.
+template <class T> class OctaveGadget :
+public Gadgetron::Gadget2<T, hoNDArray< std::complex<float> > >
+{
+ public:
+  //GADGET_DECLARE(OctaveGadget);
+  //virtual ~OctaveGadget();
+
+ protected:
+  // Reads the gadget's string parameters, registers this gadget and its successor with
+  // the OctaveCommunicator, then runs addpath, the reference function and the config function.
+  int process_config(ACE_Message_Block* mb)
+  {
+    path_        = this->get_string_value("path");
+    reffunc_     = this->get_string_value("gadget_reference_function");
+    datafunc_    = this->get_string_value("input_function");
+    configfunc_  = this->get_string_value("config_function");
+
+    GADGET_DEBUG2("OCTAVE Ref Function    : %s\n", reffunc_.get()->c_str());
+    GADGET_DEBUG2("OCTAVE Data Function   : %s\n", datafunc_.get()->c_str());
+    GADGET_DEBUG2("OCTAVE Config Function : %s\n", configfunc_.get()->c_str());
+
+    OctaveCommunicator::instance()->register_gadget(this);
+    OctaveCommunicator::instance()->register_gadget(this->controller_->find_gadget(this->next()->module()->name()));
+
+    octave_value_list in = octave_value (path_->c_str());
+    octave_value_list out = OctaveCommunicator::instance()->octave_feval ("addpath", in, 1);
+
+    in(0) = octave_value(this->module()->name());
+    in(1) = octave_value(this->next()->module()->name());
+    out = OctaveCommunicator::instance()->octave_feval(reffunc_->c_str(), in, 2);
+    // The config function gets only the configuration text, so use a fresh one-element list.
+    octave_value_list config_in = octave_value(std::string(mb->rd_ptr(),mb->length()));
+    out = OctaveCommunicator::instance()->octave_feval(configfunc_->c_str(), config_in, 1);
+
+    return GADGET_OK;
+  }
+
+protected:
+  boost::shared_ptr<std::string> path_;
+  boost::shared_ptr<std::string> reffunc_;
+  boost::shared_ptr<std::string> datafunc_;
+  boost::shared_ptr<std::string> configfunc_;
+
+
+};
+
+
+// Octave gadget whose first message carries an ISMRMRD::AcquisitionHeader.
+class EXPORTGADGETSOCTAVE AcquisitionOctaveGadget :
+public OctaveGadget<ISMRMRD::AcquisitionHeader>
+{
+ public:
+  GADGET_DECLARE(AcquisitionOctaveGadget);
+  // Implemented in OctaveGadget.cpp: m1 is the header, m2 the complex data array.
+  int process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* m1,
+  	      GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};
+// Octave gadget whose first message carries an ISMRMRD::ImageHeader.
+class EXPORTGADGETSOCTAVE ImageOctaveGadget :
+public OctaveGadget<ISMRMRD::ImageHeader>
+{
+ public:
+  GADGET_DECLARE(ImageOctaveGadget);
+  // Implemented in OctaveGadget.cpp: m1 is the header, m2 the complex data array.
+  int process(GadgetContainerMessage<ISMRMRD::ImageHeader>* m1,
+  	      GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+};
+
+}
diff --git a/gadgets/octave/XMLGetXPath.cpp b/gadgets/octave/XMLGetXPath.cpp
new file mode 100644
index 0000000..4a2b3d1
--- /dev/null
+++ b/gadgets/octave/XMLGetXPath.cpp
@@ -0,0 +1,35 @@
+#include <octave/oct.h>
+#include <iostream>
+
+#include "pugixml.hpp"
+     
+DEFUN_DLD (XMLGetXPath, args, nargout,
+	   "XMLGetXPath: Returns the text contents of the xml node with the given XPATH")
+{
+  // Usage: text = XMLGetXPath(xml_string, xpath_string). Returns an undefined value
+  // on any failure and an empty string when the XPath matches no node.
+  octave_value retval;
+  if (args.length () != 2 || !args(0).is_string() || !args(1).is_string()) {
+    print_usage();
+    return retval;
+  }
+  const std::string xml(args(0).string_value());
+  const std::string xpath(args(1).string_value());
+
+  pugi::xml_document doc;
+  // load_buffer parses a private copy; load_buffer_inplace would write into the
+  // storage returned by c_str(), which a program is not allowed to modify.
+  pugi::xml_parse_result result = doc.load_buffer(xml.c_str(), xml.length());
+  if (!result) {
+    std::cout << "XML parsed with errors." << std::endl;
+    std::cout << "Error description: " << result.description() << std::endl;
+    return retval;
+  }
+  try {  // select_single_node throws pugi::xpath_exception on a malformed query
+    pugi::xpath_node target_node = doc.select_single_node(xpath.c_str());
+    retval = octave_value(target_node.node().child_value());
+  } catch (const pugi::xpath_exception& e) {
+    std::cout << "Invalid XPath expression: " << e.what() << std::endl;
+  }
+  return retval;
+}
diff --git a/gadgets/octave/gadgetron_octave_export.h b/gadgets/octave/gadgetron_octave_export.h
new file mode 100644
index 0000000..5ca4185
--- /dev/null
+++ b/gadgets/octave/gadgetron_octave_export.h
@@ -0,0 +1,23 @@
+/*
+ * gadgetron_octave_export.h
+ *
+ *  Created on: Jan 28, 2013
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef GADGETRONOCTAVE_EXPORT_H_
+#define GADGETRONOCTAVE_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_OCTAVE__) || defined (gadgetronoctave_EXPORTS)
+#define EXPORTGADGETSOCTAVE __declspec(dllexport)
+#else
+#define EXPORTGADGETSOCTAVE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSOCTAVE
+#endif
+
+
+#endif /* GADGETRONOCTAVE_EXPORT_H_ */
diff --git a/gadgets/octave/gadgetron_octavecommunicator_export.h b/gadgets/octave/gadgetron_octavecommunicator_export.h
new file mode 100644
index 0000000..7a04095
--- /dev/null
+++ b/gadgets/octave/gadgetron_octavecommunicator_export.h
@@ -0,0 +1,23 @@
+/*
+ * gadgetron_octavecommunicator_export.h
+ *
+ *  Created on: Jan 28, 2013
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef GADGETRONOCTAVECOMMUNICATOR_EXPORT_H_
+#define GADGETRONOCTAVECOMMUNICATOR_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_OCTAVECOMMUNICATOR__) || defined (gadgetronOctaveCommunicator_EXPORTS)
+#define EXPORTGADGETSOCTAVECOMMUNICATOR __declspec(dllexport)
+#else
+#define EXPORTGADGETSOCTAVECOMMUNICATOR __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSOCTAVECOMMUNICATOR
+#endif
+
+
+#endif /* GADGETRONOCTAVECOMMUNICATOR_EXPORT_H_ */
diff --git a/gadgets/octave/octave.xml b/gadgets/octave/octave.xml
new file mode 100644
index 0000000..5e749c6
--- /dev/null
+++ b/gadgets/octave/octave.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+  
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+    
+    <gadget>
+      <name>OctaveDownsample</name>
+      <dll>gadgetron_octave</dll>
+      <classname>AcquisitionOctaveGadget</classname>
+      <!--
+   	  <property><name>path</name>                  <value>/home/myuser/scripts/octave</value></property>
+      -->
+      <property><name>gadget_reference_function</name>    <value>gadget_reference_downsample_2x</value></property>
+      <property><name>input_function</name>               <value>downsample_2x</value></property>
+      <property><name>config_function</name>              <value>configure_downsample_2x</value></property> 
+    </gadget>
+
+    <gadget>
+      <name>OctaveAccumulate</name>
+      <dll>gadgetron_octave</dll>
+      <classname>AcquisitionOctaveGadget</classname>
+      <!--
+   	  <property><name>path</name>                  <value>/home/myuser/scripts/octave</value></property>
+      -->
+      <property><name>gadget_reference_function</name>    <value>gadget_reference_accumulator</value></property>
+      <property><name>input_function</name>               <value>accumulator</value></property>
+      <property><name>config_function</name>              <value>configure_accumulator</value></property> 
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>Acc</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AccumulatorGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>FFT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FFTGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CropCombine</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CropAndCombineGadget</classname>
+    </gadget>
+    -->
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>  
+  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/octave/octave/accumulator.m b/gadgets/octave/octave/accumulator.m
new file mode 100644
index 0000000..daca008
--- /dev/null
+++ b/gadgets/octave/octave/accumulator.m
@@ -0,0 +1,29 @@
+function accumulator(head, data)
+  global accumulator_calling_gadget;
+  global accumulator_next_gadget;
+  global accumulator_buffer;
+  global accumulator_center_line;
+  % Stores one readout in the global buffer; once bit 7 of head.flags is set, reconstructs and returns an image.
+   line_offset = bitshift(size(accumulator_buffer,2),-1) - accumulator_center_line;
+   %size(accumulator_buffer(:,head.idx.kspace_encode_step_1+line_offset+1,1,:))
+   %size(reshape(data,[size(data,1),1,1,size(data,2))])
+   %size(data)
+   % data is reshaped to (size1 x 1 x 1 x size2) so it fills one index of dimension 2 across dimension 4.
+   accumulator_buffer(:,head.idx.kspace_encode_step_1+line_offset+1,1,:) = reshape(data,size(data,1),1,1,size(data,2));
+   
+  if (bitand(head.flags, bitshift(1,7)) > 0),
+    % Transform along dimensions 1-3, then take the root-sum-of-squares over dimension 4.
+    % NOTE(review): img_head is filled in below, yet 'head' is what gets passed on -- confirm which one is intended.
+    img = ismrm_transform_kspace_to_image(accumulator_buffer,[1,2,3]);
+    img = sqrt(sum(abs(img).^2,4));
+
+    img_head = struct();
+  
+    img_head.version = head.version;
+    img_head.flags = 0;
+    img_head.measurement_uid = head.measurement_uid;
+    img_head.matrix_size = [size(img,1),size(img,2),size(img,3)];
+    img_head.channels = 1; 
+    GadgetronReturnIsmrmrdImage(accumulator_next_gadget, head, single(img));
+  end
+end
diff --git a/gadgets/octave/octave/configure_accumulator.m b/gadgets/octave/octave/configure_accumulator.m
new file mode 100644
index 0000000..8bea5b8
--- /dev/null
+++ b/gadgets/octave/octave/configure_accumulator.m
@@ -0,0 +1,18 @@
+function configure_accumulator(XMLconfig)
+   global accumulator_buffer
+   global accumulator_center_line 
+   % Parses the XML header text and sets up the accumulator globals; matrix_size = [x, y, z, receiver channels].
+   matrix_size = [str2num(XMLGetXPath(XMLconfig, '//ismrmrdHeader/encoding/reconSpace/matrixSize/x')), ...
+			 str2num(XMLGetXPath(XMLconfig, '//ismrmrdHeader/encoding/reconSpace/matrixSize/y')), ...
+			 str2num(XMLGetXPath(XMLconfig, '//ismrmrdHeader/encoding/reconSpace/matrixSize/z')), ...
+                         str2num(XMLGetXPath(XMLconfig, '//ismrmrdHeader/acquisitionSystemInformation/receiverChannels'))];
+
+   % Centre value of the kspace_encoding_step_1 encoding limits.
+   accumulator_center_line = str2num(XMLGetXPath(XMLconfig, '//ismrmrdHeader/encoding/encodingLimits/kspace_encoding_step_1/center'));
+
+   fprintf('Accumulator: Reconstructing on matrix [%d, %d, %d]\n', matrix_size(1), matrix_size(2), matrix_size(3));
+
+   % Allocate the zero-filled, single-precision accumulation buffer with dimensions matrix_size.
+   accumulator_buffer = single(zeros(matrix_size));
+   
+end
diff --git a/gadgets/octave/octave/configure_downsample_2x.m b/gadgets/octave/octave/configure_downsample_2x.m
new file mode 100644
index 0000000..688a8ca
--- /dev/null
+++ b/gadgets/octave/octave/configure_downsample_2x.m
@@ -0,0 +1,3 @@
+function configure_downsample_2x(XMLconfig) % XMLconfig is accepted but not used; only a message is printed
+ fprintf("Skipping configuration for downsampling\n");
+end
diff --git a/gadgets/octave/octave/downsample_2x.m b/gadgets/octave/octave/downsample_2x.m
new file mode 100644
index 0000000..67a8a40
--- /dev/null
+++ b/gadgets/octave/octave/downsample_2x.m
@@ -0,0 +1,11 @@
+function downsample_2x(head, data)
+  global downsample_2x_calling_gadget;
+  global downsample_2x_next_gadget;
+  % Halves dimension 1: transform to image space, keep size/2 rows starting at offset size/4, transform back.
+  data = ismrm_transform_kspace_to_image(data,1);
+  data = data([1:bitshift(size(data,1),-1)]+bitshift(size(data,1),-2),:);
+  data = ismrm_transform_image_to_kspace(data,1);
+  head.number_of_samples = size(data,1);
+  % Hand the updated header and the single-precision data to the next gadget.
+  GadgetronReturnIsmrmrdAcquisition(downsample_2x_next_gadget, head, single(data));
+end
diff --git a/gadgets/octave/octave/gadget_reference_accumulator.m b/gadgets/octave/octave/gadget_reference_accumulator.m
new file mode 100644
index 0000000..bc25de7
--- /dev/null
+++ b/gadgets/octave/octave/gadget_reference_accumulator.m
@@ -0,0 +1,7 @@
+function gadget_reference_accumulator(calling_gadget,next_gadget)
+  global accumulator_calling_gadget;
+  global accumulator_next_gadget;
+  % Store both references in globals; with no trailing ';' each assignment is also echoed to the console.
+  accumulator_calling_gadget = calling_gadget
+  accumulator_next_gadget = next_gadget
+end
diff --git a/gadgets/octave/octave/gadget_reference_downsample_2x.m b/gadgets/octave/octave/gadget_reference_downsample_2x.m
new file mode 100644
index 0000000..08ad380
--- /dev/null
+++ b/gadgets/octave/octave/gadget_reference_downsample_2x.m
@@ -0,0 +1,7 @@
+function gadget_reference_downsample_2x(calling_gadget,next_gadget)
+  global downsample_2x_calling_gadget;
+  global downsample_2x_next_gadget;
+  % Store both references in globals; with no trailing ';' each assignment is also echoed to the console.
+  downsample_2x_calling_gadget = calling_gadget
+  downsample_2x_next_gadget = next_gadget
+end
diff --git a/gadgets/octave/octave/ismrm_transform_image_to_kspace.m b/gadgets/octave/octave/ismrm_transform_image_to_kspace.m
new file mode 100644
index 0000000..2c22f78
--- /dev/null
+++ b/gadgets/octave/octave/ismrm_transform_image_to_kspace.m
@@ -0,0 +1,34 @@
+function [k] = ismrm_transform_image_to_kspace(img, dim)
+%
+%  [k] = ismrm_transform_image_to_kspace(img, dim)
+%
+%  Fourier transform from image space to k-space along a given or all 
+%  dimensions
+%
+%  INPUT:
+%    - img     [x,y,..]      : image space data
+%    - dim     vector        : Vector with dimensions to transform
+%
+%  OUTPUT:
+%    - k       [kx,ky,...]   : Data in k-space (along transformed dimensions)
+%
+%   Code made available for the ISMRM 2013 Sunrise Educational Course
+% 
+%   Michael S. Hansen (michael.hansen at nih.gov)
+%   Philip Beatty (philip.beatty at sri.utoronto.ca)
+%
+
+if nargin < 2,
+    dim = [];
+end    
+% An omitted or empty dim transforms every dimension at once.
+if isempty(dim),
+    k = fftshift(fftn(ifftshift(img))) ./ sqrt(numel(img));
+else
+   k = img;
+   for d=1:length(dim),
+      k = fftshift(fft(ifftshift(k,dim(d)),[],dim(d)),dim(d)) ./ sqrt(size(k,dim(d))); % scale by the length of the transformed dimension dim(d), not of dimension d
+   end
+end
+
+return
\ No newline at end of file
diff --git a/gadgets/octave/octave/ismrm_transform_kspace_to_image.m b/gadgets/octave/octave/ismrm_transform_kspace_to_image.m
new file mode 100644
index 0000000..a8ce058
--- /dev/null
+++ b/gadgets/octave/octave/ismrm_transform_kspace_to_image.m
@@ -0,0 +1,35 @@
+function [img] = ismrm_transform_kspace_to_image(k, dim)
+%
+%  [img] = ismrm_transform_kspace_to_image(k, dim)
+%
+%  Fourier transform from k-space to image space along a given or all 
+%  dimensions
+%
+%  INPUT:
+%    - k       [kx,ky,..]    : k-space data
+%    - dim     vector        : Vector with dimensions to transform
+%
+%  OUTPUT:
+%    - img    [x,y,...]      : Data in image space (along transformed
+%                                                   dimensions)
+%
+%   Code made available for the ISMRM 2013 Sunrise Educational Course
+% 
+%   Michael S. Hansen (michael.hansen at nih.gov)
+%   Philip Beatty (philip.beatty at sri.utoronto.ca)
+%
+
+if nargin < 2,
+    dim = [];
+end    
+% An omitted or empty dim transforms every dimension at once.
+if isempty(dim),
+    img = fftshift(ifftn(ifftshift(k))) .* sqrt(numel(k));
+else
+   img = k;
+   for d=1:length(dim),
+      img = fftshift(ifft(ifftshift(img,dim(d)),[],dim(d)),dim(d)) .* sqrt(size(img,dim(d))); % scale by the length of the transformed dimension dim(d), not of dimension d
+   end
+end
+
+return
\ No newline at end of file
diff --git a/gadgets/octave/octave/my_config_function.m b/gadgets/octave/octave/my_config_function.m
new file mode 100644
index 0000000..1a05ad7
--- /dev/null
+++ b/gadgets/octave/octave/my_config_function.m
@@ -0,0 +1,3 @@
+function my_config_function(XMLconfig) % example hook: the bare expression below displays the received value
+   XMLconfig
+end
diff --git a/gadgets/octave/octave/my_gadget_reference.m b/gadgets/octave/octave/my_gadget_reference.m
new file mode 100644
index 0000000..b1ce45e
--- /dev/null
+++ b/gadgets/octave/octave/my_gadget_reference.m
@@ -0,0 +1,7 @@
+function my_gadget_reference(calling_gadget,next_gadget)
+  global my_calling_gadget;
+  global my_next_gadget;
+  % Store both references in globals; with no trailing ';' each assignment is also echoed to the console.
+  my_calling_gadget = calling_gadget
+  my_next_gadget = next_gadget
+end
diff --git a/gadgets/octave/octave/my_recon_function.m b/gadgets/octave/octave/my_recon_function.m
new file mode 100644
index 0000000..c092a76
--- /dev/null
+++ b/gadgets/octave/octave/my_recon_function.m
@@ -0,0 +1,8 @@
+function my_recon_function(head, data) % example: weight data along dimension 1 and pass it on
+  global my_calling_gadget;
+  global my_next_gadget;
+  f = hamming(size(data,1)); % one window coefficient per row of data
+  f = repmat(f,1,size(data,2)); % repeat the window for every column
+  data = data .* f; % element-wise weighting
+  GadgetronReturnIsmrmrdAcquisition(my_next_gadget, head, data); % header is forwarded unchanged
+end
diff --git a/gadgets/octave/pugiconfig.hpp b/gadgets/octave/pugiconfig.hpp
new file mode 100644
index 0000000..c219671
--- /dev/null
+++ b/gadgets/octave/pugiconfig.hpp
@@ -0,0 +1,69 @@
+/**
+ * pugixml parser - version 1.2
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+// #include "pugixml.cpp"
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+#endif
+
+/**
+ * Copyright (c) 2006-2012 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/gadgets/octave/pugixml.cpp b/gadgets/octave/pugixml.cpp
new file mode 100644
index 0000000..4035ab1
--- /dev/null
+++ b/gadgets/octave/pugixml.cpp
@@ -0,0 +1,10250 @@
+/**
+ * pugixml parser - version 1.2
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
+ */
+
+#ifndef SOURCE_PUGIXML_CPP
+#define SOURCE_PUGIXML_CPP
+
+#include "pugixml.hpp"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <wchar.h>
+
+#ifndef PUGIXML_NO_XPATH
+#	include <math.h>
+#	include <float.h>
+#	ifdef PUGIXML_NO_EXCEPTIONS
+#		include <setjmp.h>
+#	endif
+#endif
+
+#ifndef PUGIXML_NO_STL
+#	include <istream>
+#	include <ostream>
+#	include <string>
+#endif
+
+// For placement new
+#include <new>
+
+#ifdef _MSC_VER
+#	pragma warning(push)
+#	pragma warning(disable: 4127) // conditional expression is constant
+#	pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+#	pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
+#	pragma warning(disable: 4702) // unreachable code
+#	pragma warning(disable: 4996) // this function or variable may be unsafe
+#	pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#ifdef __INTEL_COMPILER
+#	pragma warning(disable: 177) // function was declared but never referenced 
+#	pragma warning(disable: 279) // controlling expression is constant
+#	pragma warning(disable: 1478 1786) // function was declared "deprecated"
+#	pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
+#endif
+
+#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
+#	pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
+#endif
+
+#ifdef __BORLANDC__
+#	pragma option push
+#	pragma warn -8008 // condition is always false
+#	pragma warn -8066 // unreachable code
+#endif
+
+#ifdef __SNC__
+// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
+#	pragma diag_suppress=178 // function was declared but never referenced
+#	pragma diag_suppress=237 // controlling expression is constant
+#endif
+
+// Inlining controls
+#if defined(_MSC_VER) && _MSC_VER >= 1300
+#	define PUGI__NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+#	define PUGI__NO_INLINE __attribute__((noinline))
+#else
+#	define PUGI__NO_INLINE 
+#endif
+
+// Simple static assertion
+#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
+
+// Digital Mars C++ bug workaround for passing char loaded from memory via stack
+#ifdef __DMC__
+#	define PUGI__DMC_VOLATILE volatile
+#else
+#	define PUGI__DMC_VOLATILE
+#endif
+
+// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
+#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
+using std::memcpy;
+using std::memmove;
+#endif
+
+// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
+#if defined(_MSC_VER) && !defined(__S3E__)
+#	define PUGI__MSVC_CRT_VERSION _MSC_VER
+#endif
+
+#ifdef PUGIXML_HEADER_ONLY
+#	define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#	define PUGI__NS_END } }
+#	define PUGI__FN inline
+#	define PUGI__FN_NO_INLINE inline
+#else
+#	if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
+#		define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#		define PUGI__NS_END } }
+#	else
+#		define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
+#		define PUGI__NS_END } } }
+#	endif
+#	define PUGI__FN
+#	define PUGI__FN_NO_INLINE PUGI__NO_INLINE
+#endif
+
+// uintptr_t
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+#	include <stdint.h>
+#else
+#	ifndef _UINTPTR_T_DEFINED
+// No native uintptr_t in MSVC6 and in some WinCE versions
+typedef size_t uintptr_t;
+#define _UINTPTR_T_DEFINED
+#	endif
+PUGI__NS_BEGIN
+	typedef unsigned __int8 uint8_t;
+	typedef unsigned __int16 uint16_t;
+	typedef unsigned __int32 uint32_t;
+PUGI__NS_END
+#endif
+
+// Memory allocation
+PUGI__NS_BEGIN
+	PUGI__FN void* default_allocate(size_t size)
+	{
+		return malloc(size);
+	}
+
+	PUGI__FN void default_deallocate(void* ptr)
+	{
+		free(ptr);
+	}
+
+	template <typename T>
+	struct xml_memory_management_function_storage
+	{
+		static allocation_function allocate;
+		static deallocation_function deallocate;
+	};
+
+	template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+	template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
+
+	typedef xml_memory_management_function_storage<int> xml_memory;
+PUGI__NS_END
+
+// String utilities
+PUGI__NS_BEGIN
+	// Get string length
+	PUGI__FN size_t strlength(const char_t* s)
+	{
+		assert(s);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcslen(s);
+	#else
+		return strlen(s);
+	#endif
+	}
+
+	// Compare two strings
+	PUGI__FN bool strequal(const char_t* src, const char_t* dst)
+	{
+		assert(src && dst);
+
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcscmp(src, dst) == 0;
+	#else
+		return strcmp(src, dst) == 0;
+	#endif
+	}
+
+	// Returns true when lhs consists of exactly the count characters starting at rhs
+	PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
+	{
+		size_t pos = 0;
+		while (pos < count && lhs[pos] == rhs[pos]) ++pos;
+
+		// a mismatch stops the scan early; otherwise lhs has to terminate right at count
+		return pos == count && lhs[count] == 0;
+	}
+	
+#ifdef PUGIXML_WCHAR_MODE
+	// Convert string to wide string, assuming all symbols are ASCII
+	PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
+	{
+		for (const char* i = source; *i; ++i) *dest++ = *i;
+		*dest = 0;
+	}
+#endif
+PUGI__NS_END
+
+#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
+// auto_ptr-like buffer holder for exception recovery
+PUGI__NS_BEGIN
+	struct buffer_holder
+	{
+		void* data;
+		void (*deleter)(void*);
+
+		buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+		{
+		}
+
+		~buffer_holder()
+		{
+			if (data) deleter(data);
+		}
+
+		void* release()
+		{
+			void* result = data;
+			data = 0;
+			return result;
+		}
+	};
+PUGI__NS_END
+#endif
+
+PUGI__NS_BEGIN
+	static const size_t xml_memory_page_size =
+	#ifdef PUGIXML_MEMORY_PAGE_SIZE
+		PUGIXML_MEMORY_PAGE_SIZE
+	#else
+		32768
+	#endif
+		;
+
+	static const uintptr_t xml_memory_page_alignment = 32;
+	static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
+	static const uintptr_t xml_memory_page_name_allocated_mask = 16;
+	static const uintptr_t xml_memory_page_value_allocated_mask = 8;
+	static const uintptr_t xml_memory_page_type_mask = 7;
+
+	struct xml_allocator;
+
+	struct xml_memory_page
+	{
+		static xml_memory_page* construct(void* memory)
+		{
+			if (!memory) return 0; //$ redundant, left for performance
+
+			xml_memory_page* result = static_cast<xml_memory_page*>(memory);
+
+			result->allocator = 0;
+			result->memory = 0;
+			result->prev = 0;
+			result->next = 0;
+			result->busy_size = 0;
+			result->freed_size = 0;
+
+			return result;
+		}
+
+		xml_allocator* allocator;
+
+		void* memory;
+
+		xml_memory_page* prev;
+		xml_memory_page* next;
+
+		size_t busy_size;
+		size_t freed_size;
+
+		char data[1];
+	};
+
+	struct xml_memory_string_header
+	{
+		uint16_t page_offset; // offset from page->data
+		uint16_t full_size; // 0 if string occupies whole page
+	};
+
+	struct xml_allocator
+	{
+		xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
+		{
+		}
+
+		xml_memory_page* allocate_page(size_t data_size)
+		{
+			size_t size = offsetof(xml_memory_page, data) + data_size;
+
+			// allocate block with some alignment, leaving memory for worst-case padding
+			void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+			if (!memory) return 0;
+
+			// align upwards to page boundary
+			void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(memory) + (xml_memory_page_alignment - 1)) & ~(xml_memory_page_alignment - 1));
+
+			// prepare page structure
+			xml_memory_page* page = xml_memory_page::construct(page_memory);
+
+			page->memory = memory;
+			page->allocator = _root->allocator;
+
+			return page;
+		}
+
+		static void deallocate_page(xml_memory_page* page)
+		{
+			xml_memory::deallocate(page->memory);
+		}
+
+		void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
+
+		void* allocate_memory(size_t size, xml_memory_page*& out_page)
+		{
+			if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
+
+			void* buf = _root->data + _busy_size;
+
+			_busy_size += size;
+
+			out_page = _root;
+
+			return buf;
+		}
+
+		void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
+		{
+			if (page == _root) page->busy_size = _busy_size;
+
+			assert(ptr >= page->data && ptr < page->data + page->busy_size);
+			(void)!ptr;
+
+			page->freed_size += size;
+			assert(page->freed_size <= page->busy_size);
+
+			if (page->freed_size == page->busy_size)
+			{
+				if (page->next == 0)
+				{
+					assert(_root == page);
+
+					// top page freed, just reset sizes
+					page->busy_size = page->freed_size = 0;
+					_busy_size = 0;
+				}
+				else
+				{
+					assert(_root != page);
+					assert(page->prev);
+
+					// remove from the list
+					page->prev->next = page->next;
+					page->next->prev = page->prev;
+
+					// deallocate
+					deallocate_page(page);
+				}
+			}
+		}
+
+		char_t* allocate_string(size_t length)
+		{
+			// allocate memory for string and header block
+			size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
+			
+			// round size up to pointer alignment boundary
+			size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+			xml_memory_page* page;
+			xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
+
+			if (!header) return 0;
+
+			// setup header
+			ptrdiff_t page_offset = reinterpret_cast<char*>(header) - page->data;
+
+			assert(page_offset >= 0 && page_offset < (1 << 16));
+			header->page_offset = static_cast<uint16_t>(page_offset);
+
+			// full_size == 0 for large strings that occupy the whole page
+			assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
+			header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
+
+			// round-trip through void* to avoid 'cast increases required alignment of target type' warning
+			// header is guaranteed a pointer-sized alignment, which should be enough for char_t
+			return static_cast<char_t*>(static_cast<void*>(header + 1));
+		}
+
+		void deallocate_string(char_t* string)
+		{
+			// this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+			// we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
+
+			// get header
+			xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
+
+			// deallocate
+			size_t page_offset = offsetof(xml_memory_page, data) + header->page_offset;
+			xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
+
+			// if full_size == 0 then this string occupies the whole page
+			size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+
+			deallocate_memory(header, full_size, page);
+		}
+
+		xml_memory_page* _root;
+		size_t _busy_size;
+	};
+
+	PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
+	{
+		const size_t large_allocation_threshold = xml_memory_page_size / 4;
+
+		xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+		out_page = page;
+
+		if (!page) return 0;
+
+		if (size <= large_allocation_threshold)
+		{
+			_root->busy_size = _busy_size;
+
+			// insert page at the end of linked list
+			page->prev = _root;
+			_root->next = page;
+			_root = page;
+
+			_busy_size = size;
+		}
+		else
+		{
+			// insert page before the end of linked list, so that it is deleted as soon as possible
+			// the last page is not deleted even if it's empty (see deallocate_memory)
+			assert(_root->prev);
+
+			page->prev = _root->prev;
+			page->next = _root;
+
+			_root->prev->next = page;
+			_root->prev = page;
+		}
+
+		// allocate inside page
+		page->busy_size = size;
+
+		return page->data;
+	}
+PUGI__NS_END
+
+namespace pugi
+{
+	/// A 'name=value' XML attribute structure.
+	struct xml_attribute_struct
+	{
+		/// Default ctor
+		xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+		{
+		}
+
+		uintptr_t header;
+
+		char_t* name;	///< Pointer to attribute name.
+		char_t*	value;	///< Pointer to attribute value.
+
+		xml_attribute_struct* prev_attribute_c;	///< Previous attribute (cyclic list)
+		xml_attribute_struct* next_attribute;	///< Next attribute
+	};
+
+	/// An XML document tree node.
+	struct xml_node_struct
+	{
+		/// Default ctor
+		/// \param type - node type
+		xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+		{
+		}
+
+		uintptr_t header;
+
+		xml_node_struct*		parent;					///< Pointer to parent
+
+		char_t*					name;					///< Pointer to element name.
+		char_t*					value;					///< Pointer to any associated string data.
+
+		xml_node_struct*		first_child;			///< First child
+		
+		xml_node_struct*		prev_sibling_c;			///< Left brother (cyclic list)
+		xml_node_struct*		next_sibling;			///< Right brother
+		
+		xml_attribute_struct*	first_attribute;		///< First attribute
+	};
+}
+
+PUGI__NS_BEGIN
+	struct xml_document_struct: public xml_node_struct, public xml_allocator
+	{
+		xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0)
+		{
+		}
+
+		const char_t* buffer;
+	};
+
+	inline xml_allocator& get_allocator(const xml_node_struct* node)
+	{
+		assert(node);
+
+		return *reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask)->allocator;
+	}
+PUGI__NS_END
+
+// Low-level DOM operations
+PUGI__NS_BEGIN
+	inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc) // returns 0 when alloc cannot supply memory
+	{
+		xml_memory_page* page;
+		void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+		if (!memory) return 0; // never placement-construct on a null block; callers test the result for 0
+		return new (memory) xml_attribute_struct(page);
+	}
+	// Allocates and constructs a node of the given type inside alloc; returns 0 when alloc cannot supply memory.
+	inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
+	{
+		xml_memory_page* page;
+		void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+		if (!memory) return 0; // never placement-construct on a null block; callers test the result for 0
+		return new (memory) xml_node_struct(page, type);
+	}
+
+	// Releases an attribute: frees its name and value strings when the header flags mark them
+	// as separately allocated, then hands the attribute's own storage back to its page.
+	inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
+	{
+		const uintptr_t flags = a->header;
+		xml_memory_page* owner = reinterpret_cast<xml_memory_page*>(flags & xml_memory_page_pointer_mask);
+
+		if (flags & impl::xml_memory_page_name_allocated_mask)
+			alloc.deallocate_string(a->name);
+
+		if (flags & impl::xml_memory_page_value_allocated_mask)
+			alloc.deallocate_string(a->value);
+
+		alloc.deallocate_memory(a, sizeof(xml_attribute_struct), owner);
+	}
+
+	// Recursively releases a node: its separately allocated name/value strings, every attribute,
+	// every child subtree, and finally the node's own storage.
+	inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
+	{
+		// snapshot the header before anything is released
+		const uintptr_t flags = n->header;
+
+		if (flags & impl::xml_memory_page_name_allocated_mask)
+			alloc.deallocate_string(n->name);
+
+		if (flags & impl::xml_memory_page_value_allocated_mask)
+			alloc.deallocate_string(n->value);
+
+		// attributes: fetch the successor first, the current one is gone after destruction
+		xml_attribute_struct* attribute = n->first_attribute;
+
+		while (attribute)
+		{
+			xml_attribute_struct* following = attribute->next_attribute;
+
+			destroy_attribute(attribute, alloc);
+			attribute = following;
+		}
+
+		// children: same pattern, recursing into each subtree
+		xml_node_struct* kid = n->first_child;
+
+		while (kid)
+		{
+			xml_node_struct* following = kid->next_sibling;
+
+			destroy_node(kid, alloc);
+			kid = following;
+		}
+
+		alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(flags & xml_memory_page_pointer_mask));
+	}
+
+	// Allocates a node of the given type and links it as the last child of node.
+	// Returns the new child, or 0 if allocation failed.
+	PUGI__FN_NO_INLINE xml_node_struct* append_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
+	{
+		xml_node_struct* child = allocate_node(alloc, type);
+		if (!child) return 0;
+
+		child->parent = node;
+
+		xml_node_struct* head = node->first_child;
+
+		// no children yet: the new node becomes the head and is its own cyclic predecessor
+		if (!head)
+		{
+			node->first_child = child;
+			child->prev_sibling_c = child;
+
+			return child;
+		}
+
+		// the head's cyclic predecessor is the current tail; splice the new node in after it
+		xml_node_struct* tail = head->prev_sibling_c;
+
+		tail->next_sibling = child;
+		child->prev_sibling_c = tail;
+		head->prev_sibling_c = child;
+
+		return child;
+	}
+
+	// Allocates an attribute and links it as the last attribute of node.
+	// Returns the new attribute, or 0 if allocation failed.
+	PUGI__FN_NO_INLINE xml_attribute_struct* append_attribute_ll(xml_node_struct* node, xml_allocator& alloc)
+	{
+		xml_attribute_struct* a = allocate_attribute(alloc);
+		if (!a) return 0;
+
+		xml_attribute_struct* head = node->first_attribute;
+
+		// empty list: the new attribute becomes the head and is its own cyclic predecessor
+		if (!head)
+		{
+			node->first_attribute = a;
+			a->prev_attribute_c = a;
+
+			return a;
+		}
+
+		// the head's cyclic predecessor is the current tail; splice the new attribute in after it
+		xml_attribute_struct* tail = head->prev_attribute_c;
+
+		tail->next_attribute = a;
+		a->prev_attribute_c = tail;
+		head->prev_attribute_c = a;
+
+		return a;
+	}
+PUGI__NS_END
+
+// Helper classes for code generation
+PUGI__NS_BEGIN
+	// Compile-time "false" tag type: exposes value == 0 so templates can branch on
+	// an option (e.g. opt_swap::value) without a runtime parameter.
+	struct opt_false
+	{
+		enum { value = 0 };
+	};
+
+	// Compile-time "true" tag type: exposes value == 1 so templates can branch on
+	// an option (e.g. opt_swap::value) without a runtime parameter.
+	struct opt_true
+	{
+		enum { value = 1 };
+	};
+PUGI__NS_END
+
+// Unicode utilities
+PUGI__NS_BEGIN
+	// Returns value with its two bytes exchanged.
+	inline uint16_t endian_swap(uint16_t value)
+	{
+		const unsigned int low_byte = value & 0xff;
+		const unsigned int high_byte = value >> 8;
+
+		return static_cast<uint16_t>((low_byte << 8) | high_byte);
+	}
+
+	// Returns value with its four bytes in reverse order.
+	inline uint32_t endian_swap(uint32_t value)
+	{
+		const uint32_t byte0 = value & 0xff;
+		const uint32_t byte1 = value & 0xff00;
+		const uint32_t byte2 = value & 0xff0000;
+		const uint32_t byte3 = value >> 24;
+
+		return (byte0 << 24) | (byte1 << 8) | (byte2 >> 8) | byte3;
+	}
+
+	// Decoder sink that only counts how many UTF-8 code units each code point would occupy.
+	struct utf8_counter
+	{
+		typedef size_t value_type;
+
+		// Code points below U+10000: 1, 2 or 3 units depending on magnitude.
+		static value_type low(value_type result, uint32_t ch)
+		{
+			// U+0000..U+007F -> 1, U+0080..U+07FF -> 2, U+0800..U+FFFF -> 3
+			const value_type units = (ch < 0x80) ? 1 : (ch < 0x800) ? 2 : 3;
+
+			return result + units;
+		}
+
+		// Code points U+10000..U+10FFFF always take 4 units.
+		static value_type high(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			return result + 4;
+		}
+	};
+
+	// Decoder sink that encodes code points as UTF-8 into a byte buffer.
+	// Every function returns the position just past the bytes it wrote.
+	struct utf8_writer
+	{
+		typedef uint8_t* value_type;
+
+		// Code points below U+10000: emits 1, 2 or 3 bytes.
+		static value_type low(value_type result, uint32_t ch)
+		{
+			// U+0000..U+007F
+			if (ch < 0x80)
+			{
+				*result++ = static_cast<uint8_t>(ch);
+
+				return result;
+			}
+
+			// U+0080..U+07FF
+			if (ch < 0x800)
+			{
+				*result++ = static_cast<uint8_t>(0xC0 | (ch >> 6));
+				*result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+
+				return result;
+			}
+
+			// U+0800..U+FFFF
+			*result++ = static_cast<uint8_t>(0xE0 | (ch >> 12));
+			*result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+			*result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+
+			return result;
+		}
+
+		// Code points U+10000..U+10FFFF: emits 4 bytes.
+		static value_type high(value_type result, uint32_t ch)
+		{
+			*result++ = static_cast<uint8_t>(0xF0 | (ch >> 18));
+			*result++ = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
+			*result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+			*result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+
+			return result;
+		}
+
+		// Dispatches to low() or high() based on the code point.
+		static value_type any(value_type result, uint32_t ch)
+		{
+			if (ch < 0x10000) return low(result, ch);
+
+			return high(result, ch);
+		}
+	};
+
+	// Decoder sink that only counts how many UTF-16 code units each code point would occupy.
+	struct utf16_counter
+	{
+		typedef size_t value_type;
+
+		// Code points below U+10000 take one unit.
+		static value_type low(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			return result + 1;
+		}
+
+		// Code points from U+10000 up take two units (a surrogate pair).
+		static value_type high(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			return result + 2;
+		}
+	};
+
+	// Decoder sink that encodes code points as UTF-16 into a buffer of 16-bit units.
+	// Every function returns the position just past the units it wrote.
+	struct utf16_writer
+	{
+		typedef uint16_t* value_type;
+
+		// Code points below U+10000: stored as a single unit.
+		static value_type low(value_type result, uint32_t ch)
+		{
+			*result++ = static_cast<uint16_t>(ch);
+
+			return result;
+		}
+
+		// Code points from U+10000 up: stored as a high/low surrogate pair.
+		static value_type high(value_type result, uint32_t ch)
+		{
+			const uint32_t offset = static_cast<uint32_t>(ch - 0x10000);
+
+			*result++ = static_cast<uint16_t>(0xD800 + (offset >> 10));
+			*result++ = static_cast<uint16_t>(0xDC00 + (offset & 0x3ff));
+
+			return result;
+		}
+
+		// Dispatches to low() or high() based on the code point.
+		static value_type any(value_type result, uint32_t ch)
+		{
+			if (ch < 0x10000) return low(result, ch);
+
+			return high(result, ch);
+		}
+	};
+
+	// Decoder sink that only counts UTF-32 code units: one per code point, whatever its value.
+	struct utf32_counter
+	{
+		typedef size_t value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			return result + 1;
+		}
+
+		static value_type high(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			return result + 1;
+		}
+	};
+
+	// Decoder sink that stores each code point unchanged as one 32-bit unit.
+	// Every function returns the position just past the unit it wrote.
+	struct utf32_writer
+	{
+		typedef uint32_t* value_type;
+
+		static value_type low(value_type result, uint32_t ch)
+		{
+			*result++ = ch;
+
+			return result;
+		}
+
+		static value_type high(value_type result, uint32_t ch)
+		{
+			*result++ = ch;
+
+			return result;
+		}
+
+		static value_type any(value_type result, uint32_t ch)
+		{
+			*result++ = ch;
+
+			return result;
+		}
+	};
+
+	// Decoder sink that writes one Latin-1 byte per code point, substituting '?' for
+	// anything above 255. Every function returns the position just past the byte it wrote.
+	struct latin1_writer
+	{
+		typedef uint8_t* value_type;
+
+		// Code points below U+10000: kept if they fit into a byte, otherwise replaced.
+		static value_type low(value_type result, uint32_t ch)
+		{
+			if (ch > 255) ch = '?';
+
+			*result++ = static_cast<uint8_t>(ch);
+
+			return result;
+		}
+
+		// Code points from U+10000 up never fit: always replaced.
+		static value_type high(value_type result, uint32_t ch)
+		{
+			(void)ch;
+
+			*result++ = '?';
+
+			return result;
+		}
+	};
+
+	// Maps sizeof(wchar_t) to the matching fixed-width unit type and the counter/writer
+	// pair used for wide strings. Only sizes 2 and 4 are specialized; any other size
+	// leaves the primary template undefined and fails to compile.
+	template <size_t size> struct wchar_selector;
+
+	// 2-byte wchar_t: treated as UTF-16
+	template <> struct wchar_selector<2>
+	{
+		typedef uint16_t type;
+		typedef utf16_counter counter;
+		typedef utf16_writer writer;
+	};
+
+	// 4-byte wchar_t: treated as UTF-32
+	template <> struct wchar_selector<4>
+	{
+		typedef uint32_t type;
+		typedef utf32_counter counter;
+		typedef utf32_writer writer;
+	};
+
+	// counter/writer for the platform's wchar_t width
+	typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
+	typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
+
+	// Generic decoder that walks an encoded input block and feeds every decoded code point to
+	// a sink described by Traits: Traits::low receives code points below U+10000, Traits::high
+	// receives the rest, and each call returns the updated accumulator (a count or a write
+	// position, depending on the sink). opt_swap::value selects whether 16/32-bit input units
+	// are byte-swapped before being interpreted. Malformed input units are skipped silently.
+	template <typename Traits, typename opt_swap = opt_false> struct utf_decoder
+	{
+		// Decodes size bytes of UTF-8 starting at data; returns the final accumulator.
+		static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+		{
+			// payload bits of a continuation byte
+			const uint8_t utf8_byte_mask = 0x3f;
+
+			while (size)
+			{
+				uint8_t lead = *data;
+
+				// 0xxxxxxx -> U+0000..U+007F
+				if (lead < 0x80)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+					size -= 1;
+
+					// process aligned single-byte (ascii) blocks
+					if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
+					{
+						// round-trip through void* to silence 'cast increases required alignment of target type' warnings
+						// a zero result after masking with 0x80808080 means none of the 4 bytes has its high bit set
+						while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
+						{
+							result = Traits::low(result, data[0]);
+							result = Traits::low(result, data[1]);
+							result = Traits::low(result, data[2]);
+							result = Traits::low(result, data[3]);
+							data += 4;
+							size -= 4;
+						}
+					}
+				}
+				// 110xxxxx -> U+0080..U+07FF
+				// (requires one continuation byte to be present and well-formed)
+				else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
+				{
+					result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
+					data += 2;
+					size -= 2;
+				}
+				// 1110xxxx -> U+0800-U+FFFF
+				// (requires two continuation bytes)
+				else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
+				{
+					result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
+					data += 3;
+					size -= 3;
+				}
+				// 11110xxx -> U+10000..U+10FFFF
+				// (requires three continuation bytes)
+				else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
+				{
+					result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
+					data += 4;
+					size -= 4;
+				}
+				// 10xxxxxx or 11111xxx -> invalid
+				// (also reached for truncated or malformed multi-byte sequences; the byte is dropped)
+				else
+				{
+					data += 1;
+					size -= 1;
+				}
+			}
+
+			return result;
+		}
+
+		// Decodes size 16-bit units of UTF-16 starting at data; returns the final accumulator.
+		static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
+		{
+			const uint16_t* end = data + size;
+
+			while (data < end)
+			{
+				uint16_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+				// U+0000..U+D7FF
+				if (lead < 0xD800)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+				}
+				// U+E000..U+FFFF
+				else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+				}
+				// surrogate pair lead
+				// (only considered when a following unit exists inside the block)
+				else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
+				{
+					uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
+
+					if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
+					{
+						result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
+						data += 2;
+					}
+					else
+					{
+						// lead surrogate without a trail surrogate: drop the lead only
+						data += 1;
+					}
+				}
+				else
+				{
+					// stray trail surrogate, or a lead surrogate at the very end of the block: drop it
+					data += 1;
+				}
+			}
+
+			return result;
+		}
+
+		// Decodes size 32-bit units of UTF-32 starting at data; returns the final accumulator.
+		// Values are forwarded as-is; no range validation is performed here.
+		static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
+		{
+			const uint32_t* end = data + size;
+
+			while (data < end)
+			{
+				uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+				// U+0000..U+FFFF
+				if (lead < 0x10000)
+				{
+					result = Traits::low(result, lead);
+					data += 1;
+				}
+				// U+10000..U+10FFFF
+				else
+				{
+					result = Traits::high(result, lead);
+					data += 1;
+				}
+			}
+
+			return result;
+		}
+
+		// Decodes size bytes of Latin-1: every byte is forwarded to Traits::low as its own code point.
+		static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+		{
+			for (size_t i = 0; i < size; ++i)
+			{
+				result = Traits::low(result, data[i]);
+			}
+
+			return result;
+		}
+
+		// Overload chosen when wchar_t is 2 bytes wide: input is decoded as UTF-16.
+		static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
+		{
+			return decode_utf16_block(data, size, result);
+		}
+
+		// Overload chosen when wchar_t is 4 bytes wide: input is decoded as UTF-32.
+		static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
+		{
+			return decode_utf32_block(data, size, result);
+		}
+
+		// Decodes size wchar_t units by reinterpreting them as the fixed-width unit type of
+		// matching size and dispatching to the corresponding overload above.
+		static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
+		{
+			return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result);
+		}
+	};
+
+	// Writes the byte-swapped form of each of the first length elements of data into result.
+	template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length)
+	{
+		size_t index = 0;
+
+		while (index < length)
+		{
+			result[index] = endian_swap(data[index]);
+			++index;
+		}
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	// Writes the byte-swapped form of each of the first length wide characters of data into
+	// result, swapping through the fixed-width unit type that matches sizeof(wchar_t).
+	PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
+	{
+		size_t index = 0;
+
+		while (index < length)
+		{
+			result[index] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[index])));
+			++index;
+		}
+	}
+#endif
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+	// Bit flags stored per character in chartype_table. Each flag marks the characters
+	// listed in its trailing comment; flags are tested with PUGI__IS_CHARTYPE.
+	enum chartype_t
+	{
+		ct_parse_pcdata = 1,	// \0, &, \r, <
+		ct_parse_attr = 2,		// \0, &, \r, ', "
+		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, tab
+		ct_space = 8,			// \r, \n, space, tab
+		ct_parse_cdata = 16,	// \0, ], >, \r
+		ct_parse_comment = 32,	// \0, -, >, \r
+		ct_symbol = 64,			// Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+		ct_start_symbol = 128	// Any symbol > 127, a-z, A-Z, _, :
+	};
+
+	// Lookup table indexed by character code (0..255); each entry is the OR of the
+	// chartype_t flags that apply to that character. All entries from 128 up carry
+	// ct_symbol | ct_start_symbol (192).
+	static const unsigned char chartype_table[256] =
+	{
+		55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
+		0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
+		8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
+		64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
+		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
+		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 96-111
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0, 0, 0, 0, 0,           // 112-127
+
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 128+
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192
+	};
+
+	// Bit flags stored per character in chartypex_table. Each flag marks the characters
+	// listed in its trailing comment; flags are tested with PUGI__IS_CHARTYPEX.
+	enum chartypex_t
+	{
+		ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
+		ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
+		ctx_start_symbol = 4,	  // Any symbol > 127, a-z, A-Z, _
+		ctx_digit = 8,			  // 0-9
+		ctx_symbol = 16			  // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
+	};
+	
+	// Lookup table indexed by character code (0..255); each entry is the OR of the
+	// chartypex_t flags that apply to that character. All entries from 128 up carry
+	// ctx_start_symbol | ctx_symbol (20).
+	static const unsigned char chartypex_table[256] =
+	{
+		3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
+		3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
+		0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
+		24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63
+
+		0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
+		0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127
+
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
+	};
+	
+// Character classification: yields a non-zero value when character c has flag(s) ct set in table.
+// In wchar mode every character code of 128 or above is classified using table entry 128;
+// otherwise the character is converted to unsigned char and used as the index directly.
+#ifdef PUGIXML_WCHAR_MODE
+	#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
+#else
+	#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
+#endif
+
+	// Shorthands bound to the two classification tables (chartype_t / chartypex_t flags respectively)
+	#define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
+	#define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
+
+	// Reports whether the lowest-addressed byte of an unsigned int holding 1 is itself 1,
+	// i.e. whether the machine stores the least significant byte first.
+	PUGI__FN bool is_little_endian()
+	{
+		unsigned int probe = 1;
+		const unsigned char* first_byte = reinterpret_cast<unsigned char*>(&probe);
+
+		return *first_byte == 1;
+	}
+
+	// Returns the concrete encoding wchar_t strings use on this platform:
+	// UTF-16 for 2-byte wchar_t, UTF-32 for 4-byte wchar_t, in the machine's byte order.
+	PUGI__FN xml_encoding get_wchar_encoding()
+	{
+		PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+
+		const bool little = is_little_endian();
+
+		if (sizeof(wchar_t) == 2)
+			return little ? encoding_utf16_le : encoding_utf16_be;
+
+		return little ? encoding_utf32_le : encoding_utf32_be;
+	}
+
+	// Guesses the encoding of a document from its first four bytes (d0 is the first byte).
+	// Checks, in order: byte order marks, the byte patterns of "<", "<?" or "<?xm" in the
+	// various encodings, then a bare UTF-16 "<". Falls back to UTF-8 when nothing matches.
+	PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
+	{
+		// pack the bytes in stream order so each signature becomes a single comparison
+		const uint32_t quad = (static_cast<uint32_t>(d0) << 24) | (static_cast<uint32_t>(d1) << 16) | (static_cast<uint32_t>(d2) << 8) | static_cast<uint32_t>(d3);
+		const uint32_t pair = quad >> 16;	// d0 d1
+		const uint32_t triple = quad >> 8;	// d0 d1 d2
+
+		// byte order marks
+		if (quad == 0x0000feffu) return encoding_utf32_be;
+		if (quad == 0xfffe0000u) return encoding_utf32_le;
+		if (pair == 0xfeffu) return encoding_utf16_be;
+		if (pair == 0xfffeu) return encoding_utf16_le;
+		if (triple == 0xefbbbfu) return encoding_utf8;
+
+		// <, <? or <?xm in various encodings
+		if (quad == 0x0000003cu) return encoding_utf32_be;
+		if (quad == 0x3c000000u) return encoding_utf32_le;
+		if (quad == 0x003c003fu) return encoding_utf16_be;
+		if (quad == 0x3c003f00u) return encoding_utf16_le;
+		if (quad == 0x3c3f786du) return encoding_utf8;
+
+		// utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
+		if (pair == 0x003cu) return encoding_utf16_be;
+		if (pair == 0x3c00u) return encoding_utf16_le;
+
+		// nothing recognised: assume utf8
+		return encoding_utf8;
+	}
+
+	// Resolves a requested encoding to a concrete one for the given buffer:
+	// wchar / utf16 / utf32 become their endian-specific variants, any other explicit
+	// encoding is returned unchanged, and encoding_auto is detected from the first four
+	// bytes of contents (UTF-8 is assumed when fewer than four bytes are available).
+	PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
+	{
+		switch (encoding)
+		{
+		case encoding_wchar:
+			// replace wchar encoding with utf implementation
+			return get_wchar_encoding();
+
+		case encoding_utf16:
+			// pick the utf16 variant with native endianness
+			return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+		case encoding_utf32:
+			// pick the utf32 variant with native endianness
+			return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+		case encoding_auto:
+			// autodetection is handled below
+			break;
+
+		default:
+			// an explicit encoding was requested: honour it
+			return encoding;
+		}
+
+		// too little data to inspect
+		if (size < 4) return encoding_utf8;
+
+		// try to guess encoding (based on XML specification, Appendix F.1)
+		const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+		PUGI__DMC_VOLATILE uint8_t d0 = data[0];
+		PUGI__DMC_VOLATILE uint8_t d1 = data[1];
+		PUGI__DMC_VOLATILE uint8_t d2 = data[2];
+		PUGI__DMC_VOLATILE uint8_t d3 = data[3];
+
+		return guess_buffer_encoding(d0, d1, d2, d3);
+	}
+
+	// Produces a writable char_t buffer holding the same bytes as contents.
+	// When is_mutable is true the caller's memory is used directly; otherwise a private copy
+	// is allocated through xml_memory (at least one byte, so empty input still yields a buffer).
+	// out_buffer receives the buffer, out_length its length in char_t units.
+	// Returns false (leaving the outputs untouched) if the copy could not be allocated.
+	PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+	{
+		if (is_mutable)
+		{
+			out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
+		}
+		else
+		{
+			void* buffer = xml_memory::allocate(size > 0 ? size : 1);
+			if (!buffer) return false;
+
+			// contents may be null for an empty buffer, and memcpy requires valid pointers
+			// even when asked to copy zero bytes - so skip the call when there is nothing to copy
+			if (size > 0) memcpy(buffer, contents, size);
+
+			out_buffer = static_cast<char_t*>(buffer);
+		}
+
+		out_length = size / sizeof(char_t);
+
+		return true;
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	// True when le and re are the same UTF width (16 or 32) but opposite byte orders,
+	// i.e. when converting between them needs nothing more than a byte swap.
+	PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
+	{
+		const bool utf16_flip = (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be);
+		const bool utf32_flip = (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
+
+		return utf16_flip || utf32_flip;
+	}
+
+	// Produces a byte-swapped char_t buffer from contents. A mutable input is swapped in
+	// place; otherwise the swapped data is written to a newly allocated buffer.
+	// out_length receives the length in char_t units. Returns false if allocation failed.
+	PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+	{
+		const char_t* source = static_cast<const char_t*>(contents);
+
+		if (!is_mutable)
+		{
+			// never request a zero-sized allocation
+			out_buffer = static_cast<char_t*>(xml_memory::allocate(size > 0 ? size : 1));
+			if (!out_buffer) return false;
+		}
+		else
+		{
+			out_buffer = const_cast<char_t*>(source);
+		}
+
+		out_length = size / sizeof(char_t);
+
+		convert_wchar_endian_swap(out_buffer, source, out_length);
+
+		return true;
+	}
+
+	// Converts size bytes of UTF-8 in contents into a newly allocated wchar_t buffer.
+	// out_length receives the length in wchar_t units. Returns false if allocation failed.
+	PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+	{
+		const uint8_t* input = static_cast<const uint8_t*>(contents);
+
+		// measuring pass: how many wchar_t units will the output need?
+		out_length = utf_decoder<wchar_counter>::decode_utf8_block(input, size, 0);
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// writing pass: decode again, this time storing the result
+		wchar_writer::value_type first = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+		wchar_writer::value_type last = utf_decoder<wchar_writer>::decode_utf8_block(input, size, first);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// Converts UTF-16 data in contents (size is in bytes) into a newly allocated wchar_t
+	// buffer; opt_swap selects whether input units are byte-swapped while decoding.
+	// out_length receives the length in wchar_t units. Returns false if allocation failed.
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint16_t* input = static_cast<const uint16_t*>(contents);
+		const size_t input_units = size / sizeof(uint16_t);
+
+		// measuring pass: how many wchar_t units will the output need?
+		out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(input, input_units, 0);
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// writing pass: decode again, this time storing the result
+		wchar_writer::value_type first = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+		wchar_writer::value_type last = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(input, input_units, first);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// Converts UTF-32 data in contents (size is in bytes) into a newly allocated wchar_t
+	// buffer; opt_swap selects whether input units are byte-swapped while decoding.
+	// out_length receives the length in wchar_t units. Returns false if allocation failed.
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint32_t* input = static_cast<const uint32_t*>(contents);
+		const size_t input_units = size / sizeof(uint32_t);
+
+		// measuring pass: how many wchar_t units will the output need?
+		out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(input, input_units, 0);
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// writing pass: decode again, this time storing the result
+		wchar_writer::value_type first = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+		wchar_writer::value_type last = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(input, input_units, first);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// Converts size bytes of Latin-1 in contents into a newly allocated wchar_t buffer,
+	// one output unit per input byte.
+	// out_length receives the length in wchar_t units. Returns false if allocation failed.
+	PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+	{
+		const uint8_t* input = static_cast<const uint8_t*>(contents);
+
+		// each latin1 byte maps to exactly one wchar_t unit
+		out_length = size;
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// widen every byte into the output
+		wchar_writer::value_type first = reinterpret_cast<wchar_writer::value_type>(out_buffer);
+		wchar_writer::value_type last = utf_decoder<wchar_writer>::decode_latin1_block(input, size, first);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// Wide-character build: turns contents (size bytes in the given concrete encoding) into
+	// a wchar_t buffer. Data already in the platform's wchar_t encoding is passed through,
+	// data that differs only in byte order is swapped, everything else is decoded into a
+	// newly allocated buffer. Returns false on allocation failure or an unsupported encoding.
+	PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+	{
+		// encoding used by wchar_t strings on this platform
+		const xml_encoding wchar_encoding = get_wchar_encoding();
+
+		// fast path: already in the target encoding
+		if (encoding == wchar_encoding)
+			return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+		// same width, opposite byte order: a swap is all that is needed
+		if (need_endian_swap_utf(encoding, wchar_encoding))
+			return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+
+		switch (encoding)
+		{
+		case encoding_utf8:
+			return convert_buffer_utf8(out_buffer, out_length, contents, size);
+
+		case encoding_utf16_be:
+		case encoding_utf16_le:
+		{
+			const xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			// swap input units only when the source byte order is not the native one
+			if (native_encoding == encoding)
+				return convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false());
+
+			return convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		case encoding_utf32_be:
+		case encoding_utf32_le:
+		{
+			const xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			// swap input units only when the source byte order is not the native one
+			if (native_encoding == encoding)
+				return convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false());
+
+			return convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		case encoding_latin1:
+			return convert_buffer_latin1(out_buffer, out_length, contents, size);
+
+		default:
+			assert(!"Invalid encoding");
+			return false;
+		}
+	}
+#else
+	// Converts UTF-16 data in contents (size is in bytes) into a newly allocated UTF-8
+	// buffer; opt_swap selects whether input units are byte-swapped while decoding.
+	// out_length receives the length in UTF-8 units. Returns false if allocation failed.
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint16_t* input = static_cast<const uint16_t*>(contents);
+		const size_t input_units = size / sizeof(uint16_t);
+
+		// measuring pass: how many utf8 units will the output need?
+		out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(input, input_units, 0);
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// writing pass: decode again, this time storing the result
+		uint8_t* first = reinterpret_cast<uint8_t*>(out_buffer);
+		uint8_t* last = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(input, input_units, first);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// Converts UTF-32 data in contents (size is in bytes) into a newly allocated UTF-8
+	// buffer; opt_swap selects whether input units are byte-swapped while decoding.
+	// out_length receives the length in UTF-8 units. Returns false if allocation failed.
+	template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+	{
+		const uint32_t* input = static_cast<const uint32_t*>(contents);
+		const size_t input_units = size / sizeof(uint32_t);
+
+		// measuring pass: how many utf8 units will the output need?
+		out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(input, input_units, 0);
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// writing pass: decode again, this time storing the result
+		uint8_t* first = reinterpret_cast<uint8_t*>(out_buffer);
+		uint8_t* last = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(input, input_units, first);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// Returns the number of leading bytes in data[0..size) whose value does not exceed 127;
+	// equals size when every byte is 7-bit.
+	PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
+	{
+		size_t prefix = 0;
+
+		while (prefix < size && data[prefix] <= 127)
+			++prefix;
+
+		return prefix;
+	}
+
+	// Converts size bytes of Latin-1 in contents to UTF-8. Input consisting solely of 7-bit
+	// bytes needs no conversion and is passed through get_mutable_buffer; otherwise the
+	// 7-bit prefix is copied verbatim and the remainder is re-encoded into a new buffer.
+	// out_length receives the length in UTF-8 units. Returns false if allocation failed.
+	PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+	{
+		const uint8_t* input = static_cast<const uint8_t*>(contents);
+
+		// leading run of bytes that are identical in latin1 and utf8
+		const size_t ascii_length = get_latin1_7bit_prefix_length(input, size);
+		assert(ascii_length <= size);
+
+		const uint8_t* tail = input + ascii_length;
+		const size_t tail_length = size - ascii_length;
+
+		// pure 7-bit input: hand over the original data
+		if (tail_length == 0)
+			return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+		// measuring pass: prefix bytes plus the encoded size of the remainder
+		out_length = ascii_length + utf_decoder<utf8_counter>::decode_latin1_block(tail, tail_length, 0);
+
+		// never request a zero-sized allocation
+		const size_t units = out_length > 0 ? out_length : 1;
+
+		out_buffer = static_cast<char_t*>(xml_memory::allocate(units * sizeof(char_t)));
+		if (!out_buffer) return false;
+
+		// writing pass: copy the prefix as is, then encode the remainder right after it
+		memcpy(out_buffer, input, ascii_length);
+
+		uint8_t* first = reinterpret_cast<uint8_t*>(out_buffer);
+		uint8_t* last = utf_decoder<utf8_writer>::decode_latin1_block(tail, tail_length, first + ascii_length);
+
+		assert(last == first + out_length);
+		(void)!last;
+
+		return true;
+	}
+
+	// UTF-8 build: turns contents (size bytes in the given concrete encoding) into a UTF-8
+	// buffer. UTF-8 input is passed through without conversion; UTF-16, UTF-32 and Latin-1
+	// input is converted. Returns false on allocation failure or an unsupported encoding.
+	PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+	{
+		switch (encoding)
+		{
+		case encoding_utf8:
+			// fast path: already in the target encoding
+			return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+		case encoding_utf16_be:
+		case encoding_utf16_le:
+		{
+			const xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			// swap input units only when the source byte order is not the native one
+			if (native_encoding == encoding)
+				return convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false());
+
+			return convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		case encoding_utf32_be:
+		case encoding_utf32_le:
+		{
+			const xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			// swap input units only when the source byte order is not the native one
+			if (native_encoding == encoding)
+				return convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false());
+
+			return convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+		}
+
+		case encoding_latin1:
+			return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+
+		default:
+			assert(!"Invalid encoding");
+			return false;
+		}
+	}
+#endif
+
+	// First half of a wide-to-UTF-8 conversion: returns how many UTF-8 units the
+	// length wide characters at str will occupy once encoded.
+	PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
+	{
+		const size_t utf8_units = utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
+
+		return utf8_units;
+	}
+
+	// Second half of a wide-to-UTF-8 conversion: encodes the length wide characters at str
+	// into buffer and zero-terminates it. size must be the value as_utf8_begin returned for
+	// the same input, and buffer must have room for size + 1 chars.
+	PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
+	{
+		uint8_t* first = reinterpret_cast<uint8_t*>(buffer);
+		uint8_t* last = utf_decoder<utf8_writer>::decode_wchar_block(str, length, first);
+
+		// the writing pass must produce exactly as many units as were measured
+		assert(first + size == last);
+		(void)!last;
+
+		// terminator goes right after the encoded data
+		buffer[size] = 0;
+	}
+	
+#ifndef PUGIXML_NO_STL
+	// Converts length wide characters at str into a UTF-8 std::string.
+	PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
+	{
+		// measuring pass
+		const size_t size = as_utf8_begin(str, length);
+
+		std::string result;
+		result.resize(size);
+
+		// nothing to encode
+		if (size == 0) return result;
+
+		// writing pass, directly into the string's storage
+		as_utf8_end(&result[0], size, str, length);
+
+		return result;
+	}
+
+	// Converts size bytes of UTF-8 at str into a wide string.
+	PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
+	{
+		const uint8_t* input = reinterpret_cast<const uint8_t*>(str);
+
+		// measuring pass: number of wchar_t units required
+		const size_t length = utf_decoder<wchar_counter>::decode_utf8_block(input, size, 0);
+
+		std::basic_string<wchar_t> result;
+		result.resize(length);
+
+		// nothing to decode
+		if (length == 0) return result;
+
+		// writing pass, directly into the string's storage
+		wchar_writer::value_type first = reinterpret_cast<wchar_writer::value_type>(&result[0]);
+		wchar_writer::value_type last = utf_decoder<wchar_writer>::decode_utf8_block(input, size, first);
+
+		assert(first + length == last);
+		(void)!last;
+
+		return result;
+	}
+#endif
+
+	// Decides whether a string of length characters may overwrite target in place.
+	// It must fit in the space the current string occupies. When allocated is zero that is
+	// the only condition; otherwise reuse is additionally refused if the current string is
+	// 32 characters or longer and at least half of it would be left unused.
+	inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
+	{
+		assert(target);
+
+		const size_t capacity = strlength(target);
+
+		// the new contents must fit in every case
+		if (capacity < length) return false;
+
+		// always reuse document buffer memory if possible
+		if (!allocated) return true;
+
+		// reuse heap memory if waste is not too great
+		const size_t reuse_threshold = 32;
+		const size_t waste = capacity - length;
+
+		return capacity < reuse_threshold || waste < capacity / 2;
+	}
+
+	// Assigns `source` to the string slot `dest`, reusing the current storage when allowed.
+	// The bit `header_mask` inside `header` records whether dest is allocator-owned; it is
+	// updated to match the new state. Returns false only when a new buffer cannot be allocated.
+	PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+	{
+		const size_t source_length = strlength(source);
+
+		// Case 1: empty string and null pointer are equivalent, so just release the old memory
+		if (source_length == 0)
+		{
+			xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+			if (header & header_mask) alloc->deallocate_string(dest);
+
+			dest = 0;
+			header &= ~header_mask; // the string is no longer allocated
+
+			return true;
+		}
+
+		// number of bytes to copy, including the zero terminator
+		const size_t byte_count = (source_length + 1) * sizeof(char_t);
+
+		// Case 2: the old buffer can be reused, so overwrite it
+		if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
+		{
+			memcpy(dest, source, byte_count);
+
+			return true;
+		}
+
+		// Case 3: a fresh buffer is needed
+		xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+		char_t* buf = alloc->allocate_string(source_length + 1);
+		if (!buf) return false;
+
+		memcpy(buf, source, byte_count);
+
+		// release the old buffer only *after* the copy, to protect against overlapping memory and/or allocation failures
+		if (header & header_mask) alloc->deallocate_string(dest);
+
+		dest = buf;
+		header |= header_mask; // the string is now allocated
+
+		return true;
+	}
+
+	// Tracks one pending "hole" of removed characters while a string is rewritten in place.
+	// `end` is the first character after the hole (0 when there is none); `size` is the total
+	// number of characters removed so far.
+	struct gap
+	{
+		char_t* end;
+		size_t size;
+
+		gap(): end(0), size(0)
+		{
+		}
+
+		// Opens a new hole of `count` characters at s and advances s past it.
+		// A hole that already exists is closed first by shifting the text between it and s down.
+		void push(char_t*& s, size_t count)
+		{
+			if (end)
+			{
+				// Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+				assert(s >= end);
+
+				const size_t tail_bytes = static_cast<size_t>(reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+				memmove(end - size, end, tail_bytes);
+			}
+
+			// the merged hole now ends right after the newly skipped characters
+			s += count;
+			end = s;
+			size += count;
+		}
+
+		// Closes the pending hole (if any) and returns the past-the-end pointer of the compacted text.
+		char_t* flush(char_t* s)
+		{
+			if (!end) return s;
+
+			// Move [old_gap_end, current_pos) to [old_gap_start, ...)
+			assert(s >= end);
+
+			const size_t tail_bytes = static_cast<size_t>(reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+			memmove(end - size, end, tail_bytes);
+
+			return s - size;
+		}
+	};
+	
+	PUGI__FN char_t* strconv_escape(char_t* s, gap& g) // Decodes in place the reference that starts at s (callers pass s at '&'); returns where scanning resumes.
+	{
+		char_t* stre = s + 1; // read cursor, starts right after the '&'
+
+		switch (*stre)
+		{
+			case '#':	// &#... (numeric character reference)
+			{
+				unsigned int ucsc = 0; // code point accumulated from the digits; no overflow check is done here
+
+				if (stre[1] == 'x') // &#x... (hex code)
+				{
+					stre += 2;
+
+					char_t ch = *stre;
+
+					if (ch == ';') return stre; // no digits at all: leave the text unchanged
+
+					for (;;)
+					{
+						if (static_cast<unsigned int>(ch - '0') <= 9)
+							ucsc = 16 * ucsc + (ch - '0');
+						else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5) // (ch | ' ') folds 'A'..'F' onto 'a'..'f'
+							ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
+						else if (ch == ';')
+							break;
+						else // cancel: not a valid reference, leave the text unchanged
+							return stre;
+
+						ch = *++stre;
+					}
+					
+					++stre; // step over ';'
+				}
+				else	// &#... (dec code)
+				{
+					char_t ch = *++stre;
+
+					if (ch == ';') return stre; // no digits at all: leave the text unchanged
+
+					for (;;)
+					{
+						if (static_cast<unsigned int>(ch - '0') <= 9)
+							ucsc = 10 * ucsc + (ch - '0');
+						else if (ch == ';')
+							break;
+						else // cancel: not a valid reference, leave the text unchanged
+							return stre;
+
+						ch = *++stre;
+					}
+					
+					++stre; // step over ';'
+				}
+
+			#ifdef PUGIXML_WCHAR_MODE
+				s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
+			#else
+				s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
+			#endif
+					// s is now whatever the writer returned; the text from there up to stre becomes a gap
+				g.push(s, stre - s);
+				return stre;
+			}
+
+			case 'a':	// &a
+			{
+				++stre;
+
+				if (*stre == 'm') // &am
+				{
+					if (*++stre == 'p' && *++stre == ';') // matched "&amp;", which decodes to '&'
+					{
+						*s++ = '&';
+						++stre;
+							
+						g.push(s, stre - s);
+						return stre;
+					}
+				}
+				else if (*stre == 'p') // &ap
+				{
+					if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // matched "&apos;", which decodes to '\''
+					{
+						*s++ = '\'';
+						++stre;
+
+						g.push(s, stre - s);
+						return stre;
+					}
+				}
+				break;
+			}
+
+			case 'g': // &g
+			{
+				if (*++stre == 't' && *++stre == ';') // matched "&gt;", which decodes to '>'
+				{
+					*s++ = '>';
+					++stre;
+					
+					g.push(s, stre - s);
+					return stre;
+				}
+				break;
+			}
+
+			case 'l': // &l
+			{
+				if (*++stre == 't' && *++stre == ';') // matched "&lt;", which decodes to '<'
+				{
+					*s++ = '<';
+					++stre;
+						
+					g.push(s, stre - s);
+					return stre;
+				}
+				break;
+			}
+
+			case 'q': // &q
+			{
+				if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // matched "&quot;", which decodes to '"'
+				{
+					*s++ = '"';
+					++stre;
+					
+					g.push(s, stre - s);
+					return stre;
+				}
+				break;
+			}
+
+			default:
+				break;
+		}
+		// unrecognized reference: nothing was rewritten, resume at the first character that did not match
+		return stre;
+	}
+
+	// Utility macro for last character handling: true if c equals e, or if c is a zero terminator and the variable named endch in the expanding scope equals e
+	#define ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
+
+	// Normalizes line endings inside a comment body in place ("\r\n" and lone "\r" become "\n")
+	// and zero-terminates the body at its closing "--" ">" sequence. `endch` is consulted by
+	// ENDSWITH when the '>' position holds a zero terminator instead.
+	// Returns the position after the terminator, or 0 if the comment is not terminated.
+	PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
+	{
+		gap g;
+
+		for (;;)
+		{
+			// skip characters the comment scanner has no interest in
+			while (!PUGI__IS_CHARTYPE(*s, ct_parse_comment)) ++s;
+
+			if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+			{
+				// the first character always becomes 0x0a; a following 0x0a is dropped via the gap
+				*s = '\n';
+				++s;
+
+				if (*s == '\n') g.push(s, 1);
+
+				continue;
+			}
+
+			if (s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')) // comment ends here
+			{
+				*g.flush(s) = 0;
+
+				return (s[2] == '>') ? s + 3 : s + 2;
+			}
+
+			if (*s == 0) return 0; // ran out of input inside the comment
+
+			++s;
+		}
+	}
+
+	// Normalizes line endings inside a CDATA body in place ("\r\n" and lone "\r" become "\n")
+	// and zero-terminates the body at its closing "]]" ">" sequence. `endch` is consulted by
+	// ENDSWITH when the '>' position holds a zero terminator instead.
+	// Returns the position of the second ']' of the terminator, or 0 if the section is not terminated.
+	PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
+	{
+		gap g;
+
+		for (;;)
+		{
+			// skip characters the CDATA scanner has no interest in
+			while (!PUGI__IS_CHARTYPE(*s, ct_parse_cdata)) ++s;
+
+			if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+			{
+				// the first character always becomes 0x0a; a following 0x0a is dropped via the gap
+				*s = '\n';
+				++s;
+
+				if (*s == '\n') g.push(s, 1);
+
+				continue;
+			}
+
+			if (s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')) // CDATA ends here
+			{
+				*g.flush(s) = 0;
+
+				return s + 1;
+			}
+
+			if (*s == 0) return 0; // ran out of input inside the section
+
+			++s;
+		}
+	}
+	
+	// Signature of a PCDATA converter: takes the start of the text, returns where parsing resumes.
+	typedef char_t* (*strconv_pcdata_t)(char_t*);
+
+	// PCDATA converter parameterized at compile time by two switches:
+	// opt_eol::value enables line-ending normalization, opt_escape::value enables reference decoding.
+	template <typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+	{
+		// Rewrites the text at s in place. Returns the position just past the terminating '<'
+		// (which has been overwritten, together with any pending gap, by a zero terminator),
+		// or a pointer to the buffer's zero terminator if the text runs to the end of the input.
+		static char_t* parse(char_t* s)
+		{
+			gap g;
+
+			for (;;)
+			{
+				// skip characters the PCDATA scanner has no interest in
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_pcdata)) ++s;
+
+				if (*s == '<') // PCDATA ends here
+				{
+					*g.flush(s) = 0;
+
+					return s + 1;
+				}
+
+				if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+				{
+					// the first character always becomes 0x0a; a following 0x0a is dropped via the gap
+					*s = '\n';
+					++s;
+
+					if (*s == '\n') g.push(s, 1);
+
+					continue;
+				}
+
+				if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+
+					continue;
+				}
+
+				if (*s == 0) return s;
+
+				++s;
+			}
+		}
+	};
+	
+	// Selects the PCDATA converter matching the parse_escapes / parse_eol bits of optmask.
+	PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
+	{
+		PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20);
+
+		// indexed by (eol << 1) | escapes
+		const strconv_pcdata_t converters[4] =
+		{
+			strconv_pcdata_impl<opt_false, opt_false>::parse,
+			strconv_pcdata_impl<opt_false, opt_true>::parse,
+			strconv_pcdata_impl<opt_true, opt_false>::parse,
+			strconv_pcdata_impl<opt_true, opt_true>::parse
+		};
+
+		// shift the two flag bits down to 0..3
+		const unsigned int index = (optmask >> 4) & 3;
+
+		return converters[index];
+	}
+
+	typedef char_t* (*strconv_attribute_t)(char_t*, char_t); // (value start, closing quote character) -> position after the closing quote, or 0 if the value is unterminated
+	
+	template <typename opt_escape> struct strconv_attribute_impl // attribute value converters; opt_escape::value enables reference decoding via strconv_escape
+	{
+		static char_t* parse_wnorm(char_t* s, char_t end_quote) // whitespace normalization: trims both ends and collapses every whitespace run to one ' '
+		{
+			gap g;
+
+			// trim leading whitespaces
+			if (PUGI__IS_CHARTYPE(*s, ct_space))
+			{
+				char_t* str = s;
+				
+				do ++str;
+				while (PUGI__IS_CHARTYPE(*str, ct_space));
+				
+				g.push(s, str - s);
+			}
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
+				
+				if (*s == end_quote)
+				{
+					char_t* str = g.flush(s);
+					// terminate the value, then keep writing zeros backwards over trailing whitespace
+					do *str-- = 0;
+					while (PUGI__IS_CHARTYPE(*str, ct_space));
+				
+					return s + 1;
+				}
+				else if (PUGI__IS_CHARTYPE(*s, ct_space))
+				{
+					*s++ = ' '; // the first whitespace of a run is kept as a plain space
+		
+					if (PUGI__IS_CHARTYPE(*s, ct_space))
+					{
+						char_t* str = s + 1;
+						while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
+						// the rest of the run is removed
+						g.push(s, str - s);
+					}
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0; // end of input before the closing quote
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_wconv(char_t* s, char_t end_quote) // whitespace conversion: every whitespace character becomes ' ', with "\r\n" counted as one
+		{
+			gap g;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (PUGI__IS_CHARTYPE(*s, ct_space))
+				{
+					if (*s == '\r')
+					{
+						*s++ = ' ';
+						// a '\n' right after '\r' belongs to the same line break and is removed
+						if (*s == '\n') g.push(s, 1);
+					}
+					else *s++ = ' ';
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0; // end of input before the closing quote
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_eol(char_t* s, char_t end_quote) // line-ending normalization only: "\r\n" and lone '\r' become '\n'
+		{
+			gap g;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (*s == '\r')
+				{
+					*s++ = '\n';
+					// a '\n' right after '\r' belongs to the same line break and is removed
+					if (*s == '\n') g.push(s, 1);
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0; // end of input before the closing quote
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_simple(char_t* s, char_t end_quote) // no whitespace or line-ending processing; only optional reference decoding
+		{
+			gap g;
+
+			while (true)
+			{
+				while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (opt_escape::value && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0; // end of input before the closing quote
+				}
+				else ++s;
+			}
+		}
+	};
+
+	// Selects the attribute value converter matching the parse_escapes, parse_eol,
+	// parse_wconv_attribute and parse_wnorm_attribute bits of optmask.
+	// Precedence: wnorm over wconv over eol over plain; escapes picks the template argument.
+	PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
+	{
+		PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
+
+		// shift the four flag bits down: bit 0 escapes, bit 1 eol, bit 2 wconv, bit 3 wnorm
+		const unsigned int flags = (optmask >> 4) & 15;
+		const bool escapes = (flags & 1) != 0;
+
+		if (flags & 8)
+			return escapes ? strconv_attribute_impl<opt_true>::parse_wnorm : strconv_attribute_impl<opt_false>::parse_wnorm;
+
+		if (flags & 4)
+			return escapes ? strconv_attribute_impl<opt_true>::parse_wconv : strconv_attribute_impl<opt_false>::parse_wconv;
+
+		if (flags & 2)
+			return escapes ? strconv_attribute_impl<opt_true>::parse_eol : strconv_attribute_impl<opt_false>::parse_eol;
+
+		return escapes ? strconv_attribute_impl<opt_true>::parse_simple : strconv_attribute_impl<opt_false>::parse_simple;
+	}
+
+	// Builds an xml_parse_result carrying the given status and offset; every other field
+	// keeps whatever the default constructor set.
+	inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
+	{
+		xml_parse_result outcome;
+
+		outcome.offset = offset;
+		outcome.status = status;
+
+		return outcome;
+	}
+
+	struct xml_parser
+	{
+		xml_allocator alloc;
+		char_t* error_offset;
+		xml_parse_status error_status;
+		
+		// Parser utilities. These macros expand inside the parse_* methods and use names from the expanding scope: s (read cursor), cursor (current node), optmsk, ch, plus the alloc / error_offset / error_status members.
+		#define PUGI__SKIPWS()			{ while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; } // advance s over whitespace
+		#define PUGI__OPTSET(OPT)			( optmsk & (OPT) ) // non-zero if any of the OPT bits is set in optmsk
+		#define PUGI__PUSHNODE(TYPE)		{ cursor = append_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); } // append a child of TYPE and descend into it
+		#define PUGI__POPNODE()			{ cursor = cursor->parent; } // ascend to the parent node
+		#define PUGI__SCANFOR(X)			{ while (*s != 0 && !(X)) ++s; } // advance s until X holds or the zero terminator is reached
+		#define PUGI__SCANWHILE(X)		{ while ((X)) ++s; } // advance s while X holds (no terminator check of its own)
+		#define PUGI__ENDSEG()			{ ch = *s; *s = 0; ++s; } // save the character at s in ch, zero-terminate there and step over it
+		#define PUGI__THROW_ERROR(err, m)	return error_offset = m, error_status = err, static_cast<char_t*>(0) // record the error position and status, then return 0 from the enclosing function
+		#define PUGI__CHECK_ERROR(err, m)	{ if (*s == 0) PUGI__THROW_ERROR(err, m); } // fail with err if s sits on the zero terminator
+		
+		xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+		{
+		}
+
+		// DOCTYPE consists of nested sections of the following possible types:
+		// <!-- ... -->, <? ... ?>, "...", '...'
+		// <![...]]>
+		// <!...>
+		// First group can not contain nested groups
+		// Second group can contain nested groups of the same type
+		// Third group can contain all other groups
+		// Skips one DOCTYPE primitive starting at s: a quoted string ("..." or '...'),
+		// a <? ... ?> section or a <!-- ... --> comment.
+		// Returns the position just past the primitive. If s does not start a primitive, or the
+		// primitive is not terminated, records status_bad_doctype and returns 0.
+		char_t* parse_doctype_primitive(char_t* s)
+		{
+			if (*s == '"' || *s == '\'')
+			{
+				// quoted string: scan for the matching quote character
+				char_t ch = *s++;
+				PUGI__SCANFOR(*s == ch);
+				if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				s++;
+			}
+			else if (s[0] == '<' && s[1] == '?')
+			{
+				// <? ... ?>
+				s += 2;
+				PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+				if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				s += 2;
+			}
+			else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+			{
+				// <!-- ... -->
+				s += 4;
+				PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+				if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				// step over exactly the three characters of "-->"; advancing by four would swallow the
+				// character after the comment, which may be the '>' closing the DOCTYPE or the
+				// buffer's zero terminator
+				s += 3;
+			}
+			else PUGI__THROW_ERROR(status_bad_doctype, s);
+
+			return s;
+		}
+
+		// Skips a "<![ ... ]]>" section starting at s, including sections of the same kind nested
+		// inside it. Returns the position after the closing "]]>"; records status_bad_doctype and
+		// returns 0 if the input ends first.
+		char_t* parse_doctype_ignore(char_t* s)
+		{
+			assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+			++s;
+
+			while (*s)
+			{
+				// ignore section end
+				if (s[0] == ']' && s[1] == ']' && s[2] == '>') return s + 3;
+
+				// nested ignore section
+				if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+				{
+					s = parse_doctype_ignore(s);
+					if (!s) return s;
+
+					continue;
+				}
+
+				++s;
+			}
+
+			PUGI__THROW_ERROR(status_bad_doctype, s);
+		}
+
+		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel) // Skips a "<!...>" group starting at s, recursing into nested groups; returns the position after its '>' or 0 on error.
+		{
+			assert(s[0] == '<' && s[1] == '!');
+			s++; // step past '<' so the loop below does not treat this group as a nested one
+
+			while (*s)
+			{
+				if (s[0] == '<' && s[1] == '!' && s[2] != '-')
+				{
+					if (s[2] == '[')
+					{
+						// ignore
+						s = parse_doctype_ignore(s);
+						if (!s) return s;
+					}
+					else
+					{
+						// some control group
+						s = parse_doctype_group(s, endch, false);
+						if (!s) return s;
+					}
+				}
+				else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+				{
+					// unknown tag (forbidden), or some primitive group
+					s = parse_doctype_primitive(s);
+					if (!s) return s;
+				}
+				else if (*s == '>')
+				{
+					s++;
+
+					return s;
+				}
+				else s++;
+			}
+			// input ended without '>': acceptable only for the outermost group when the stripped last character (endch) was '>'
+			if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+
+			return s;
+		}
+
+		char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch) // Parses a construct starting at the '!' of "<!": comment, CDATA section or DOCTYPE; returns the position after it or 0 on error.
+		{
+			// parse node contents, starting with exclamation mark
+			++s;
+
+			if (*s == '-') // '<!-...'
+			{
+				++s;
+
+				if (*s == '-') // '<!--...'
+				{
+					++s;
+
+					if (PUGI__OPTSET(parse_comments))
+					{
+						PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
+						cursor->value = s; // Save the offset.
+					}
+
+					if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
+					{
+						s = strconv_comment(s, endch); // normalizes line endings and terminates the comment text
+
+						if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
+					}
+					else
+					{
+						// Scan for terminating '-->'.
+						PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>'));
+						PUGI__CHECK_ERROR(status_bad_comment, s);
+
+						if (PUGI__OPTSET(parse_comments))
+							*s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+						s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
+					}
+				}
+				else PUGI__THROW_ERROR(status_bad_comment, s);
+			}
+			else if (*s == '[')
+			{
+				// '<![CDATA[...'
+				if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
+				{
+					++s;
+
+					if (PUGI__OPTSET(parse_cdata))
+					{
+						PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
+						cursor->value = s; // Save the offset.
+
+						if (PUGI__OPTSET(parse_eol))
+						{
+							s = strconv_cdata(s, endch); // normalizes line endings, terminates the text and returns the position of the second ']'
+
+							if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
+						}
+						else
+						{
+							// Scan for terminating ']]>'.
+							PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
+							PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+							*s++ = 0; // Zero-terminate this segment.
+						}
+					}
+					else // Flagged for discard, but we still have to scan for the terminator.
+					{
+						// Scan for terminating ']]>'.
+						PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
+						PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+						++s;
+					}
+					// in all three paths s now sits on the second ']' of the terminator
+					s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
+				}
+				else PUGI__THROW_ERROR(status_bad_cdata, s);
+			}
+			else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
+			{
+				s -= 2; // back up to the '<' of "<!DOCTYPE"
+
+				if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+				char_t* mark = s + 9; // first character after "<!DOCTYPE"
+
+				s = parse_doctype_group(s, endch, true);
+				if (!s) return s;
+
+				if (PUGI__OPTSET(parse_doctype))
+				{
+					while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
+
+					PUGI__PUSHNODE(node_doctype);
+
+					cursor->value = mark;
+
+					assert((s[0] == 0 && endch == '>') || s[-1] == '>');
+					s[*s == 0 ? 0 : -1] = 0; // zero-terminate the value over the closing '>' (or keep the existing terminator)
+
+					PUGI__POPNODE();
+				}
+			}
+			else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
+			else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
+			else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+
+			return s;
+		}
+
+		char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch) // Parses a "<?...?>" construct starting at the '?'; returns where parsing resumes or 0 on error.
+		{
+			// load into registers
+			xml_node_struct* cursor = ref_cursor;
+			char_t ch = 0;
+
+			// parse node contents, starting with question mark
+			++s;
+
+			// read PI target
+			char_t* target = s;
+
+			if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
+
+			PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
+			PUGI__CHECK_ERROR(status_bad_pi, s);
+
+			// determine node type; stricmp / strcasecmp is not portable
+			bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s; // target is exactly "xml", compared case-insensitively
+
+			if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
+			{
+				if (declaration)
+				{
+					// disallow non top-level declarations
+					if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
+
+					PUGI__PUSHNODE(node_declaration);
+				}
+				else
+				{
+					PUGI__PUSHNODE(node_pi);
+				}
+
+				cursor->name = target;
+
+				PUGI__ENDSEG(); // terminates the target name; ch receives the character that followed it
+
+				// parse value/attributes
+				if (ch == '?')
+				{
+					// empty node
+					if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
+					s += (*s == '>');
+
+					PUGI__POPNODE();
+				}
+				else if (PUGI__IS_CHARTYPE(ch, ct_space))
+				{
+					PUGI__SKIPWS();
+
+					// scan for tag end
+					char_t* value = s;
+
+					PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
+					PUGI__CHECK_ERROR(status_bad_pi, s);
+
+					if (declaration)
+					{
+						// replace ending ? with / so that 'element' terminates properly
+						*s = '/';
+
+						// we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
+						s = value;
+					}
+					else
+					{
+						// store value and step over >
+						cursor->value = value;
+						PUGI__POPNODE();
+
+						PUGI__ENDSEG(); // overwrites the '?' with a zero terminator
+
+						s += (*s == '>');
+					}
+				}
+				else PUGI__THROW_ERROR(status_bad_pi, s);
+			}
+			else
+			{
+				// scan for tag end (this node type is not being parsed, so it is only skipped)
+				PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
+				PUGI__CHECK_ERROR(status_bad_pi, s);
+
+				s += (s[1] == '>' ? 2 : 1);
+			}
+
+			// store from registers
+			ref_cursor = cursor;
+
+			return s;
+		}
+
+		char_t* parse(char_t* s, xml_node_struct* xmldoc, unsigned int optmsk, char_t endch) // Main loop: parses the zero-terminated buffer at s into nodes under xmldoc; endch is the buffer's original last character. Returns 0 on error.
+		{
+			strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
+			strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
+			
+			char_t ch = 0;
+			xml_node_struct* cursor = xmldoc;
+			char_t* mark = s;
+
+			while (*s != 0)
+			{
+				if (*s == '<')
+				{
+					++s;
+
+				LOC_TAG: // entered with s just past a '<'
+					if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
+					{
+						PUGI__PUSHNODE(node_element); // Append a new node to the tree.
+
+						cursor->name = s;
+
+						PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
+						PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+						if (ch == '>')
+						{
+							// end of tag
+						}
+						else if (PUGI__IS_CHARTYPE(ch, ct_space))
+						{
+						LOC_ATTRIBUTES: // also entered via goto after parse_question leaves cursor on a declaration node
+							while (true)
+							{
+								PUGI__SKIPWS(); // Eat any whitespace.
+						
+								if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
+								{
+									xml_attribute_struct* a = append_attribute_ll(cursor, alloc); // Make space for this attribute.
+									if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
+
+									a->name = s; // Save the offset.
+
+									PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
+									PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+									PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+									PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+									if (PUGI__IS_CHARTYPE(ch, ct_space))
+									{
+										PUGI__SKIPWS(); // Eat any whitespace.
+										PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
+
+										ch = *s;
+										++s;
+									}
+									
+									if (ch == '=') // '<... #=...'
+									{
+										PUGI__SKIPWS(); // Eat any whitespace.
+
+										if (*s == '"' || *s == '\'') // '<... #="...'
+										{
+											ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
+											++s; // Step over the quote.
+											a->value = s; // Save the offset.
+
+											s = strconv_attribute(s, ch); // converts the value in place; returns the position after the closing quote or 0
+										
+											if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
+
+											// After this line the loop continues from the start;
+											// Whitespaces, / and > are ok, symbols and EOF are wrong,
+											// everything else will be detected
+											if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
+										}
+										else PUGI__THROW_ERROR(status_bad_attribute, s);
+									}
+									else PUGI__THROW_ERROR(status_bad_attribute, s);
+								}
+								else if (*s == '/')
+								{
+									++s;
+									
+									if (*s == '>')
+									{
+										PUGI__POPNODE();
+										s++;
+										break;
+									}
+									else if (*s == 0 && endch == '>') // the closing '>' was the stripped last character of the buffer
+									{
+										PUGI__POPNODE();
+										break;
+									}
+									else PUGI__THROW_ERROR(status_bad_start_element, s);
+								}
+								else if (*s == '>')
+								{
+									++s;
+
+									break;
+								}
+								else if (*s == 0 && endch == '>') // the closing '>' was the stripped last character of the buffer
+								{
+									break;
+								}
+								else PUGI__THROW_ERROR(status_bad_start_element, s);
+							}
+
+							// !!!
+						}
+						else if (ch == '/') // '<#.../'
+						{
+							if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
+
+							PUGI__POPNODE(); // Pop.
+
+							s += (*s == '>');
+						}
+						else if (ch == 0)
+						{
+							// we stepped over null terminator, backtrack & handle closing tag
+							--s;
+							
+							if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
+						}
+						else PUGI__THROW_ERROR(status_bad_start_element, s);
+					}
+					else if (*s == '/')
+					{
+						++s;
+
+						char_t* name = cursor->name;
+						if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+						// compare the closing tag name with the open element's name, character by character
+						while (PUGI__IS_CHARTYPE(*s, ct_symbol))
+						{
+							if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+						}
+
+						if (*name) // the closing tag name ended before the open element's name did
+						{
+							if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
+							else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+						}
+							
+						PUGI__POPNODE(); // Pop.
+
+						PUGI__SKIPWS();
+
+						if (*s == 0)
+						{
+							if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+						}
+						else
+						{
+							if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+							++s;
+						}
+					}
+					else if (*s == '?') // '<?...'
+					{
+						s = parse_question(s, cursor, optmsk, endch);
+						if (!s) return s;
+
+						assert(cursor);
+						if ((cursor->header & xml_memory_page_type_mask) + 1 == node_declaration) goto LOC_ATTRIBUTES; // header type bits hold (node type - 1); a declaration's attributes are parsed like an element's
+					}
+					else if (*s == '!') // '<!...'
+					{
+						s = parse_exclamation(s, cursor, optmsk, endch);
+						if (!s) return s;
+					}
+					else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
+					else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+				}
+				else
+				{
+					mark = s; // Save this offset while searching for a terminator.
+
+					PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
+
+					if (*s == '<')
+					{
+						// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
+						assert(mark != s);
+
+						if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single))
+						{
+							continue; // whitespace-only text is dropped
+						}
+						else if (PUGI__OPTSET(parse_ws_pcdata_single))
+						{
+							if (s[1] != '/' || cursor->first_child) continue; // kept only when it is the sole content before a closing tag
+						}
+					}
+
+					s = mark; // rewind so that leading whitespace is part of the text
+							
+					if (cursor->parent)
+					{
+						PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+						cursor->value = s; // Save the offset.
+
+						s = strconv_pcdata(s); // converts the text in place; returns the position past '<' or the zero terminator
+								
+						PUGI__POPNODE(); // Pop since this is a standalone.
+						
+						if (!*s) break;
+					}
+					else
+					{
+						PUGI__SCANFOR(*s == '<'); // '...<' (text at document level is skipped, no node is created)
+						if (!*s) break;
+						
+						++s;
+					}
+
+					// We're after '<'
+					goto LOC_TAG;
+				}
+			}
+
+			// check that last tag is closed
+			if (cursor != xmldoc) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+			return s;
+		}
+
+		// Entry point: parses buffer[0..length) into the document that `root` belongs to.
+		// The last character of the buffer is overwritten with a zero terminator; its original
+		// value is handed to the parser as endch. Returns the status and error offset.
+		static xml_parse_result parse(char_t* buffer, size_t length, xml_node_struct* root, unsigned int optmsk)
+		{
+			xml_document_struct* doc = static_cast<xml_document_struct*>(root);
+
+			// store buffer for offset_debug
+			doc->buffer = buffer;
+
+			// an empty document is trivially fine
+			if (length == 0) return make_parse_result(status_ok);
+
+			// the parser works on a copy of the document's allocator state
+			xml_parser parser(*doc);
+
+			// save last character and make buffer zero-terminated (speeds up parsing)
+			const size_t last_index = length - 1;
+			char_t endch = buffer[last_index];
+			buffer[last_index] = 0;
+
+			// errors are reported through parser.error_status / parser.error_offset
+			parser.parse(buffer, doc, optmsk, endch);
+
+			const ptrdiff_t error_pos = parser.error_offset ? parser.error_offset - buffer : 0;
+
+			xml_parse_result result = make_parse_result(parser.error_status, error_pos);
+			assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
+
+			// write the allocator state back into the document
+			*static_cast<xml_allocator*>(doc) = parser.alloc;
+
+			// since we removed last character, we have to handle the only possible false positive:
+			// there's no possible well-formed document with < at the end
+			if (result && endch == '<') return make_parse_result(status_unrecognized_tag, length);
+
+			return result;
+		}
+	};
+
+	// Output facilities
+	// Encoding of char_t strings as held in memory: the wchar_t encoding in
+	// PUGIXML_WCHAR_MODE builds, UTF-8 otherwise.
+	PUGI__FN xml_encoding get_write_native_encoding()
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		const xml_encoding native = get_wchar_encoding();
+	#else
+		const xml_encoding native = encoding_utf8;
+	#endif
+
+		return native;
+	}
+
+	// Resolves the generic output encodings (wchar, utf16, utf32, auto) to concrete ones;
+	// any other value is returned unchanged.
+	PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
+	{
+		switch (encoding)
+		{
+		// replace wchar encoding with utf implementation
+		case encoding_wchar:
+			return get_wchar_encoding();
+
+		// replace utf16 encoding with utf16 with specific endianness
+		case encoding_utf16:
+			return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+		// replace utf32 encoding with utf32 with specific endianness
+		case encoding_utf32:
+			return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+		// no explicit encoding requested: assume utf8
+		case encoding_auto:
+			return encoding_utf8;
+
+		// an explicit, already concrete encoding
+		default:
+			return encoding;
+		}
+	}
+
+#ifdef PUGIXML_WCHAR_MODE
+	// Returns how many leading characters of data[0..length) can be converted as a chunk:
+	// when wchar_t is 2 bytes wide and the last unit is in the range 0xD800..0xDBFF
+	// (a surrogate lead), that last unit is left out.
+	PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+	{
+		assert(length > 0);
+
+		// surrogate pairs only matter when wchar_t is a 16-bit unit
+		if (sizeof(wchar_t) != 2) return length;
+
+		const unsigned int last_unit = static_cast<uint16_t>(data[length - 1]);
+		const bool is_surrogate_lead = static_cast<unsigned int>(last_unit - 0xD800) < 0x400;
+
+		// discard last character if it's the lead of a surrogate pair
+		return is_surrogate_lead ? length - 1 : length;
+	}
+
+	// Converts `length` wchar_t characters from `data` to `encoding`, writing into whichever of
+	// the r_* output views matches the target unit type. Returns the number of bytes produced.
+	PUGI__FN size_t convert_buffer(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+	{
+		// same unit width as wchar_t, opposite byte order: only endian-swapping is required
+		if (need_endian_swap_utf(encoding, get_wchar_encoding()))
+		{
+			convert_wchar_endian_swap(r_char, data, length);
+
+			return length * sizeof(char_t);
+		}
+
+		// convert to utf8
+		if (encoding == encoding_utf8)
+		{
+			uint8_t* out_end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, r_u8);
+
+			return static_cast<size_t>(out_end - r_u8);
+		}
+
+		// convert to utf16
+		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+		{
+			// convert to native utf16
+			uint16_t* out_end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, r_u16);
+			const size_t units = static_cast<size_t>(out_end - r_u16);
+
+			// swap in place if the requested byte order is not the native one
+			const xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			if (native_encoding != encoding) convert_utf_endian_swap(r_u16, r_u16, units);
+
+			return units * sizeof(uint16_t);
+		}
+
+		// convert to utf32
+		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+		{
+			// convert to native utf32
+			uint32_t* out_end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, r_u32);
+			const size_t units = static_cast<size_t>(out_end - r_u32);
+
+			// swap in place if the requested byte order is not the native one
+			const xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			if (native_encoding != encoding) convert_utf_endian_swap(r_u32, r_u32, units);
+
+			return units * sizeof(uint32_t);
+		}
+
+		// convert to latin1
+		if (encoding == encoding_latin1)
+		{
+			uint8_t* out_end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, r_u8);
+
+			return static_cast<size_t>(out_end - r_u8);
+		}
+
+		assert(!"Invalid encoding");
+		return 0;
+	}
+#else
+	// Returns how many leading bytes of the UTF-8 data[0..length) can be converted as a chunk:
+	// the chunk is cut just before the last byte (among the final four) that is not a
+	// continuation byte. If all four are continuation bytes the whole length is returned.
+	PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+	{
+		assert(length > 4);
+
+		size_t pos = length;
+
+		for (int remaining = 4; remaining > 0; --remaining)
+		{
+			--pos;
+
+			const uint8_t byte = static_cast<uint8_t>(data[pos]);
+			const bool is_continuation = (byte & 0xc0) == 0x80;
+
+			// either a standalone character or a leading one
+			if (!is_continuation) return pos;
+		}
+
+		// there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
+		return length;
+	}
+
+	// Re-encodes length UTF-8 code units from data into one of the scratch outputs, selected by encoding:
+	// r_u16 for UTF-16, r_u32 for UTF-32, r_u8 for Latin-1 (the r_char output is unused in this variant).
+	// Returns the size of the converted data in bytes; any other encoding asserts and yields 0.
+	PUGI__FN size_t convert_buffer(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+	{
+		const uint8_t* utf8 = reinterpret_cast<const uint8_t*>(data);
+
+		// utf8 -> utf16, either byte order
+		if (encoding == encoding_utf16_le || encoding == encoding_utf16_be)
+		{
+			// decode into native-endian code units first
+			uint16_t* out_end = utf_decoder<utf16_writer>::decode_utf8_block(utf8, length, r_u16);
+			size_t units = static_cast<size_t>(out_end - r_u16);
+
+			// swap bytes in place when the requested order is not the host order
+			xml_encoding host_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+			if (encoding != host_encoding) convert_utf_endian_swap(r_u16, r_u16, units);
+
+			return units * sizeof(uint16_t);
+		}
+
+		// utf8 -> utf32, either byte order
+		if (encoding == encoding_utf32_le || encoding == encoding_utf32_be)
+		{
+			// decode into native-endian code units first
+			uint32_t* out_end = utf_decoder<utf32_writer>::decode_utf8_block(utf8, length, r_u32);
+			size_t units = static_cast<size_t>(out_end - r_u32);
+
+			// swap bytes in place when the requested order is not the host order
+			xml_encoding host_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+			if (encoding != host_encoding) convert_utf_endian_swap(r_u32, r_u32, units);
+
+			return units * sizeof(uint32_t);
+		}
+
+		// utf8 -> latin1
+		if (encoding == encoding_latin1)
+		{
+			uint8_t* out_end = utf_decoder<latin1_writer>::decode_utf8_block(utf8, length, r_u8);
+
+			return static_cast<size_t>(out_end - r_u8);
+		}
+
+		assert(!"Invalid encoding");
+		return 0;
+	}
+#endif
+
+	// Collects char_t output in a fixed-size buffer and passes it on to an xml_writer,
+	// re-encoding through a scratch area whenever the target encoding is not the native one.
+	class xml_buffered_writer
+	{
+		// copying is disabled: both members are private and have no body here
+		xml_buffered_writer(const xml_buffered_writer&);
+		xml_buffered_writer& operator=(const xml_buffered_writer&);
+
+	public:
+		// Starts with an empty buffer; the effective encoding is derived from user_encoding via get_write_encoding.
+		xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
+		{
+			PUGI__STATIC_ASSERT(bufcapacity >= 8);
+		}
+
+		// Whatever is still buffered is written out on destruction.
+		~xml_buffered_writer()
+		{
+			flush();
+		}
+
+		// Writes out the buffered characters and resets the buffer to empty.
+		void flush()
+		{
+			flush(buffer, bufsize);
+			bufsize = 0;
+		}
+
+		// Writes size characters starting at data straight to the underlying writer, converting them if needed.
+		void flush(const char_t* data, size_t size)
+		{
+			if (size == 0) return;
+
+			if (encoding != get_write_native_encoding())
+			{
+				// slow path: convert the chunk into scratch storage, then write the converted bytes
+				size_t converted = convert_buffer(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
+				assert(converted <= sizeof(scratch));
+
+				writer.write(scratch.data_u8, converted);
+				return;
+			}
+
+			// fast path: the data already is in the target encoding
+			writer.write(data, size * sizeof(char_t));
+		}
+
+		// Appends length characters; data that does not fit triggers a flush, and data larger than the
+		// whole buffer is written out directly (native encoding) or in codepoint-aligned chunks.
+		void write(const char_t* data, size_t length)
+		{
+			const bool fits = bufsize + length <= bufcapacity;
+
+			if (!fits)
+			{
+				// make room by emitting what has been buffered so far
+				flush();
+
+				if (length > bufcapacity)
+				{
+					if (encoding == get_write_native_encoding())
+					{
+						// no conversion required, so the big block can bypass the buffer entirely
+						writer.write(data, length * sizeof(char_t));
+						return;
+					}
+
+					// conversion required: feed the converter pieces that fit into scratch
+					do
+					{
+						// take as many characters as are guaranteed to fit while ending on a
+						// codepoint boundary (the start of a split codepoint is left for the next piece)
+						size_t piece = get_valid_length(data, bufcapacity);
+
+						flush(data, piece);
+
+						data += piece;
+						length -= piece;
+					}
+					while (length > bufcapacity);
+
+					// the remaining tail is small enough to be buffered below
+					bufsize = 0;
+				}
+			}
+
+			memcpy(buffer + bufsize, data, length * sizeof(char_t));
+			bufsize += length;
+		}
+
+		// Appends a null-terminated string.
+		void write(const char_t* data)
+		{
+			write(data, strlength(data));
+		}
+
+		// The overloads below append one to six individual characters, flushing first if they would not fit.
+		void write(char_t d0)
+		{
+			if (bufsize + 1 > bufcapacity) flush();
+
+			char_t* out = buffer + bufsize;
+
+			out[0] = d0;
+			bufsize += 1;
+		}
+
+		void write(char_t d0, char_t d1)
+		{
+			if (bufsize + 2 > bufcapacity) flush();
+
+			char_t* out = buffer + bufsize;
+
+			out[0] = d0;
+			out[1] = d1;
+			bufsize += 2;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2)
+		{
+			if (bufsize + 3 > bufcapacity) flush();
+
+			char_t* out = buffer + bufsize;
+
+			out[0] = d0;
+			out[1] = d1;
+			out[2] = d2;
+			bufsize += 3;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2, char_t d3)
+		{
+			if (bufsize + 4 > bufcapacity) flush();
+
+			char_t* out = buffer + bufsize;
+
+			out[0] = d0;
+			out[1] = d1;
+			out[2] = d2;
+			out[3] = d3;
+			bufsize += 4;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
+		{
+			if (bufsize + 5 > bufcapacity) flush();
+
+			char_t* out = buffer + bufsize;
+
+			out[0] = d0;
+			out[1] = d1;
+			out[2] = d2;
+			out[3] = d3;
+			out[4] = d4;
+			bufsize += 5;
+		}
+
+		void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
+		{
+			if (bufsize + 6 > bufcapacity) flush();
+
+			char_t* out = buffer + bufsize;
+
+			out[0] = d0;
+			out[1] = d1;
+			out[2] = d2;
+			out[3] = d3;
+			out[4] = d4;
+			out[5] = d5;
+			bufsize += 6;
+		}
+
+		// Worst-case growth of one source code unit during conversion:
+		//   utf8  -> utf32: x4
+		//   utf16 -> utf32: x2
+		//   utf32:          x1
+		// The byte budget is therefore split so that each character gets sizeof(char_t) + 4 bytes.
+		enum
+		{
+			bufcapacitybytes =
+			#ifdef PUGIXML_MEMORY_OUTPUT_STACK
+				PUGIXML_MEMORY_OUTPUT_STACK
+			#else
+				10240
+			#endif
+			,
+			bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4)
+		};
+
+		// pending characters in the native encoding
+		char_t buffer[bufcapacity];
+
+		// conversion output, viewed as whichever code unit type the target encoding uses
+		union
+		{
+			uint8_t data_u8[4 * bufcapacity];
+			uint16_t data_u16[2 * bufcapacity];
+			uint32_t data_u32[bufcapacity];
+			char_t data_char[bufcapacity];
+		} scratch;
+
+		xml_writer& writer;
+		size_t bufsize;
+		xml_encoding encoding;
+	};
+
+	// Writes s with escaping applied: runs of characters that are not flagged by type are copied verbatim,
+	// '&', '<', '>' and '"' become named entities, and any other flagged character (asserted to be a
+	// control character below 32) becomes a two-digit decimal character reference.
+	PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
+	{
+		while (*s)
+		{
+			// copy the run of ordinary characters in one go
+			const char_t* run = s;
+
+			while (!PUGI__IS_CHARTYPEX(*s, type)) ++s;
+
+			writer.write(run, static_cast<size_t>(s - run));
+
+			// the scan stopped either at the terminator or at a character that needs escaping
+			const char_t special = *s;
+
+			if (special == 0) break;
+
+			++s;
+
+			if (special == '&')
+				writer.write('&', 'a', 'm', 'p', ';');
+			else if (special == '<')
+				writer.write('&', 'l', 't', ';');
+			else if (special == '>')
+				writer.write('&', 'g', 't', ';');
+			else if (special == '"')
+				writer.write('&', 'q', 'u', 'o', 't', ';');
+			else
+			{
+				// remaining special characters are control codes
+				unsigned int code = static_cast<unsigned int>(special);
+				assert(code < 32);
+
+				writer.write('&', '#', static_cast<char_t>((code / 10) + '0'), static_cast<char_t>((code % 10) + '0'), ';');
+			}
+		}
+	}
+
+	// Writes s either verbatim (format_no_escapes set in flags) or escaped according to type.
+	PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
+	{
+		const bool escape = (flags & format_no_escapes) == 0;
+
+		if (escape)
+		{
+			text_output_escaped(writer, s, type);
+			return;
+		}
+
+		writer.write(s);
+	}
+
+	// Writes s as one or more CDATA sections. Because "]]>" would end a section early, the text is
+	// split after "]]" and the '>' starts the next section. An empty string still yields one empty section.
+	PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
+	{
+		for (;;)
+		{
+			writer.write('<', '!', '[', 'C', 'D');
+			writer.write('A', 'T', 'A', '[');
+
+			const char_t* begin = s;
+
+			// advance to the terminator or to the next "]]>"
+			while (*s && (s[0] != ']' || s[1] != ']' || s[2] != '>')) ++s;
+
+			// when stopped at "]]>", keep "]]" in this section and leave '>' for the following one
+			if (*s) s += 2;
+
+			writer.write(begin, static_cast<size_t>(s - begin));
+
+			writer.write(']', ']', '>');
+
+			if (!*s) break;
+		}
+	}
+
+	// Writes every attribute of node as ' name="value"'; an attribute with an empty name is written
+	// as ":anonymous", and the value goes through text_output with attribute escaping rules.
+	PUGI__FN void node_output_attributes(xml_buffered_writer& writer, const xml_node& node, unsigned int flags)
+	{
+		xml_attribute attr = node.first_attribute();
+
+		while (attr)
+		{
+			const char_t* attr_name = attr.name();
+
+			if (!attr_name[0]) attr_name = PUGIXML_TEXT(":anonymous");
+
+			writer.write(' ');
+			writer.write(attr_name);
+			writer.write('=', '"');
+
+			text_output(writer, attr.value(), ctx_special_attr, flags);
+
+			writer.write('"');
+
+			attr = attr.next_attribute();
+		}
+	}
+
+	// Serializes node and, recursively, its subtree to writer.
+	//   indent - string written once per depth level when format_indent is set and format_raw is not
+	//   flags  - format_* bits; format_raw suppresses both indentation and the newline after each node
+	//   depth  - nesting level of node, used for indentation only
+	// Elements and processing instructions with an empty name are written as ":anonymous".
+	PUGI__FN void node_output(xml_buffered_writer& writer, const xml_node& node, const char_t* indent, unsigned int flags, unsigned int depth)
+	{
+		const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+		const bool raw = (flags & format_raw) != 0;
+		const bool indented = !raw && (flags & format_indent) != 0;
+
+		if (indented)
+			for (unsigned int level = 0; level < depth; ++level) writer.write(indent);
+
+		switch (node.type())
+		{
+		case node_document:
+		{
+			// the document itself produces no markup; its children stay at the same depth
+			for (xml_node child = node.first_child(); child; child = child.next_sibling())
+				node_output(writer, child, indent, flags, depth);
+			break;
+		}
+
+		case node_element:
+		{
+			const char_t* name = node.name()[0] ? node.name() : default_name;
+			xml_node first = node.first_child();
+
+			writer.write('<');
+			writer.write(name);
+
+			node_output_attributes(writer, node, flags);
+
+			if (!first)
+			{
+				// no children: self-closing tag
+				if (raw)
+					writer.write(' ', '/', '>');
+				else
+					writer.write(' ', '/', '>', '\n');
+
+				break;
+			}
+
+			if (!raw && first == node.last_child() && (first.type() == node_pcdata || first.type() == node_cdata))
+			{
+				// a single text child is kept on the same line as the tags
+				writer.write('>');
+
+				if (first.type() == node_pcdata)
+					text_output(writer, first.value(), ctx_special_pcdata, flags);
+				else
+					text_output_cdata(writer, first.value());
+
+				writer.write('<', '/');
+				writer.write(name);
+				writer.write('>', '\n');
+
+				break;
+			}
+
+			// general case: opening tag, children one level deeper, closing tag
+			if (raw)
+				writer.write('>');
+			else
+				writer.write('>', '\n');
+
+			for (xml_node child = first; child; child = child.next_sibling())
+				node_output(writer, child, indent, flags, depth + 1);
+
+			if (indented)
+				for (unsigned int level = 0; level < depth; ++level) writer.write(indent);
+
+			writer.write('<', '/');
+			writer.write(name);
+
+			if (raw)
+				writer.write('>');
+			else
+				writer.write('>', '\n');
+
+			break;
+		}
+
+		case node_pcdata:
+			text_output(writer, node.value(), ctx_special_pcdata, flags);
+			if (!raw) writer.write('\n');
+			break;
+
+		case node_cdata:
+			text_output_cdata(writer, node.value());
+			if (!raw) writer.write('\n');
+			break;
+
+		case node_comment:
+			writer.write('<', '!', '-', '-');
+			writer.write(node.value());
+			writer.write('-', '-', '>');
+			if (!raw) writer.write('\n');
+			break;
+
+		case node_pi:
+		case node_declaration:
+			writer.write('<', '?');
+			writer.write(node.name()[0] ? node.name() : default_name);
+
+			if (node.type() == node_declaration)
+			{
+				// declarations carry attributes
+				node_output_attributes(writer, node, flags);
+			}
+			else if (node.value()[0])
+			{
+				// processing instructions carry an optional value after the target name
+				writer.write(' ');
+				writer.write(node.value());
+			}
+
+			writer.write('?', '>');
+			if (!raw) writer.write('\n');
+			break;
+
+		case node_doctype:
+			writer.write('<', '!', 'D', 'O', 'C');
+			writer.write('T', 'Y', 'P', 'E');
+
+			if (node.value()[0])
+			{
+				writer.write(' ');
+				writer.write(node.value());
+			}
+
+			writer.write('>');
+			if (!raw) writer.write('\n');
+			break;
+
+		default:
+			assert(!"Invalid node type");
+		}
+	}
+
+	// Tells whether a declaration node occurs among the children of node before the first element child.
+	inline bool has_declaration(const xml_node& node)
+	{
+		xml_node child = node.first_child();
+
+		while (child)
+		{
+			const xml_node_type kind = child.type();
+
+			// the search ends at the first element
+			if (kind == node_element) break;
+			if (kind == node_declaration) return true;
+
+			child = child.next_sibling();
+		}
+
+		return false;
+	}
+
+	// Checks whether a node of type child may be inserted under a node of type parent:
+	// only documents and elements accept children, document/null nodes are never children,
+	// and declaration/doctype nodes are accepted by documents only.
+	inline bool allow_insert_child(xml_node_type parent, xml_node_type child)
+	{
+		const bool parent_is_document = (parent == node_document);
+
+		if (!parent_is_document && parent != node_element) return false;
+		if (child == node_document || child == node_null) return false;
+
+		const bool document_only_child = (child == node_declaration || child == node_doctype);
+
+		return parent_is_document || !document_only_child;
+	}
+
+	// Copies the contents of source into dest, which must already have the same node type.
+	// For elements the whole subtree is duplicated, except for the child equal to skip.
+	PUGI__FN void recursive_copy_skip(xml_node& dest, const xml_node& source, const xml_node& skip)
+	{
+		assert(dest.type() == source.type());
+
+		switch (source.type())
+		{
+		case node_element:
+		{
+			dest.set_name(source.name());
+
+			// attributes first ...
+			xml_attribute attr = source.first_attribute();
+
+			while (attr)
+			{
+				dest.append_attribute(attr.name()).set_value(attr.value());
+				attr = attr.next_attribute();
+			}
+
+			// ... then children, each one copied recursively into a freshly appended node of the same type
+			for (xml_node child = source.first_child(); child; child = child.next_sibling())
+			{
+				if (child != skip)
+				{
+					xml_node copy = dest.append_child(child.type());
+					assert(copy);
+
+					recursive_copy_skip(copy, child, skip);
+				}
+			}
+
+			break;
+		}
+
+		case node_pcdata:
+		case node_cdata:
+		case node_comment:
+		case node_doctype:
+			// value-only nodes
+			dest.set_value(source.value());
+			break;
+
+		case node_pi:
+			// processing instructions have both a name and a value
+			dest.set_name(source.name());
+			dest.set_value(source.value());
+			break;
+
+		case node_declaration:
+		{
+			// declarations have a name and attributes
+			dest.set_name(source.name());
+
+			xml_attribute attr = source.first_attribute();
+
+			while (attr)
+			{
+				dest.append_attribute(attr.name()).set_value(attr.value());
+				attr = attr.next_attribute();
+			}
+
+			break;
+		}
+
+		default:
+			assert(!"Invalid node type");
+		}
+	}
+
+	// Tells whether node is a PCDATA or CDATA node; the type is decoded from the node header
+	// the same way xml_node::type() does it (masked type bits plus one).
+	inline bool is_text_node(xml_node_struct* node)
+	{
+		const uintptr_t type_bits = node->header & impl::xml_memory_page_type_mask;
+		const xml_node_type kind = static_cast<xml_node_type>(type_bits + 1);
+
+		if (kind == node_pcdata) return true;
+
+		return kind == node_cdata;
+	}
+
+	// get value with conversion functions
+	// Parses value as a base-10 integer (wcstol/strtol, narrowed to int); returns def when value is null.
+	PUGI__FN int get_value_int(const char_t* value, int def)
+	{
+		if (value == 0) return def;
+
+	#ifdef PUGIXML_WCHAR_MODE
+		const long parsed = wcstol(value, 0, 10);
+	#else
+		const long parsed = strtol(value, 0, 10);
+	#endif
+
+		return static_cast<int>(parsed);
+	}
+
+	// Parses value as a base-10 unsigned integer (wcstoul/strtoul, narrowed to unsigned int); returns def when value is null.
+	PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
+	{
+		if (value == 0) return def;
+
+	#ifdef PUGIXML_WCHAR_MODE
+		const unsigned long parsed = wcstoul(value, 0, 10);
+	#else
+		const unsigned long parsed = strtoul(value, 0, 10);
+	#endif
+
+		return static_cast<unsigned int>(parsed);
+	}
+
+	// Parses value as a floating-point number (wcstod/strtod); returns def when value is null.
+	PUGI__FN double get_value_double(const char_t* value, double def)
+	{
+		if (value == 0) return def;
+
+	#ifdef PUGIXML_WCHAR_MODE
+		const double parsed = wcstod(value, 0);
+	#else
+		const double parsed = strtod(value, 0);
+	#endif
+
+		return parsed;
+	}
+
+	// Parses value as a floating-point number (wcstod/strtod) and narrows it to float; returns def when value is null.
+	PUGI__FN float get_value_float(const char_t* value, float def)
+	{
+		if (value == 0) return def;
+
+	#ifdef PUGIXML_WCHAR_MODE
+		const double parsed = wcstod(value, 0);
+	#else
+		const double parsed = strtod(value, 0);
+	#endif
+
+		return static_cast<float>(parsed);
+	}
+
+	// Interprets value as a boolean by its first character only: '1', 't', 'T', 'y' or 'Y' mean true,
+	// anything else (including an empty string) means false. Returns def when value is null.
+	PUGI__FN bool get_value_bool(const char_t* value, bool def)
+	{
+		if (value == 0) return def;
+
+		switch (*value)
+		{
+		case '1': // 1*
+		case 't': // t* (true)
+		case 'T': // T* (True)
+		case 'y': // y* (yes)
+		case 'Y': // Y* (YES)
+			return true;
+
+		default:
+			return false;
+		}
+	}
+
+	// set value with conversion functions
+	// Stores the ASCII text held in buf into dest via strcpy_insitu, widening it first in wchar mode.
+	PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128])
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		char_t wide[128];
+		impl::widen_ascii(wide, buf);
+
+		const char_t* text = wide;
+	#else
+		const char_t* text = buf;
+	#endif
+
+		return strcpy_insitu(dest, header, header_mask, text);
+	}
+
+	// Formats value with "%d" and stores the resulting text into dest.
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value)
+	{
+		char text[128];
+
+		sprintf(text, "%d", value);
+
+		return set_value_buffer(dest, header, header_mask, text);
+	}
+
+	// Formats value with "%u" and stores the resulting text into dest.
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value)
+	{
+		char text[128];
+
+		sprintf(text, "%u", value);
+
+		return set_value_buffer(dest, header, header_mask, text);
+	}
+
+	// Formats value with "%g" and stores the resulting text into dest.
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value)
+	{
+		char text[128];
+
+		sprintf(text, "%g", value);
+
+		return set_value_buffer(dest, header, header_mask, text);
+	}
+	
+	// Stores the literal "true" or "false" into dest, depending on value.
+	PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value)
+	{
+		const char_t* text = PUGIXML_TEXT("false");
+
+		if (value) text = PUGIXML_TEXT("true");
+
+		return strcpy_insitu(dest, header, header_mask, text);
+	}
+
+	// Determines the size of file with the seek-to-end/tell trick (the only reasonably portable way)
+	// and rewinds the file to its start. On success stores the size in out_result and returns status_ok;
+	// returns status_io_error if tell reports a negative position and status_out_of_memory if the
+	// length does not fit into size_t.
+	PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result)
+	{
+	#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+		// this CRT offers 64-bit seek/tell
+		typedef __int64 length_type;
+
+		_fseeki64(file, 0, SEEK_END);
+		length_type length = _ftelli64(file);
+		_fseeki64(file, 0, SEEK_SET);
+	#elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && !defined(__STRICT_ANSI__)
+		// MinGW with large file support offers 64-bit seek/tell as well
+		typedef off64_t length_type;
+
+		fseeko64(file, 0, SEEK_END);
+		length_type length = ftello64(file);
+		fseeko64(file, 0, SEEK_SET);
+	#else
+		// long suffices on 32-bit systems and is 64-bit on unix; elsewhere there is no better option
+		typedef long length_type;
+
+		fseek(file, 0, SEEK_END);
+		length_type length = ftell(file);
+		fseek(file, 0, SEEK_SET);
+	#endif
+
+		// a negative position signals an I/O failure
+		if (length < 0) return status_io_error;
+
+		// the round trip through size_t must not lose information
+		const size_t narrowed = static_cast<size_t>(length);
+		const bool representable = static_cast<length_type>(narrowed) == length;
+
+		if (!representable) return status_out_of_memory;
+
+		out_result = narrowed;
+
+		return status_ok;
+	}
+
+	// Reads an already opened file completely into memory and parses it into doc.
+	// A null file yields status_file_not_found; otherwise the file is closed on every path.
+	// The buffer is released here on a short read, else handed to load_buffer_inplace_own.
+	PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
+	{
+		if (!file) return make_parse_result(status_file_not_found);
+
+		// find out how much to read (this can fail with an I/O error)
+		size_t size = 0;
+		xml_parse_status status = get_file_size(file, size);
+
+		// reserve room for the whole file; at least one byte is requested even for an empty file
+		char* contents = 0;
+
+		if (status == status_ok)
+		{
+			contents = static_cast<char*>(xml_memory::allocate(size > 0 ? size : 1));
+
+			if (!contents) status = status_out_of_memory;
+		}
+
+		if (status != status_ok)
+		{
+			fclose(file);
+			return make_parse_result(status);
+		}
+
+		// pull the data in and release the file right away
+		const size_t got = fread(contents, 1, size, file);
+		fclose(file);
+
+		if (got == size) return doc.load_buffer_inplace_own(contents, size, options, encoding);
+
+		// short read
+		xml_memory::deallocate(contents);
+		return make_parse_result(status_io_error);
+	}
+
+#ifndef PUGIXML_NO_STL
+	// One link in a singly linked list of fixed-size buffers; used to collect the contents of a stream
+	// whose length is not known in advance. size is the number of bytes of data in use.
+	template <typename T> struct xml_stream_chunk
+	{
+		// Allocates an empty chunk through xml_memory. Returns 0 when the allocation fails, so that
+		// placement new is never applied to a null pointer (callers check the result for null).
+		static xml_stream_chunk* create()
+		{
+			void* memory = xml_memory::allocate(sizeof(xml_stream_chunk));
+			if (!memory) return 0;
+
+			return new (memory) xml_stream_chunk();
+		}
+
+		// Frees the chunk pointed to by ptr together with every chunk linked after it; a null ptr is a no-op.
+		static void destroy(void* ptr)
+		{
+			xml_stream_chunk* chunk = static_cast<xml_stream_chunk*>(ptr);
+
+			// free chunk chain
+			while (chunk)
+			{
+				xml_stream_chunk* next = chunk->next;
+				xml_memory::deallocate(chunk);
+				chunk = next;
+			}
+		}
+
+		xml_stream_chunk(): next(0), size(0)
+		{
+		}
+
+		xml_stream_chunk* next;
+		size_t size;
+
+		T data[xml_memory_page_size / sizeof(T)];
+	};
+
+	// Reads the remainder of a stream that cannot report its length: the data is gathered into a list
+	// of chunks and then copied into one contiguous buffer allocated with xml_memory.
+	// On success *out_buffer receives the buffer (to be released by the caller) and *out_size its size in bytes.
+	// Returns status_io_error on stream failure and status_out_of_memory when an allocation fails
+	// or the total size overflows size_t.
+	template <typename T> PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+	{
+		// the holder frees the whole chunk list on every exit path
+		buffer_holder chunks(0, xml_stream_chunk<T>::destroy);
+
+		// read file to a chunk list
+		size_t total = 0;
+		xml_stream_chunk<T>* last = 0;
+
+		while (!stream.eof())
+		{
+			// allocate new chunk
+			xml_stream_chunk<T>* chunk = xml_stream_chunk<T>::create();
+			if (!chunk) return status_out_of_memory;
+
+			// append chunk to list
+			if (last) last = last->next = chunk;
+			else chunks.data = last = chunk;
+
+			// read data to chunk
+			stream.read(chunk->data, static_cast<std::streamsize>(sizeof(chunk->data) / sizeof(T)));
+			chunk->size = static_cast<size_t>(stream.gcount()) * sizeof(T);
+
+			// read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors
+			if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+			// guard against huge files (chunk size is small enough to make this overflow check work)
+			if (total + chunk->size < total) return status_out_of_memory;
+			total += chunk->size;
+		}
+
+		// copy chunk list to a contiguous buffer; at least one byte is requested so that an empty stream
+		// is not reported as out-of-memory by allocators that return null for zero-sized requests
+		char* buffer = static_cast<char*>(xml_memory::allocate(total > 0 ? total : 1));
+		if (!buffer) return status_out_of_memory;
+
+		char* write = buffer;
+
+		for (xml_stream_chunk<T>* chunk = static_cast<xml_stream_chunk<T>*>(chunks.data); chunk; chunk = chunk->next)
+		{
+			assert(write + chunk->size <= buffer + total);
+			memcpy(write, chunk->data, chunk->size);
+			write += chunk->size;
+		}
+
+		assert(write == buffer + total);
+
+		// return buffer
+		*out_buffer = buffer;
+		*out_size = total;
+
+		return status_ok;
+	}
+
+	// Reads the remainder of a seekable stream into a single buffer allocated with xml_memory.
+	// The amount of data is measured by seeking to the end and back. On success *out_buffer receives
+	// the buffer (to be released by the caller) and *out_size the number of bytes actually read.
+	template <typename T> PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+	{
+		// measure the distance from the current position to the end, then go back
+		typename std::basic_istream<T>::pos_type start = stream.tellg();
+		stream.seekg(0, std::ios::end);
+		std::streamoff remaining = stream.tellg() - start;
+		stream.seekg(start);
+
+		if (stream.fail() || start < 0) return status_io_error;
+
+		// the length has to be non-negative and survive the conversion to size_t
+		size_t read_length = static_cast<size_t>(remaining);
+
+		if (remaining < 0 || static_cast<std::streamsize>(read_length) != remaining) return status_out_of_memory;
+
+		// the holder releases the memory should the stream throw while reading
+		const size_t alloc_count = read_length > 0 ? read_length : 1;
+
+		buffer_holder buffer(xml_memory::allocate(alloc_count * sizeof(T)), xml_memory::deallocate);
+		if (!buffer.data) return status_out_of_memory;
+
+		stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
+
+		// a short read (e.g. due to line ending conversion) sets failbit | eofbit, which is fine;
+		// anything else is a real I/O error
+		if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+		// report what was actually read
+		size_t actual_length = static_cast<size_t>(stream.gcount());
+		assert(actual_length <= read_length);
+
+		*out_buffer = buffer.release();
+		*out_size = actual_length * sizeof(T);
+
+		return status_ok;
+	}
+
+	// Loads the remaining contents of stream into memory and parses them into doc.
+	// The seek-based reader is used whenever tellg() works, since it is faster and needs less memory.
+	template <typename T> PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
+	{
+		void* buffer = 0;
+		size_t size = 0;
+
+		xml_parse_status status;
+
+		if (stream.tellg() < 0)
+			status = load_stream_data_noseek(stream, &buffer, &size);
+		else
+			status = load_stream_data_seek(stream, &buffer, &size);
+
+		if (status == status_ok) return doc.load_buffer_inplace_own(buffer, size, options, encoding);
+
+		return make_parse_result(status);
+	}
+#endif
+
+#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && !defined(__STRICT_ANSI__))
+	// Opens a file by wide-character path; this variant simply forwards to the CRT's _wfopen.
+	PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+	{
+		FILE* handle = _wfopen(path, mode);
+
+		return handle;
+	}
+#else
+	// Converts a wide-character string to a UTF-8 copy allocated with xml_memory::allocate
+	// (size + 1 bytes). Returns 0 if the allocation fails; the caller releases the result.
+	PUGI__FN char* convert_path_heap(const wchar_t* str)
+	{
+		assert(str);
+
+		// pass 1: measure the UTF-8 representation
+		const size_t wide_length = wcslen(str);
+		const size_t utf8_size = as_utf8_begin(str, wide_length);
+
+		char* utf8 = static_cast<char*>(xml_memory::allocate(utf8_size + 1));
+
+		// pass 2: perform the conversion into the new buffer
+		if (utf8) as_utf8_end(utf8, utf8_size, str, wide_length);
+
+		return utf8;
+	}
+
+	// Opens a file by wide-character path on platforms without a wide fopen: the path is converted
+	// to UTF-8 and the mode to ASCII (mirroring the _wfopen interface), then fopen is used.
+	// Returns 0 if the mode string does not fit the local buffer, if the path conversion runs out of
+	// memory, or if fopen fails.
+	PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+	{
+		// convert mode to ASCII; refuse modes that would overflow the buffer or lose the terminator
+		char mode_ascii[4] = {0};
+
+		for (size_t i = 0; mode[i]; ++i)
+		{
+			if (i + 1 >= sizeof(mode_ascii)) return 0;
+
+			mode_ascii[i] = static_cast<char>(mode[i]);
+		}
+
+		// there is no standard function to open wide paths, so our best bet is to try utf8 path
+		char* path_utf8 = convert_path_heap(path);
+		if (!path_utf8) return 0;
+
+		// try to open the utf8 path
+		FILE* result = fopen(path_utf8, mode_ascii);
+
+		// free the temporary path copy
+		xml_memory::deallocate(path_utf8);
+
+		return result;
+	}
+#endif
+
+	// Saves doc to an already opened file and closes the file.
+	// Returns false when file is null, when the stream's error indicator is set after writing, or when
+	// closing fails (closing flushes buffered data, so late write errors only become visible there).
+	PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding)
+	{
+		if (!file) return false;
+
+		xml_writer_file writer(file);
+		doc.save(writer, indent, flags, encoding);
+
+		// the error state must be sampled before the stream is closed
+		int write_error = ferror(file);
+		int close_error = fclose(file);
+
+		return write_error == 0 && close_error == 0;
+	}
+PUGI__NS_END
+
+namespace pugi
+{
+	// Wraps an opaque file handle; write() treats it as a FILE*.
+	PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_)
+	{
+	}
+
+	// Writes size bytes from data to the wrapped FILE*. The outcome is deliberately discarded:
+	// this interface has no way to report a failure.
+	PUGI__FN void xml_writer_file::write(const void* data, size_t size)
+	{
+		FILE* stream = static_cast<FILE*>(file);
+
+		size_t written = fwrite(data, 1, size, stream);
+		(void)!written; // unfortunately we can't do proper error handling here
+	}
+
+#ifndef PUGIXML_NO_STL
+	// Targets a narrow-character output stream; the wide stream pointer stays null.
+	PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream): narrow_stream(&stream), wide_stream(0)
+	{
+	}
+
+	// Targets a wide-character output stream; the narrow stream pointer stays null.
+	PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream): narrow_stream(0), wide_stream(&stream)
+	{
+	}
+
+	// Writes size bytes from data to whichever stream this writer was constructed with.
+	// For the wide stream, size must be a multiple of sizeof(wchar_t).
+	PUGI__FN void xml_writer_stream::write(const void* data, size_t size)
+	{
+		if (!narrow_stream)
+		{
+			assert(wide_stream);
+			assert(size % sizeof(wchar_t) == 0);
+
+			const wchar_t* wide_data = reinterpret_cast<const wchar_t*>(data);
+
+			wide_stream->write(wide_data, static_cast<std::streamsize>(size / sizeof(wchar_t)));
+			return;
+		}
+
+		assert(!wide_stream);
+
+		const char* narrow_data = reinterpret_cast<const char*>(data);
+
+		narrow_stream->write(narrow_data, static_cast<std::streamsize>(size));
+	}
+#endif
+
+	// A new walker starts with a depth of zero.
+	PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0)
+	{
+	}
+
+	// Nothing to release.
+	PUGI__FN xml_tree_walker::~xml_tree_walker()
+	{
+	}
+
+	// Returns the depth value currently stored in the walker.
+	PUGI__FN int xml_tree_walker::depth() const
+	{
+		return _depth;
+	}
+
+	// Default implementation of the begin callback: does nothing and returns true.
+	PUGI__FN bool xml_tree_walker::begin(xml_node&)
+	{
+		return true;
+	}
+
+	// Default implementation of the end callback: does nothing and returns true.
+	PUGI__FN bool xml_tree_walker::end(xml_node&)
+	{
+		return true;
+	}
+
+	// Creates an empty handle that refers to no attribute.
+	PUGI__FN xml_attribute::xml_attribute(): _attr(0)
+	{
+	}
+
+	// Creates a handle for the given internal attribute structure (which may be null).
+	PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr)
+	{
+	}
+
+	// Empty function whose address serves as the "true" value of xml_attribute's bool-like conversion.
+	PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***)
+	{
+	}
+
+	// Bool-like conversion: non-null for a handle that refers to an attribute, null for an empty one.
+	PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const
+	{
+		if (!_attr) return 0;
+
+		return unspecified_bool_xml_attribute;
+	}
+
+	// True for an empty handle.
+	PUGI__FN bool xml_attribute::operator!() const
+	{
+		return _attr == 0;
+	}
+
+	// Comparison operators: handles are compared by the address of the attribute structure they refer to.
+	PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const
+	{
+		return _attr == r._attr;
+	}
+
+	PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const
+	{
+		return _attr != r._attr;
+	}
+
+	PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const
+	{
+		return _attr < r._attr;
+	}
+
+	PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const
+	{
+		return _attr > r._attr;
+	}
+
+	PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const
+	{
+		return _attr <= r._attr;
+	}
+
+	PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const
+	{
+		return _attr >= r._attr;
+	}
+
+	// Returns a handle to the following attribute; empty if this handle is empty or there is none.
+	PUGI__FN xml_attribute xml_attribute::next_attribute() const
+	{
+		if (!_attr) return xml_attribute();
+
+		return xml_attribute(_attr->next_attribute);
+	}
+
+	// Returns a handle to the preceding attribute, or an empty handle.
+	// The prev_attribute_c link is only honoured when the attribute it points to has a successor
+	// (presumably the link of the first attribute wraps around to the last one - confirm).
+	PUGI__FN xml_attribute xml_attribute::previous_attribute() const
+	{
+		if (!_attr) return xml_attribute();
+
+		xml_attribute_struct* prev = _attr->prev_attribute_c;
+
+		if (!prev->next_attribute) return xml_attribute();
+
+		return xml_attribute(prev);
+	}
+
+	// Returns the attribute value, or def if the handle is empty or the value is null.
+	PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const
+	{
+		if (_attr && _attr->value) return _attr->value;
+
+		return def;
+	}
+
+	// Typed accessors: each passes the attribute value (null for an empty handle) to the matching
+	// impl::get_value_* conversion, which falls back to def for a null value.
+	PUGI__FN int xml_attribute::as_int(int def) const
+	{
+		const char_t* text = _attr ? _attr->value : 0;
+
+		return impl::get_value_int(text, def);
+	}
+
+	PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const
+	{
+		const char_t* text = _attr ? _attr->value : 0;
+
+		return impl::get_value_uint(text, def);
+	}
+
+	PUGI__FN double xml_attribute::as_double(double def) const
+	{
+		const char_t* text = _attr ? _attr->value : 0;
+
+		return impl::get_value_double(text, def);
+	}
+
+	PUGI__FN float xml_attribute::as_float(float def) const
+	{
+		const char_t* text = _attr ? _attr->value : 0;
+
+		return impl::get_value_float(text, def);
+	}
+
+	PUGI__FN bool xml_attribute::as_bool(bool def) const
+	{
+		const char_t* text = _attr ? _attr->value : 0;
+
+		return impl::get_value_bool(text, def);
+	}
+
+	// True if the handle does not refer to an attribute.
+	PUGI__FN bool xml_attribute::empty() const
+	{
+		return _attr == 0;
+	}
+
+	// Returns the attribute name; an empty string (never null) if the handle is empty or the name is null.
+	PUGI__FN const char_t* xml_attribute::name() const
+	{
+		if (_attr && _attr->name) return _attr->name;
+
+		return PUGIXML_TEXT("");
+	}
+
+	// Returns the attribute value; an empty string (never null) if the handle is empty or the value is null.
+	PUGI__FN const char_t* xml_attribute::value() const
+	{
+		if (_attr && _attr->value) return _attr->value;
+
+		return PUGIXML_TEXT("");
+	}
+
+	// Hash derived from the address of the attribute structure, divided by the structure size.
+	PUGI__FN size_t xml_attribute::hash_value() const
+	{
+		const uintptr_t address = reinterpret_cast<uintptr_t>(_attr);
+
+		return static_cast<size_t>(address / sizeof(xml_attribute_struct));
+	}
+
+	// Exposes the underlying attribute structure pointer (null for an empty handle).
+	PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const
+	{
+		xml_attribute_struct* raw = _attr;
+
+		return raw;
+	}
+
+	// Assignment from a value: shorthand for set_value(rhs). The success flag of set_value is not
+	// reported; the handle itself is returned.
+	PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs)
+	{
+		xml_attribute& self = *this;
+
+		self.set_value(rhs);
+		return self;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(int rhs)
+	{
+		xml_attribute& self = *this;
+
+		self.set_value(rhs);
+		return self;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs)
+	{
+		xml_attribute& self = *this;
+
+		self.set_value(rhs);
+		return self;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(double rhs)
+	{
+		xml_attribute& self = *this;
+
+		self.set_value(rhs);
+		return self;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs)
+	{
+		xml_attribute& self = *this;
+
+		self.set_value(rhs);
+		return self;
+	}
+
+	// Replaces the attribute name with a copy of rhs. Returns false for an empty handle,
+	// otherwise the result of impl::strcpy_insitu.
+	PUGI__FN bool xml_attribute::set_name(const char_t* rhs)
+	{
+		return _attr && impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs);
+	}
+		
+	// Replaces the attribute value with a copy of rhs. Returns false for an empty handle,
+	// otherwise the result of impl::strcpy_insitu.
+	PUGI__FN bool xml_attribute::set_value(const char_t* rhs)
+	{
+		return _attr && impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+	}
+
+	// Typed setters: the value is converted to text by the matching impl::set_value_convert overload
+	// and stored as the attribute value. Each returns false for an empty handle.
+	PUGI__FN bool xml_attribute::set_value(int rhs)
+	{
+		return _attr ? impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_attribute::set_value(unsigned int rhs)
+	{
+		return _attr ? impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_attribute::set_value(double rhs)
+	{
+		return _attr ? impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_attribute::set_value(bool rhs)
+	{
+		return _attr ? impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs) // Borland-only overload: ANDs the attribute's non-emptiness with rhs
+	{
+		return (bool)lhs && rhs; // lhs is converted through its unspecified_bool_type operator
+	}
+
+	PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs) // Borland-only overload: ORs the attribute's non-emptiness with rhs
+	{
+		return (bool)lhs || rhs; // lhs is converted through its unspecified_bool_type operator
+	}
+#endif
+
+	// Creates an empty handle that refers to no node.
+	PUGI__FN xml_node::xml_node(): _root(0)
+	{
+	}
+
+	// Creates a handle for the given internal node structure (which may be null).
+	PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p)
+	{
+	}
+	
+	// Empty function whose address serves as the "true" value of xml_node's bool-like conversion.
+	PUGI__FN static void unspecified_bool_xml_node(xml_node***)
+	{
+	}
+
+	// Bool-like conversion: non-null for a handle that refers to a node, null for an empty one.
+	PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const
+	{
+		if (!_root) return 0;
+
+		return unspecified_bool_xml_node;
+	}
+
+	// True for an empty handle.
+	PUGI__FN bool xml_node::operator!() const
+	{
+		return _root == 0;
+	}
+
+	// Iterator to the first child (a null position for an empty handle), with this node as parent.
+	PUGI__FN xml_node::iterator xml_node::begin() const
+	{
+		xml_node_struct* first = _root ? _root->first_child : 0;
+
+		return iterator(first, _root);
+	}
+
+	// Past-the-end child iterator: null position, with this node as parent.
+	PUGI__FN xml_node::iterator xml_node::end() const
+	{
+		return iterator(0, _root);
+	}
+
+	// Iterator to the first attribute (a null position for an empty handle), with this node as parent.
+	PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const
+	{
+		xml_attribute_struct* first = _root ? _root->first_attribute : 0;
+
+		return attribute_iterator(first, _root);
+	}
+
+	// Past-the-end attribute iterator: null position, with this node as parent.
+	PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const
+	{
+		return attribute_iterator(0, _root);
+	}
+	
+	// Range over all children, built from begin() and end().
+	PUGI__FN xml_object_range<xml_node_iterator> xml_node::children() const
+	{
+		typedef xml_object_range<xml_node_iterator> range_type;
+
+		return range_type(begin(), end());
+	}
+
+	// Range that starts at the first child named name_ and ends at a default-constructed named iterator.
+	PUGI__FN xml_object_range<xml_named_node_iterator> xml_node::children(const char_t* name_) const
+	{
+		xml_named_node_iterator first(child(name_), name_);
+		xml_named_node_iterator last;
+
+		return xml_object_range<xml_named_node_iterator>(first, last);
+	}
+
+	// Range over all attributes, built from attributes_begin() and attributes_end().
+	PUGI__FN xml_object_range<xml_attribute_iterator> xml_node::attributes() const
+	{
+		typedef xml_object_range<xml_attribute_iterator> range_type;
+
+		return range_type(attributes_begin(), attributes_end());
+	}
+
+	PUGI__FN bool xml_node::operator==(const xml_node& r) const // Identity comparison: true when both handles hold the same _root pointer.
+	{
+		return (_root == r._root);
+	}
+
+	PUGI__FN bool xml_node::operator!=(const xml_node& r) const // True when the two handles hold different _root pointers.
+	{
+		return (_root != r._root);
+	}
+
+	PUGI__FN bool xml_node::operator<(const xml_node& r) const // Orders handles by raw _root pointer value, not by document position.
+	{
+		return (_root < r._root);
+	}
+	
+	PUGI__FN bool xml_node::operator>(const xml_node& r) const // Orders handles by raw _root pointer value, not by document position.
+	{
+		return (_root > r._root);
+	}
+	
+	PUGI__FN bool xml_node::operator<=(const xml_node& r) const // Orders handles by raw _root pointer value, not by document position.
+	{
+		return (_root <= r._root);
+	}
+	
+	PUGI__FN bool xml_node::operator>=(const xml_node& r) const // Orders handles by raw _root pointer value, not by document position.
+	{
+		return (_root >= r._root);
+	}
+
+	PUGI__FN bool xml_node::empty() const // True when the handle is null; says nothing about whether the node has children.
+	{
+		return !_root;
+	}
+	
+	PUGI__FN const char_t* xml_node::name() const // Node name, or an empty string when the handle or the stored name is null (never returns null).
+	{
+		return (_root && _root->name) ? _root->name : PUGIXML_TEXT("");
+	}
+
+	PUGI__FN xml_node_type xml_node::type() const // Node type decoded from the header's type bits (stored value + 1); node_null for a null handle.
+	{
+		return _root ? static_cast<xml_node_type>((_root->header & impl::xml_memory_page_type_mask) + 1) : node_null;
+	}
+	
+	PUGI__FN const char_t* xml_node::value() const // Node value, or an empty string when the handle or the stored value is null (never returns null).
+	{
+		return (_root && _root->value) ? _root->value : PUGIXML_TEXT("");
+	}
+	
+	PUGI__FN xml_node xml_node::child(const char_t* name_) const // First child whose name equals name_; null handle if none or if this handle is null.
+	{
+		if (!_root) return xml_node();
+
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if (i->name && impl::strequal(name_, i->name)) return xml_node(i); // children without a name are skipped
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const // First attribute whose name equals name_; null handle if none or if this handle is null.
+	{
+		if (!_root) return xml_attribute();
+
+		for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
+			if (i->name && impl::strequal(name_, i->name)) // attributes without a name are skipped
+				return xml_attribute(i);
+		
+		return xml_attribute();
+	}
+	
+	PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const // Nearest following sibling whose name equals name_; null handle if none.
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
+			if (i->name && impl::strequal(name_, i->name)) return xml_node(i); // siblings without a name are skipped
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::next_sibling() const // Immediately following sibling; null handle if there is none or this handle is null.
+	{
+		if (!_root) return xml_node();
+		
+		if (_root->next_sibling) return xml_node(_root->next_sibling);
+		else return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const // Nearest preceding sibling whose name equals name_; null handle if none.
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c) // stops at a node with null next_sibling: the first sibling's prev_sibling_c is the last sibling (see last_child), so that marks wrap-around
+			if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::previous_sibling() const // Immediately preceding sibling; null handle if there is none or this handle is null.
+	{
+		if (!_root) return xml_node();
+		
+		if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c); // a predecessor with null next_sibling is the wrapped-around last sibling, not a real previous one
+		else return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::parent() const // Handle to the stored parent pointer; null handle when this handle is null.
+	{
+		return _root ? xml_node(_root->parent) : xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::root() const // Handle to the owning document structure, found via this node's memory page; null handle if this handle is null.
+	{
+		if (!_root) return xml_node();
+
+		impl::xml_memory_page* page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask); // the header's pointer bits hold the address of the node's memory page
+
+		return xml_node(static_cast<impl::xml_document_struct*>(page->allocator)); // the page's allocator pointer is cast to the document structure
+	}
+
+	PUGI__FN xml_text xml_node::text() const // Wraps this node's structure pointer in an xml_text helper object.
+	{
+		return xml_text(_root);
+	}
+
+	PUGI__FN const char_t* xml_node::child_value() const // Value of the first child that passes impl::is_text_node and has a non-null value; "" if none.
+	{
+		if (!_root) return PUGIXML_TEXT("");
+		
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if (i->value && impl::is_text_node(i))
+				return i->value;
+
+		return PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const // child_value() of the first child named name_; "" when no such child exists (null handle path).
+	{
+		return child(name_).child_value();
+	}
+
+	PUGI__FN xml_attribute xml_node::first_attribute() const // Handle to the first attribute; null handle if this handle is null or there are no attributes.
+	{
+		return _root ? xml_attribute(_root->first_attribute) : xml_attribute();
+	}
+
+	PUGI__FN xml_attribute xml_node::last_attribute() const // Last attribute, reached through the first attribute's prev_attribute_c link; null handle if there are no attributes.
+	{
+		return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute();
+	}
+
+	PUGI__FN xml_node xml_node::first_child() const // Handle to the first child; null handle if this handle is null or there are no children.
+	{
+		return _root ? xml_node(_root->first_child) : xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::last_child() const // Last child, reached through the first child's prev_sibling_c link; null handle if there are no children.
+	{
+		return _root && _root->first_child ? xml_node(_root->first_child->prev_sibling_c) : xml_node();
+	}
+
+	PUGI__FN bool xml_node::set_name(const char_t* rhs) // Copies rhs into the node name; only pi, declaration and element nodes accept a name, all others return false.
+	{
+		switch (type()) // type() yields node_null for a null handle, which lands in default
+		{
+		case node_pi:
+		case node_declaration:
+		case node_element:
+			return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+		default:
+			return false;
+		}
+	}
+		
+	PUGI__FN bool xml_node::set_value(const char_t* rhs) // Copies rhs into the node value; only pi, cdata, pcdata, comment and doctype nodes accept a value, all others return false.
+	{
+		switch (type()) // type() yields node_null for a null handle, which lands in default
+		{
+		case node_pi:
+		case node_cdata:
+		case node_pcdata:
+		case node_comment:
+		case node_doctype:
+			return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+		default:
+			return false;
+		}
+	}
+
+	PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_) // Adds an attribute named name_ via impl::append_attribute_ll; only element and declaration nodes may carry attributes.
+	{
+		if (type() != node_element && type() != node_declaration) return xml_attribute();
+		
+		xml_attribute a(impl::append_attribute_ll(_root, impl::get_allocator(_root)));
+		a.set_name(name_); // result of set_name is not checked
+		
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_) // Allocates an attribute named name_ and links it in as the new first attribute; null handle on wrong node type or failed allocation.
+	{
+		if (type() != node_element && type() != node_declaration) return xml_attribute();
+		
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+		
+		xml_attribute_struct* head = _root->first_attribute;
+
+		if (head)
+		{
+			a._attr->prev_attribute_c = head->prev_attribute_c; // new head inherits the old head's back-link (the last attribute, see last_attribute)
+			head->prev_attribute_c = a._attr;
+		}
+		else
+			a._attr->prev_attribute_c = a._attr; // sole attribute: back-link points at itself
+		
+		a._attr->next_attribute = head;
+		_root->first_attribute = a._attr;
+				
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr) // Inserts a new attribute named name_ directly before attr; null handle if attr is empty, not owned by this node, or allocation fails.
+	{
+		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
+		
+		// check that attribute belongs to *this
+		xml_attribute_struct* cur = attr._attr;
+
+		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c; // walk back until the predecessor has a null next_attribute, i.e. cur is the list head
+
+		if (cur != _root->first_attribute) return xml_attribute();
+
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+
+		if (attr._attr->prev_attribute_c->next_attribute) // attr has a real predecessor: splice after it
+			attr._attr->prev_attribute_c->next_attribute = a._attr;
+		else
+			_root->first_attribute = a._attr; // attr was the head: the new attribute becomes the head
+		
+		a._attr->prev_attribute_c = attr._attr->prev_attribute_c;
+		a._attr->next_attribute = attr._attr;
+		attr._attr->prev_attribute_c = a._attr;
+				
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr) // Inserts a new attribute named name_ directly after attr; null handle if attr is empty, not owned by this node, or allocation fails.
+	{
+		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
+		
+		// check that attribute belongs to *this
+		xml_attribute_struct* cur = attr._attr;
+
+		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c; // walk back until the predecessor has a null next_attribute, i.e. cur is the list head
+
+		if (cur != _root->first_attribute) return xml_attribute();
+
+		xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+		if (!a) return xml_attribute();
+
+		a.set_name(name_);
+
+		if (attr._attr->next_attribute)
+			attr._attr->next_attribute->prev_attribute_c = a._attr;
+		else
+			_root->first_attribute->prev_attribute_c = a._attr; // attr was last: the head's back-link now points at the new last attribute
+		
+		a._attr->next_attribute = attr._attr->next_attribute;
+		a._attr->prev_attribute_c = attr._attr;
+		attr._attr->next_attribute = a._attr;
+
+		return a;
+	}
+
+	PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto) // Appends a new attribute with proto's name and value; null handle if proto is null.
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = append_attribute(proto.name());
+		result.set_value(proto.value()); // result may be a null handle here; the set_value outcome is not checked
+
+		return result;
+	}
+
+	PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto) // Prepends a new attribute with proto's name and value; null handle if proto is null.
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = prepend_attribute(proto.name());
+		result.set_value(proto.value()); // result may be a null handle here; the set_value outcome is not checked
+
+		return result;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr) // Inserts a copy of proto (name and value) after attr; null handle if proto is null.
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = insert_attribute_after(proto.name(), attr);
+		result.set_value(proto.value()); // result may be a null handle here; the set_value outcome is not checked
+
+		return result;
+	}
+
+	PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr) // Inserts a copy of proto (name and value) before attr; null handle if proto is null.
+	{
+		if (!proto) return xml_attribute();
+
+		xml_attribute result = insert_attribute_before(proto.name(), attr);
+		result.set_value(proto.value()); // result may be a null handle here; the set_value outcome is not checked
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::append_child(xml_node_type type_) // Appends a child of the given type via impl::append_node; null handle if impl::allow_insert_child rejects the combination.
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		
+		xml_node n(impl::append_node(_root, impl::get_allocator(_root), type_));
+
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); // declaration nodes are named "xml" on creation
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_) // Allocates a child of the given type and links it in as the new first child; null handle if disallowed or allocation fails.
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		
+		xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+		if (!n) return xml_node();
+
+		n._root->parent = _root;
+
+		xml_node_struct* head = _root->first_child;
+
+		if (head)
+		{
+			n._root->prev_sibling_c = head->prev_sibling_c; // new head inherits the old head's back-link (the last child, see last_child)
+			head->prev_sibling_c = n._root;
+		}
+		else
+			n._root->prev_sibling_c = n._root; // only child: back-link points at itself
+		
+		n._root->next_sibling = head;
+		_root->first_child = n._root;
+				
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); // declaration nodes are named "xml" on creation
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node) // Inserts a new child of the given type directly before node; null handle if disallowed, node is not a child of this node, or allocation fails.
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		if (!node._root || node._root->parent != _root) return xml_node(); // node must be a direct child of *this
+	
+		xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+		if (!n) return xml_node();
+
+		n._root->parent = _root;
+		
+		if (node._root->prev_sibling_c->next_sibling) // node has a real predecessor: splice after it
+			node._root->prev_sibling_c->next_sibling = n._root;
+		else
+			_root->first_child = n._root; // node was the first child: the new node becomes first
+		
+		n._root->prev_sibling_c = node._root->prev_sibling_c;
+		n._root->next_sibling = node._root;
+		node._root->prev_sibling_c = n._root;
+
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); // declaration nodes are named "xml" on creation
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node) // Inserts a new child of the given type directly after node; null handle if disallowed, node is not a child of this node, or allocation fails.
+	{
+		if (!impl::allow_insert_child(this->type(), type_)) return xml_node();
+		if (!node._root || node._root->parent != _root) return xml_node(); // node must be a direct child of *this
+	
+		xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+		if (!n) return xml_node();
+
+		n._root->parent = _root;
+	
+		if (node._root->next_sibling)
+			node._root->next_sibling->prev_sibling_c = n._root;
+		else
+			_root->first_child->prev_sibling_c = n._root; // node was last: the first child's back-link now points at the new last child
+		
+		n._root->next_sibling = node._root->next_sibling;
+		n._root->prev_sibling_c = node._root;
+		node._root->next_sibling = n._root;
+
+		if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); // declaration nodes are named "xml" on creation
+
+		return n;
+	}
+
+	PUGI__FN xml_node xml_node::append_child(const char_t* name_) // Appends an element child and sets its name to name_.
+	{
+		xml_node result = append_child(node_element);
+
+		result.set_name(name_); // returns false and does nothing if result is a null handle
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::prepend_child(const char_t* name_) // Prepends an element child and sets its name to name_.
+	{
+		xml_node result = prepend_child(node_element);
+
+		result.set_name(name_); // returns false and does nothing if result is a null handle
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node) // Inserts an element child after node and sets its name to name_.
+	{
+		xml_node result = insert_child_after(node_element, node);
+
+		result.set_name(name_); // returns false and does nothing if result is a null handle
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node) // Inserts an element child before node and sets its name to name_.
+	{
+		xml_node result = insert_child_before(node_element, node);
+
+		result.set_name(name_); // returns false and does nothing if result is a null handle
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::append_copy(const xml_node& proto) // Appends a child of proto's type and fills it from proto via impl::recursive_copy_skip.
+	{
+		xml_node result = append_child(proto.type());
+
+		if (result) impl::recursive_copy_skip(result, proto, result); // copy only when the child was actually created
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto) // Prepends a child of proto's type and fills it from proto via impl::recursive_copy_skip.
+	{
+		xml_node result = prepend_child(proto.type());
+
+		if (result) impl::recursive_copy_skip(result, proto, result); // copy only when the child was actually created
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node) // Inserts a child of proto's type after node and fills it from proto via impl::recursive_copy_skip.
+	{
+		xml_node result = insert_child_after(proto.type(), node);
+
+		if (result) impl::recursive_copy_skip(result, proto, result); // copy only when the child was actually created
+
+		return result;
+	}
+
+	PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node) // Inserts a child of proto's type before node and fills it from proto via impl::recursive_copy_skip.
+	{
+		xml_node result = insert_child_before(proto.type(), node);
+
+		if (result) impl::recursive_copy_skip(result, proto, result); // copy only when the child was actually created
+
+		return result;
+	}
+
+	PUGI__FN bool xml_node::remove_attribute(const char_t* name_) // Removes the first attribute named name_; false if there is no such attribute.
+	{
+		return remove_attribute(attribute(name_));
+	}
+
+	PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a) // Unlinks and destroys attribute a; false if either handle is null or a does not belong to this node.
+	{
+		if (!_root || !a._attr) return false;
+
+		// check that attribute belongs to *this
+		xml_attribute_struct* attr = a._attr;
+
+		while (attr->prev_attribute_c->next_attribute) attr = attr->prev_attribute_c; // walk back until the predecessor has a null next_attribute, i.e. attr is the list head
+
+		if (attr != _root->first_attribute) return false;
+
+		if (a._attr->next_attribute) a._attr->next_attribute->prev_attribute_c = a._attr->prev_attribute_c;
+		else if (_root->first_attribute) _root->first_attribute->prev_attribute_c = a._attr->prev_attribute_c; // removing the last attribute: the head's back-link moves to the new last one
+		
+		if (a._attr->prev_attribute_c->next_attribute) a._attr->prev_attribute_c->next_attribute = a._attr->next_attribute;
+		else _root->first_attribute = a._attr->next_attribute; // removing the head: the successor becomes the head
+
+		impl::destroy_attribute(a._attr, impl::get_allocator(_root));
+
+		return true;
+	}
+
+	PUGI__FN bool xml_node::remove_child(const char_t* name_) // Removes the first child named name_; false if there is no such child.
+	{
+		return remove_child(child(name_));
+	}
+
+	PUGI__FN bool xml_node::remove_child(const xml_node& n) // Unlinks and destroys child n; false if either handle is null or n is not a direct child of this node.
+	{
+		if (!_root || !n._root || n._root->parent != _root) return false;
+
+		if (n._root->next_sibling) n._root->next_sibling->prev_sibling_c = n._root->prev_sibling_c;
+		else if (_root->first_child) _root->first_child->prev_sibling_c = n._root->prev_sibling_c; // removing the last child: the first child's back-link moves to the new last one
+		
+		if (n._root->prev_sibling_c->next_sibling) n._root->prev_sibling_c->next_sibling = n._root->next_sibling;
+		else _root->first_child = n._root->next_sibling; // removing the first child: the successor becomes first
+		
+		impl::destroy_node(n._root, impl::get_allocator(_root));
+
+		return true;
+	}
+
+	PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const // First child named name_ that has an attribute attr_name equal to attr_value; null handle if none.
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			if (i->name && impl::strequal(name_, i->name)) // children without a name are skipped
+			{
+				for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+					if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT(""))) // never hand a null pointer to strequal: unnamed attributes are skipped, a missing value compares as ""
+						return xml_node(i);
+			}
+
+		return xml_node();
+	}
+
+	PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const // First child (of any name) that has an attribute attr_name equal to attr_value; null handle if none.
+	{
+		if (!_root) return xml_node();
+		
+		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+			for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+				if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT(""))) // never hand a null pointer to strequal: unnamed attributes are skipped, a missing value compares as ""
+					return xml_node(i);
+
+		return xml_node();
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN string_t xml_node::path(char_t delimiter) const // Builds a delimiter-separated string of node names from the topmost ancestor down to this node.
+	{
+		xml_node cursor = *this; // Make a copy.
+		
+		string_t result = cursor.name();
+
+		while (cursor.parent()) // climb until there is no parent
+		{
+			cursor = cursor.parent();
+			
+			string_t temp = cursor.name();
+			temp += delimiter;
+			temp += result;
+			result.swap(temp); // result is now "<ancestor name><delimiter><previous result>"
+		}
+
+		return result;
+	}
+#endif
+
+	PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const // Resolves a delimiter-separated path ("." and ".." supported) relative to this node, one segment per recursive call; null handle if nothing matches.
+	{
+		xml_node found = *this; // Current search context.
+
+		if (!_root || !path_ || !path_[0]) return found; // null handle or empty path resolves to this node itself
+
+		if (path_[0] == delimiter)
+		{
+			// Absolute path; e.g. '/foo/bar'
+			found = found.root();
+			++path_;
+		}
+
+		const char_t* path_segment = path_;
+
+		while (*path_segment == delimiter) ++path_segment; // skip repeated delimiters
+
+		const char_t* path_segment_end = path_segment;
+
+		while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
+
+		if (path_segment == path_segment_end) return found; // nothing left but delimiters
+
+		const char_t* next_segment = path_segment_end;
+
+		while (*next_segment == delimiter) ++next_segment;
+
+		if (*path_segment == '.' && path_segment + 1 == path_segment_end) // segment is exactly "."
+			return found.first_element_by_path(next_segment, delimiter);
+		else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end) // segment is exactly ".."
+			return found.parent().first_element_by_path(next_segment, delimiter);
+		else
+		{
+			for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
+			{
+				if (j->name && impl::strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
+				{
+					xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
+
+					if (subsearch) return subsearch; // otherwise keep trying later children with the same name
+				}
+			}
+
+			return xml_node();
+		}
+	}
+
+	PUGI__FN bool xml_node::traverse(xml_tree_walker& walker) // Depth-first walk below this node: calls walker.begin once, walker.for_each per descendant, then walker.end; returns false as soon as begin or for_each returns false.
+	{
+		walker._depth = -1;
+		
+		xml_node arg_begin = *this; // the callbacks are handed a copy of the handle
+		if (!walker.begin(arg_begin)) return false;
+
+		xml_node cur = first_child();
+				
+		if (cur)
+		{
+			++walker._depth;
+
+			do 
+			{
+				xml_node arg_for_each = cur;
+				if (!walker.for_each(arg_for_each))
+					return false;
+						
+				if (cur.first_child()) // descend first
+				{
+					++walker._depth;
+					cur = cur.first_child();
+				}
+				else if (cur.next_sibling())
+					cur = cur.next_sibling();
+				else
+				{
+					// Borland C++ workaround
+					while (!cur.next_sibling() && cur != *this && !cur.parent().empty()) // climb until an ancestor has a next sibling or the start node is reached
+					{
+						--walker._depth;
+						cur = cur.parent();
+					}
+						
+					if (cur != *this)
+						cur = cur.next_sibling();
+				}
+			}
+			while (cur && cur != *this);
+		}
+
+		assert(walker._depth == -1); // depth must be back at its starting value after a complete walk
+
+		xml_node arg_end = *this;
+		return walker.end(arg_end);
+	}
+
+	PUGI__FN size_t xml_node::hash_value() const // Hash derived from the _root address divided by sizeof(xml_node_struct).
+	{
+		return static_cast<size_t>(reinterpret_cast<uintptr_t>(_root) / sizeof(xml_node_struct));
+	}
+
+	PUGI__FN xml_node_struct* xml_node::internal_object() const // Exposes the raw node structure pointer (null for a null handle).
+	{
+		return _root;
+	}
+
+	PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const // Writes this node to writer through a buffered writer; does nothing for a null handle.
+	{
+		if (!_root) return;
+
+		impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+		impl::node_output(buffered_writer, *this, indent, flags, depth);
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const // Narrow-stream overload: wraps stream in xml_writer_stream and forwards to print(xml_writer&, ...).
+	{
+		xml_writer_stream writer(stream);
+
+		print(writer, indent, flags, encoding, depth);
+	}
+
+	PUGI__FN void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const // Wide-stream overload: wraps stream in xml_writer_stream and forwards with the encoding fixed to encoding_wchar.
+	{
+		xml_writer_stream writer(stream);
+
+		print(writer, indent, flags, encoding_wchar, depth);
+	}
+#endif
+
+	PUGI__FN ptrdiff_t xml_node::offset_debug() const // Offset of this node's name or value from the start of the document buffer, or -1 when that cannot be determined.
+	{
+		xml_node_struct* r = root()._root;
+
+		if (!r) return -1; // null handle
+
+		const char_t* buffer = static_cast<impl::xml_document_struct*>(r)->buffer;
+
+		if (!buffer) return -1; // the document has no buffer to measure against
+
+		switch (type())
+		{
+		case node_document:
+			return 0;
+
+		case node_element:
+		case node_declaration:
+		case node_pi:
+			return (_root->name && !(_root->header & impl::xml_memory_page_name_allocated_mask)) ? _root->name - buffer : -1; // a null or separately allocated name has no offset in the buffer
+
+		case node_pcdata:
+		case node_cdata:
+		case node_comment:
+		case node_doctype:
+			return (_root->value && !(_root->header & impl::xml_memory_page_value_allocated_mask)) ? _root->value - buffer : -1; // a null or separately allocated value has no offset in the buffer
+
+		default:
+			return -1;
+		}
+	}
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xml_node& lhs, bool rhs) // Compiled only under __BORLANDC__: logical AND of the handle's bool conversion with rhs.
+	{
+		return (bool)lhs && rhs;
+	}
+
+	PUGI__FN bool operator||(const xml_node& lhs, bool rhs) // Compiled only under __BORLANDC__: logical OR of the handle's bool conversion with rhs.
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root) // Wraps a node structure pointer; root may be null.
+	{
+	}
+
+	PUGI__FN xml_node_struct* xml_text::_data() const // Finds the node holding the text: _root itself if it is a text node (or null), else its first text-node child, else 0.
+	{
+		if (!_root || impl::is_text_node(_root)) return _root;
+
+		for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
+			if (impl::is_text_node(node))
+				return node;
+
+		return 0;
+	}
+
+	PUGI__FN xml_node_struct* xml_text::_data_new() // Like _data(), but appends a new pcdata child when no text node exists yet; may still return null if the append fails.
+	{
+		xml_node_struct* d = _data();
+		if (d) return d;
+
+		return xml_node(_root).append_child(node_pcdata).internal_object();
+	}
+
+	PUGI__FN xml_text::xml_text(): _root(0) // Default constructor: no underlying node.
+	{
+	}
+
+	PUGI__FN static void unspecified_bool_xml_text(xml_text***) // Intentionally empty: only its address is used, as the "true" result of xml_text's unspecified_bool_type conversion.
+	{
+	}
+
+	PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const // Boolean-context conversion: non-null when _data() finds a text node, 0 otherwise.
+	{
+		return _data() ? unspecified_bool_xml_text : 0;
+	}
+
+	PUGI__FN bool xml_text::operator!() const // True when _data() finds no text node.
+	{
+		return !_data();
+	}
+
+	PUGI__FN bool xml_text::empty() const // True when _data() finds no text node.
+	{
+		return _data() == 0;
+	}
+
+	PUGI__FN const char_t* xml_text::get() const // Text value, or an empty string when there is no text node or its value is null (never returns null).
+	{
+		xml_node_struct* d = _data();
+
+		return (d && d->value) ? d->value : PUGIXML_TEXT("");
+	}
+
+	PUGI__FN const char_t* xml_text::as_string(const char_t* def) const // Text value, or def when there is no text node or its value is null.
+	{
+		xml_node_struct* d = _data();
+
+		return (d && d->value) ? d->value : def;
+	}
+
+	PUGI__FN int xml_text::as_int(int def) const // Text value converted by impl::get_value_int, with def passed along as the fallback.
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_int(d ? d->value : 0, def); // a missing text node is passed as a null string
+	}
+
+	PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const // Text value converted by impl::get_value_uint, with def passed along as the fallback.
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_uint(d ? d->value : 0, def); // a missing text node is passed as a null string
+	}
+
+	PUGI__FN double xml_text::as_double(double def) const // Text value converted by impl::get_value_double, with def passed along as the fallback.
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_double(d ? d->value : 0, def); // a missing text node is passed as a null string
+	}
+
+	PUGI__FN float xml_text::as_float(float def) const // Text value converted by impl::get_value_float, with def passed along as the fallback.
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_float(d ? d->value : 0, def); // a missing text node is passed as a null string
+	}
+
+	PUGI__FN bool xml_text::as_bool(bool def) const // Text value converted by impl::get_value_bool, with def passed along as the fallback.
+	{
+		xml_node_struct* d = _data();
+
+		return impl::get_value_bool(d ? d->value : 0, def); // a missing text node is passed as a null string
+	}
+
+	PUGI__FN bool xml_text::set(const char_t* rhs) // Copies rhs into the text node, creating one via _data_new() if needed; false when no node is available.
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(int rhs) // Sets the text from an int, creating a text node via _data_new() if needed; false when no node is available.
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(unsigned int rhs) // Sets the text from an unsigned int, creating a text node via _data_new() if needed; false when no node is available.
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(double rhs) // Sets the text from a double, creating a text node via _data_new() if needed; false when no node is available.
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN bool xml_text::set(bool rhs) // Sets the text from a bool, creating a text node via _data_new() if needed; false when no node is available.
+	{
+		xml_node_struct* dn = _data_new();
+
+		return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(const char_t* rhs) // Assignment form of set(rhs); the success flag from set() is discarded.
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(int rhs) // Assignment form of set(rhs); the success flag from set() is discarded.
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(unsigned int rhs) // Assignment form of set(rhs); the success flag from set() is discarded.
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(double rhs) // Assignment form of set(rhs); the success flag from set() is discarded.
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_text& xml_text::operator=(bool rhs) // Assignment form of set(rhs); the success flag from set() is discarded.
+	{
+		set(rhs);
+		return *this;
+	}
+
+	PUGI__FN xml_node xml_text::data() const // Node handle for the text node found by _data(); null handle if there is none.
+	{
+		return xml_node(_data());
+	}
+
+#ifdef __BORLANDC__
+	PUGI__FN bool operator&&(const xml_text& lhs, bool rhs) // Compiled only under __BORLANDC__: logical AND of the object's bool conversion with rhs.
+	{
+		return (bool)lhs && rhs;
+	}
+
+	PUGI__FN bool operator||(const xml_text& lhs, bool rhs) // Compiled only under __BORLANDC__: logical OR of the object's bool conversion with rhs.
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	PUGI__FN xml_node_iterator::xml_node_iterator() // Default constructor: members are default-constructed.
+	{
+	}
+
+	PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent()) // Iterator positioned at node; the parent is taken from node.parent().
+	{
+	}
+
+	PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) // Iterator from raw pointers; ref may be null for a past-the-end position (as xml_node::end() does).
+	{
+	}
+
+	PUGI__FN bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const // Equal when both the current node and the parent pointers match.
+	{
+		return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
+	}
+	
+	PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const // Unequal when the current node or the parent pointer differs.
+	{
+		return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
+	}
+
+	PUGI__FN xml_node& xml_node_iterator::operator*() const // Reference to the current node handle; asserts the iterator is not past-the-end.
+	{
+		assert(_wrap._root);
+		return _wrap;
+	}
+
+	PUGI__FN xml_node* xml_node_iterator::operator->() const // Pointer to the current node handle; asserts the iterator is not past-the-end.
+	{
+		assert(_wrap._root);
+		return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+	}
+
+	PUGI__FN const xml_node_iterator& xml_node_iterator::operator++() // Pre-increment: moves to next_sibling (null after the last child); asserts the iterator is dereferenceable.
+	{
+		assert(_wrap._root);
+		_wrap._root = _wrap._root->next_sibling;
+		return *this;
+	}
+
+	PUGI__FN xml_node_iterator xml_node_iterator::operator++(int) // Post-increment: advances and returns a copy of the previous position.
+	{
+		xml_node_iterator temp = *this;
+		++*this;
+		return temp;
+	}
+
+	PUGI__FN const xml_node_iterator& xml_node_iterator::operator--() // Pre-decrement: moves to the previous sibling, or from past-the-end to the parent's last child.
+	{
+		_wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child();
+		return *this;
+	}
+
+	PUGI__FN xml_node_iterator xml_node_iterator::operator--(int) // Post-decrement: steps back and returns a copy of the previous position.
+	{
+		xml_node_iterator temp = *this;
+		--*this;
+		return temp;
+	}
+
+	PUGI__FN xml_attribute_iterator::xml_attribute_iterator() // Default constructor: members are default-constructed.
+	{
+	}
+
+	PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent) // Iterator positioned at attr, with the owning node supplied by the caller.
+	{
+	}
+
+	PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) // Iterator from raw pointers; ref may be null for a past-the-end position (as xml_node::attributes_end() does).
+	{
+	}
+
+	PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const // Equal when both the current attribute and the parent pointers match.
+	{
+		return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root;
+	}
+	
+	PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const // Unequal when the current attribute or the parent pointer differs.
+	{
+		return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root;
+	}
+
+	PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const // Reference to the current attribute handle; asserts the iterator is not past-the-end.
+	{
+		assert(_wrap._attr);
+		return _wrap;
+	}
+
+	PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const // Pointer to the current attribute handle; asserts the iterator is not past-the-end.
+	{
+		assert(_wrap._attr);
+		return const_cast<xml_attribute*>(&_wrap); // BCC32 workaround
+	}
+
+	PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++() // Pre-increment: moves to next_attribute (null after the last one); asserts the iterator is dereferenceable.
+	{
+		assert(_wrap._attr);
+		_wrap._attr = _wrap._attr->next_attribute;
+		return *this;
+	}
+
+	PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int) // Post-increment: advances and returns a copy of the previous position.
+	{
+		xml_attribute_iterator temp = *this;
+		++*this;
+		return temp;
+	}
+
+	PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--() // Pre-decrement: moves to the previous attribute, or from past-the-end to the parent's last attribute.
+	{
+		_wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute();
+		return *this;
+	}
+
+	PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int) // Post-decrement: steps back and returns a copy of the previous position.
+	{
+		xml_attribute_iterator temp = *this;
+		--*this;
+		return temp;
+	}
+
+	PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0) // Default constructor: no name filter; xml_node::children(name) uses this as the end iterator.
+	{
+	}
+
+	PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _node(node), _name(name) // Iterator positioned at node; only the name pointer is stored, the string is not copied.
+	{
+	}
+
+	PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const // Compares the current nodes only; the name filter is not part of the comparison.
+	{
+		return _node == rhs._node;
+	}
+
+	PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const // Compares the current nodes only; the name filter is not part of the comparison.
+	{
+		return _node != rhs._node;
+	}
+
+	PUGI__FN xml_node& xml_named_node_iterator::operator*() const // Reference to the current node handle; asserts the iterator is not at the end.
+	{
+		assert(_node._root);
+		return _node;
+	}
+
+	PUGI__FN xml_node* xml_named_node_iterator::operator->() const // Pointer to the current node handle; asserts the iterator is not at the end.
+	{
+		assert(_node._root);
+		return const_cast<xml_node*>(&_node); // BCC32 workaround
+	}
+
+	PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++() // Pre-increment: moves to the next sibling with the stored name; becomes a null node when none is left.
+	{
+		assert(_node._root);
+		_node = _node.next_sibling(_name);
+		return *this;
+	}
+
+	PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int) // Post-increment: advances and returns a copy of the previous position.
+	{
+		xml_named_node_iterator temp = *this;
+		++*this;
+		return temp;
+	}
+
+	PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto) // Default result: status_internal_error, offset 0, encoding_auto.
+	{
+	}
+
+	PUGI__FN xml_parse_result::operator bool() const // True only when status is status_ok.
+	{
+		return status == status_ok;
+	}
+
+	PUGI__FN const char* xml_parse_result::description() const // Maps status to a fixed English message; unlisted status values give "Unknown error".
+	{
+		switch (status)
+		{
+		case status_ok: return "No error";
+
+		case status_file_not_found: return "File was not found";
+		case status_io_error: return "Error reading from file/stream";
+		case status_out_of_memory: return "Could not allocate memory";
+		case status_internal_error: return "Internal error occurred";
+
+		case status_unrecognized_tag: return "Could not determine tag type";
+
+		case status_bad_pi: return "Error parsing document declaration/processing instruction";
+		case status_bad_comment: return "Error parsing comment";
+		case status_bad_cdata: return "Error parsing CDATA section";
+		case status_bad_doctype: return "Error parsing document type declaration";
+		case status_bad_pcdata: return "Error parsing PCDATA section";
+		case status_bad_start_element: return "Error parsing start element tag";
+		case status_bad_attribute: return "Error parsing element attribute";
+		case status_bad_end_element: return "Error parsing end element tag";
+		case status_end_element_mismatch: return "Start-end tags mismatch";
+
+		default: return "Unknown error";
+		}
+	}
+
+	PUGI__FN xml_document::xml_document(): _buffer(0) // Constructs a document with no owned buffer, then builds the root via create().
+	{
+		create();
+	}
+
+	PUGI__FN xml_document::~xml_document() // Releases everything the document owns via destroy().
+	{
+		destroy();
+	}
+
+	PUGI__FN void xml_document::reset() // Discards all content and rebuilds a fresh root: destroy() followed by create().
+	{
+		destroy();
+		create();
+	}
+
+	PUGI__FN void xml_document::reset(const xml_document& proto) // Replaces this document's content with copies of proto's top-level nodes.
+	{
+		reset(); // NOTE(review): if proto is *this, this call empties it first and the loop below then copies nothing
+
+		for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling())
+			append_copy(cur);
+	}
+
+	PUGI__FN void xml_document::create() // Builds the first memory page and the document root inside the embedded _memory array.
+	{
+		// initialize sentinel page
+		PUGI__STATIC_ASSERT(offsetof(impl::xml_memory_page, data) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment <= sizeof(_memory));
+
+		// align upwards to page boundary
+		void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1));
+
+		// prepare page structure
+		impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory);
+
+		page->busy_size = impl::xml_memory_page_size;
+
+		// allocate new root
+		_root = new (page->data) impl::xml_document_struct(page); // placement new: the root lives in the page's data area
+		_root->prev_sibling_c = _root; // the root's back-link points at itself
+
+		// setup sentinel page
+		page->allocator = static_cast<impl::xml_document_struct*>(_root);
+	}
+
+	PUGI__FN void xml_document::destroy() // Frees the owned buffer and every page after the root page, then resets the root page and clears _root.
+	{
+		// destroy static storage
+		if (_buffer)
+		{
+			impl::xml_memory::deallocate(_buffer);
+			_buffer = 0;
+		}
+
+		// destroy dynamic storage, leave sentinel page (it's in static memory)
+		if (_root)
+		{
+			impl::xml_memory_page* root_page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask); // the header's pointer bits hold the address of the root's page
+			assert(root_page && !root_page->prev && !root_page->memory);
+
+			// destroy all pages
+			for (impl::xml_memory_page* page = root_page->next; page; )
+			{
+				impl::xml_memory_page* next = page->next; // read the link before the page is released
+
+				impl::xml_allocator::deallocate_page(page);
+
+				page = next;
+			}
+
+			// cleanup root page
+			root_page->allocator = 0;
+			root_page->next = 0;
+			root_page->busy_size = root_page->freed_size = 0;
+
+			_root = 0;
+		}
+	}
+
+#ifndef PUGIXML_NO_STL
+	PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding) // Clears the document, then loads it from a narrow stream via impl::load_stream_impl.
+	{
+		reset();
+
+		return impl::load_stream_impl(*this, stream, options, encoding);
+	}
+
+	// Discards the current contents, then loads the document from a wide-character stream.
+	// The encoding is fixed to encoding_wchar; options is forwarded unchanged.
+	PUGI__FN xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
+	{
+		reset();
+
+		return impl::load_stream_impl(*this, stream, options, encoding_wchar);
+	}
+#endif
+
+	// Parses a zero-terminated string that is already in the library's native char_t encoding.
+	PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
+	{
+		// the input is native, so pin the encoding rather than letting the loader autodetect it
+	#ifdef PUGIXML_WCHAR_MODE
+		const xml_encoding native_encoding = encoding_wchar;
+	#else
+		const xml_encoding native_encoding = encoding_utf8;
+	#endif
+
+		// load_buffer expects the size in bytes, not in characters
+		const size_t size_in_bytes = impl::strlength(contents) * sizeof(char_t);
+
+		return load_buffer(contents, size_in_bytes, options, native_encoding);
+	}
+
+	// Discards the current contents, opens path_ for binary reading and loads the document from it.
+	// NOTE(review): the fopen result is not checked here; impl::load_file_impl is presumably
+	// responsible for handling a null FILE* and for closing the file - confirm.
+	PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		FILE* file = fopen(path_, "rb");
+
+		return impl::load_file_impl(*this, file, options, encoding);
+	}
+
+	// Wide-character path variant of load_file: the file is opened through impl::open_file_wide
+	// in binary read mode and then loaded exactly like the narrow-path overload.
+	// NOTE(review): the open result is not checked here; impl::load_file_impl presumably handles
+	// a null FILE* - confirm.
+	PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding)
+	{
+		reset();
+
+		FILE* file = impl::open_file_wide(path_, L"rb");
+
+		return impl::load_file_impl(*this, file, options, encoding);
+	}
+
+	// Common implementation behind load_buffer / load_buffer_inplace / load_buffer_inplace_own.
+	//   contents, size - input bytes (contents may be null only when size is 0)
+	//   encoding       - requested encoding; the actual one is resolved by impl::get_buffer_encoding
+	//   is_mutable     - forwarded to impl::convert_buffer (the in-place variants pass true)
+	//   own            - true when the document takes over ownership of contents
+	// Returns the parse result with its encoding field set to the resolved buffer encoding,
+	// or status_out_of_memory if the buffer conversion fails.
+	PUGI__FN xml_parse_result xml_document::load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own)
+	{
+		reset();
+
+		// check input buffer
+		assert(contents || size == 0);
+
+		// get actual encoding
+		xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size);
+
+		// get private buffer
+		char_t* buffer = 0;
+		size_t length = 0;
+
+		if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory);
+		
+		// delete original buffer if we performed a conversion
+		// (only when we own it; a differing buffer pointer means convert_buffer produced a new one)
+		if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents);
+
+		// parse
+		xml_parse_result res = impl::xml_parser::parse(buffer, length, _root, options);
+
+		// remember encoding
+		res.encoding = buffer_encoding;
+
+		// grab onto buffer if it's our buffer, user is responsible for deallocating contents himself
+		// (_buffer is what destroy() later deallocates)
+		if (own || buffer != contents) _buffer = buffer;
+
+		return res;
+	}
+
+	// Loads the document from a read-only memory buffer; the buffer is passed on as
+	// neither mutable nor owned (is_mutable = false, own = false).
+	PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		// the const_cast only adapts to the shared signature; is_mutable = false is passed alongside
+		return load_buffer_impl(const_cast<void*>(contents), size, options, encoding, false, false);
+	}
+
+	// Loads the document from a caller-owned buffer that is passed on as mutable
+	// (is_mutable = true, own = false), so the caller remains responsible for freeing it.
+	PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		return load_buffer_impl(contents, size, options, encoding, true, false);
+	}
+		
+	// Loads the document from a mutable buffer and takes ownership of it (is_mutable = true, own = true).
+	// load_buffer_impl releases owned memory through impl::xml_memory::deallocate.
+	// NOTE(review): contents is therefore presumably expected to come from the matching allocation function - confirm.
+	PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+	{
+		return load_buffer_impl(contents, size, options, encoding, true, true);
+	}
+
+	// Serializes the document through writer: an optional byte order mark, an optional
+	// default XML declaration, and finally the node tree itself.
+	PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		impl::xml_buffered_writer out(writer, encoding);
+
+		// the BOM is written on request, except for latin1 output
+		if (flags & format_write_bom)
+		{
+			if (encoding != encoding_latin1)
+			{
+				// BOM always represents the codepoint U+FEFF, so just write it in native encoding
+			#ifdef PUGIXML_WCHAR_MODE
+				unsigned int bom = 0xfeff;
+				out.write(static_cast<wchar_t>(bom));
+			#else
+				out.write('\xef', '\xbb', '\xbf');
+			#endif
+			}
+		}
+
+		// emit a default declaration unless it is suppressed by flag or the document already has one
+		if (!(flags & format_no_declaration))
+		{
+			if (!impl::has_declaration(*this))
+			{
+				out.write(PUGIXML_TEXT("<?xml version=\"1.0\""));
+
+				if (encoding == encoding_latin1)
+					out.write(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
+
+				out.write('?', '>');
+
+				// raw output gets no line break after the declaration
+				if (!(flags & format_raw))
+					out.write('\n');
+			}
+		}
+
+		impl::node_output(out, *this, indent, flags, 0);
+	}
+
+#ifndef PUGIXML_NO_STL
+	// Saves the document to a narrow-character stream by wrapping it in an xml_writer_stream
+	// and delegating to save(xml_writer&, ...).
+	PUGI__FN void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		xml_writer_stream writer(stream);
+
+		save(writer, indent, flags, encoding);
+	}
+
+	// Saves the document to a wide-character stream; the output encoding is fixed to encoding_wchar.
+	PUGI__FN void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
+	{
+		xml_writer_stream writer(stream);
+
+		save(writer, indent, flags, encoding_wchar);
+	}
+#endif
+
+	// Saves the document to the file at path_ and returns the result of impl::save_file_impl.
+	// The file is opened in text mode ("w") when format_save_file_text is set, binary ("wb") otherwise.
+	// NOTE(review): the fopen result is not checked here; impl::save_file_impl presumably handles
+	// a null FILE* and closes the file - confirm.
+	PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb");
+		return impl::save_file_impl(*this, file, indent, flags, encoding);
+	}
+
+	// Wide-character path variant of save_file: opens the file through impl::open_file_wide
+	// (text mode when format_save_file_text is set, binary otherwise) and delegates to impl::save_file_impl.
+	PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+	{
+		FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb");
+		return impl::save_file_impl(*this, file, indent, flags, encoding);
+	}
+
+	// Returns the first top-level child whose type is node_element, or an empty node if there is none.
+	PUGI__FN xml_node xml_document::document_element() const
+	{
+		xml_node_struct* child = _root->first_child;
+
+		while (child)
+		{
+			// the header's type bits hold the node type minus one, hence the + 1 before comparing
+			if ((child->header & impl::xml_memory_page_type_mask) + 1 == node_element)
+				return xml_node(child);
+
+			child = child->next_sibling;
+		}
+
+		// no element among the top-level nodes
+		return xml_node();
+	}
+
+#ifndef PUGIXML_NO_STL
+	// Converts a zero-terminated wide string to a std::string via impl::as_utf8_impl.
+	// str must not be null (asserted).
+	PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
+	{
+		assert(str);
+
+		return impl::as_utf8_impl(str, wcslen(str));
+	}
+
+	// Converts a wide std::basic_string to a std::string via impl::as_utf8_impl, using the string's own size.
+	PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t>& str)
+	{
+		return impl::as_utf8_impl(str.c_str(), str.size());
+	}
+	
+	// Converts a zero-terminated narrow string to a wide string via impl::as_wide_impl.
+	// str must not be null (asserted).
+	PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const char* str)
+	{
+		assert(str);
+
+		return impl::as_wide_impl(str, strlen(str));
+	}
+	
+	// Converts a std::string to a wide string via impl::as_wide_impl, using the string's own size.
+	PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const std::string& str)
+	{
+		return impl::as_wide_impl(str.c_str(), str.size());
+	}
+#endif
+
+	// Installs the allocation/deallocation function pair stored in impl::xml_memory.
+	// The two pointers are stored as given; no validation is performed here.
+	PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
+	{
+		impl::xml_memory::allocate = allocate;
+		impl::xml_memory::deallocate = deallocate;
+	}
+
+	// Returns the allocation function currently stored in impl::xml_memory.
+	PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
+	{
+		return impl::xml_memory::allocate;
+	}
+
+	// Returns the deallocation function currently stored in impl::xml_memory.
+	PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
+	{
+		return impl::xml_memory::deallocate;
+	}
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+	// Each overload reports the iterator category of one pugi iterator type:
+	// node and attribute iterators are bidirectional, the named-node iterator is forward-only.
+	PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::forward_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&)
+	{
+		return std::forward_iterator_tag();
+	}
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection
+	// (this section is compiled only when __SUNPRO_CC is defined)
+	// Each overload reports the iterator category of one pugi iterator type:
+	// node and attribute iterators are bidirectional, the named-node iterator is forward-only.
+	PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&)
+	{
+		return std::bidirectional_iterator_tag();
+	}
+
+	PUGI__FN std::forward_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&)
+	{
+		return std::forward_iterator_tag();
+	}
+}
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+
+// STL replacements
+PUGI__NS_BEGIN
+	// Function object that compares two values with operator==.
+	struct equal_to
+	{
+		template <typename T> bool operator()(const T& a, const T& b) const
+		{
+			const bool same = (a == b);
+
+			return same;
+		}
+	};
+
+	// Function object that compares two values with operator!=.
+	struct not_equal_to
+	{
+		template <typename T> bool operator()(const T& a, const T& b) const
+		{
+			const bool differ = (a != b);
+
+			return differ;
+		}
+	};
+
+	// Function object that compares two values with operator<.
+	struct less
+	{
+		template <typename T> bool operator()(const T& a, const T& b) const
+		{
+			const bool before = (a < b);
+
+			return before;
+		}
+	};
+
+	// Function object that compares two values with operator<=.
+	struct less_equal
+	{
+		template <typename T> bool operator()(const T& a, const T& b) const
+		{
+			const bool not_after = (a <= b);
+
+			return not_after;
+		}
+	};
+
+	// Exchanges the values of lhs and rhs through a temporary copy.
+	template <typename T> void swap(T& lhs, T& rhs)
+	{
+		// keep the left value alive while it is being overwritten
+		T saved = lhs;
+
+		lhs = rhs;
+		rhs = saved;
+	}
+
+	// Returns an iterator to the first smallest element of [begin, end) according to pred,
+	// where pred(a, b) is true when a orders before b. Returns end for an empty range.
+	template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
+	{
+		// an empty range has no minimum; without this guard begin + 1 would step past end
+		// and the loop below would read out of bounds
+		if (begin == end) return end;
+
+		I result = begin;
+
+		for (I it = begin + 1; it != end; ++it)
+			if (pred(*it, *result))
+				result = it;
+
+		return result;
+	}
+
+	// Reverses the elements of [begin, end) in place by swapping from both ends towards the middle.
+	template <typename I> void reverse(I begin, I end)
+	{
+		while (begin + 1 < end)
+		{
+			// end is one past the right-hand element, so step it back first
+			--end;
+
+			swap(*begin, *end);
+
+			++begin;
+		}
+	}
+
+	// Collapses runs of consecutive equal elements in [begin, end) to their first element
+	// and returns the new past-the-end iterator.
+	template <typename I> I unique(I begin, I end)
+	{
+		// fast skip head: nothing needs to move while neighbours differ
+		while (begin + 1 < end && *begin != *(begin + 1)) begin++;
+
+		if (begin == end) return begin;
+
+		// write marks the last element that has been kept
+		I write = begin;
+		++begin;
+
+		// keep every element that differs from the last kept one
+		for (; begin != end; ++begin)
+		{
+			if (*begin != *write)
+			{
+				++write;
+				*write = *begin;
+			}
+		}
+
+		// past-the-end (write points to live element)
+		return write + 1;
+	}
+
+	// Copies [begin, end) so that the copy ends at target, walking from the last element to the first.
+	template <typename I> void copy_backwards(I begin, I end, I target)
+	{
+		while (begin != end)
+		{
+			// both cursors are past-the-end style, so step back before touching the element
+			--target;
+			--end;
+
+			*target = *end;
+		}
+	}
+
+	// Sorts the non-empty range [begin, end) with insertion sort; pred(a, b) is true when a orders before b.
+	// The trailing T* argument is unnamed and only serves to deduce the element type T.
+	template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
+	{
+		assert(begin != end);
+
+		for (I cur = begin + 1; cur != end; ++cur)
+		{
+			T pending = *cur;
+
+			// new minimum: shift the whole sorted prefix right by one and put it at the front
+			if (pred(pending, *begin))
+			{
+				copy_backwards(begin, cur, cur + 1);
+				*begin = pending;
+
+				continue;
+			}
+
+			// otherwise slide larger elements right until the slot for the pending value is found;
+			// the walk stops before reaching begin because *begin does not order after the pending value
+			I slot = cur;
+
+			for (; pred(pending, *(slot - 1)); --slot)
+				*slot = *(slot - 1);
+
+			*slot = pending;
+		}
+	}
+
+	// std variant for elements with ==
+	// std variant for elements with ==
+	// Partitions [begin, end) around the value at middle into three chunks (the caller in sort
+	// describes them as < = >) and reports the bounds of the chunk of elements equal to the pivot
+	// through out_eqbeg / out_eqend.
+	template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
+	{
+		// [eqbeg, eqend) is the current run of elements equal to the pivot; it starts as just the pivot
+		I eqbeg = middle, eqend = middle + 1;
+
+		// expand equal range
+		while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
+		while (eqend != end && *eqend == *eqbeg) ++eqend;
+
+		// process outer elements
+		// [ltend, eqbeg) and [eqend, gtbeg) have already been scanned on the left and right side
+		I ltend = eqbeg, gtbeg = eqend;
+
+		for (;;)
+		{
+			// find the element from the right side that belongs to the left one
+			for (; gtbeg != end; ++gtbeg)
+				if (!pred(*eqbeg, *gtbeg))
+				{
+					// elements equal to the pivot are absorbed into the equal range instead of stopping the scan
+					if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
+					else break;
+				}
+
+			// find the element from the left side that belongs to the right one
+			for (; ltend != begin; --ltend)
+				if (!pred(*(ltend - 1), *eqbeg))
+				{
+					if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
+					else break;
+				}
+
+			// scanned all elements
+			if (gtbeg == end && ltend == begin)
+			{
+				*out_eqbeg = eqbeg;
+				*out_eqend = eqend;
+				return;
+			}
+
+			// make room for elements by moving equal area
+			if (gtbeg == end)
+			{
+				// only the left side has a misplaced element: shift the equal range one step left
+				if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
+				swap(*eqbeg, *--eqend);
+			}
+			else if (ltend == begin)
+			{
+				// only the right side has a misplaced element: shift the equal range one step right
+				if (eqend != gtbeg) swap(*eqbeg, *eqend);
+				++eqend;
+				swap(*gtbeg++, *eqbeg++);
+			}
+			// both sides have a misplaced element: exchange them directly
+			else swap(*gtbeg++, *--ltend);
+		}
+	}
+
+	// Orders the three referenced elements so that the median of the three ends up at middle.
+	template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
+	{
+		// put the smaller of first/middle at first
+		if (pred(*middle, *first))
+		{
+			swap(*middle, *first);
+		}
+
+		// put the largest of the three at last
+		if (pred(*last, *middle))
+		{
+			swap(*last, *middle);
+		}
+
+		// the previous step may have moved a smaller value into middle; restore first/middle order
+		if (pred(*middle, *first))
+		{
+			swap(*middle, *first);
+		}
+	}
+
+	// Moves a pivot estimate to middle: a median of three for spans of at most 40 elements,
+	// a median of nine (median of three medians) for larger ones. last refers to the final element.
+	template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
+	{
+		// median of three for small chunks
+		if (last - first <= 40)
+		{
+			median3(first, middle, last, pred);
+			return;
+		}
+
+		// median of nine: three local medians near the front, the middle and the back ...
+		size_t step = (last - first + 1) / 8;
+
+		median3(first, first + step, first + 2 * step, pred);
+		median3(middle - step, middle, middle + step, pred);
+		median3(last - 2 * step, last - step, last, pred);
+
+		// ... combined into their median, which lands at middle
+		median3(first + step, middle, last - step, pred);
+	}
+
+	// Sorts [begin, end) with pred as the ordering: three-way quicksort on chunks of more
+	// than 32 elements, finished off with an insertion sort on the remaining small chunk.
+	template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
+	{
+		for (;;)
+		{
+			// small chunks are left to the insertion sort below
+			if (end - begin <= 32) break;
+
+			// find median element and move it to the middle position
+			I pivot = begin + (end - begin) / 2;
+			median(begin, pivot, end - 1, pred);
+
+			// partition in three chunks (< = >)
+			I equal_first, equal_last;
+			partition(begin, pivot, end, pred, &equal_first, &equal_last);
+
+			// recurse into the smaller side and keep looping on the larger one
+			const bool left_is_larger = (equal_first - begin > end - equal_last);
+
+			if (left_is_larger)
+			{
+				sort(equal_last, end, pred);
+				end = equal_first;
+			}
+			else
+			{
+				sort(begin, equal_first, pred);
+				begin = equal_last;
+			}
+		}
+
+		// insertion sort small chunk (the element pointer only conveys the value type)
+		if (begin != end) insertion_sort(begin, end, pred, &*begin);
+	}
+PUGI__NS_END
+
+// Allocator used for AST and evaluation stacks
+PUGI__NS_BEGIN
+	// One page of XPath allocator storage: a link to the next page plus a raw byte payload.
+	// The payload size is PUGIXML_MEMORY_XPATH_PAGE_SIZE when that macro is defined, 4096 bytes otherwise.
+	struct xpath_memory_block
+	{	
+		xpath_memory_block* next;
+
+		char data[
+	#ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE
+			PUGIXML_MEMORY_XPATH_PAGE_SIZE
+	#else
+			4096
+	#endif
+		];
+	};
+		
+	// Bump allocator over a singly linked list of xpath_memory_block pages.
+	// _root is the newest page and _root_size the number of bytes used in it; older pages
+	// are reachable through next. Individual objects are never freed: memory is given back
+	// in bulk by revert() or release().
+	class xpath_allocator
+	{
+		xpath_memory_block* _root;
+		size_t _root_size;
+
+	public:
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		// jump target used by allocate() to report out-of-memory when exceptions are disabled
+		jmp_buf* error_handler;
+	#endif
+
+		// root is the initial page; root_size is the number of bytes already used in it
+		xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			error_handler = 0;
+		#endif
+		}
+		
+		// Returns a block of at least size bytes, or 0 when the request cannot be satisfied
+		// (the underlying allocation failed or the size computation would overflow).
+		void* allocate_nothrow(size_t size)
+		{
+			const size_t block_capacity = sizeof(_root->data);
+
+			// align size so that we're able to store pointers in subsequent blocks
+			const size_t aligned_size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+			// rounding up wrapped around: serving the request would hand out too little memory
+			if (aligned_size < size) return 0;
+
+			size = aligned_size;
+
+			// does the request fit into the current page? (written so that the sum cannot wrap)
+			if (size <= block_capacity && _root_size <= block_capacity - size)
+			{
+				void* buf = _root->data + _root_size;
+				_root_size += size;
+				return buf;
+			}
+			else
+			{
+				// start a new page; oversized requests get a page of their own size
+				size_t block_data_size = (size > block_capacity) ? size : block_capacity;
+				size_t block_size = block_data_size + offsetof(xpath_memory_block, data);
+
+				// adding the page header wrapped around
+				if (block_size < block_data_size) return 0;
+
+				xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
+				if (!block) return 0;
+				
+				block->next = _root;
+				
+				_root = block;
+				_root_size = size;
+				
+				return block->data;
+			}
+		}
+
+		// Same as allocate_nothrow, but never returns 0: failure is reported with std::bad_alloc,
+		// or with longjmp through error_handler when PUGIXML_NO_EXCEPTIONS is defined.
+		void* allocate(size_t size)
+		{
+			void* result = allocate_nothrow(size);
+
+			if (!result)
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				assert(error_handler);
+				longjmp(*error_handler, 1);
+			#else
+				throw std::bad_alloc();
+			#endif
+			}
+
+			return result;
+		}
+
+		// Grows the most recently allocated object from old_size to new_size bytes and returns
+		// its (possibly moved) address. ptr may be 0, in which case this is a plain allocation.
+		void* reallocate(void* ptr, size_t old_size, size_t new_size)
+		{
+			// align size so that we're able to store pointers in subsequent blocks
+			old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+			new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+			// we can only reallocate the last object
+			assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
+
+			// adjust root size so that we have not allocated the object at all
+			bool only_object = (_root_size == old_size);
+
+			if (ptr) _root_size -= old_size;
+
+			// allocate a new version (this will obviously reuse the memory if possible)
+			void* result = allocate(new_size);
+			assert(result);
+
+			// we have a new block
+			if (result != ptr && ptr)
+			{
+				// copy old data
+				// (the aligned sizes may be equal: an object in an oversized page that is regrown
+				// within its alignment padding still moves to a fresh page)
+				assert(new_size >= old_size);
+				memcpy(result, ptr, old_size);
+
+				// free the previous page if it had no other objects
+				if (only_object)
+				{
+					assert(_root->data == result);
+					assert(_root->next);
+
+					xpath_memory_block* next = _root->next->next;
+
+					if (next)
+					{
+						// deallocate the whole page, unless it was the first one
+						xml_memory::deallocate(_root->next);
+						_root->next = next;
+					}
+				}
+			}
+
+			return result;
+		}
+
+		// Rolls the allocator back to a previously copied state, freeing every page added since.
+		void revert(const xpath_allocator& state)
+		{
+			// free all new pages
+			xpath_memory_block* cur = _root;
+
+			while (cur != state._root)
+			{
+				xpath_memory_block* next = cur->next;
+
+				xml_memory::deallocate(cur);
+
+				cur = next;
+			}
+
+			// restore state
+			_root = state._root;
+			_root_size = state._root_size;
+		}
+
+		// Frees every page except the last one in the chain (the one whose next is 0).
+		void release()
+		{
+			xpath_memory_block* cur = _root;
+			assert(cur);
+
+			while (cur->next)
+			{
+				xpath_memory_block* next = cur->next;
+
+				xml_memory::deallocate(cur);
+
+				cur = next;
+			}
+		}
+	};
+
+	// Scope guard for an xpath_allocator: the constructor copies the allocator's current state
+	// and the destructor reverts the allocator to that copy, freeing pages allocated in between.
+	struct xpath_allocator_capture
+	{
+		xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
+		{
+		}
+
+		~xpath_allocator_capture()
+		{
+			_target->revert(_state);
+		}
+
+		// allocator being guarded
+		xpath_allocator* _target;
+		// copy of the allocator taken at construction time
+		xpath_allocator _state;
+	};
+
+	// Pair of allocators handed around together.
+	// NOTE(review): judging by the names, result holds values that outlive a call and temp holds scratch data - confirm.
+	struct xpath_stack
+	{
+		xpath_allocator* result;
+		xpath_allocator* temp;
+	};
+
+	// Bundles two embedded memory pages, an allocator on top of each, and an xpath_stack pointing at both.
+	// The destructor releases every page the allocators added beyond the embedded ones.
+	struct xpath_stack_data
+	{
+		// embedded first pages: blocks[0] backs result, blocks[1] backs temp
+		xpath_memory_block blocks[2];
+		xpath_allocator result;
+		xpath_allocator temp;
+		xpath_stack stack;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		// shared jump buffer both allocators report out-of-memory through
+		jmp_buf error_handler;
+	#endif
+
+		xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+		{
+			// the embedded pages terminate their chains, which is where release() stops
+			blocks[0].next = blocks[1].next = 0;
+
+			stack.result = &result;
+			stack.temp = &temp;
+
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			result.error_handler = temp.error_handler = &error_handler;
+		#endif
+		}
+
+		~xpath_stack_data()
+		{
+			result.release();
+			temp.release();
+		}
+	};
+PUGI__NS_END
+
+// String class
+PUGI__NS_BEGIN
+	// Zero-terminated string used by the XPath code. The buffer is either borrowed
+	// (_uses_heap == false) or was obtained from an xpath_allocator (_uses_heap == true).
+	// The class never frees memory itself.
+	class xpath_string
+	{
+		const char_t* _buffer;
+		bool _uses_heap;
+
+		// Copies length characters of string into allocator memory and zero-terminates the copy.
+		static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
+		{
+			char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
+			assert(result);
+
+			memcpy(result, string, length * sizeof(char_t));
+			result[length] = 0;
+
+			return result;
+		}
+
+		// Copies a whole zero-terminated string into allocator memory.
+		static char_t* duplicate_string(const char_t* string, xpath_allocator* alloc)
+		{
+			return duplicate_string(string, strlength(string), alloc);
+		}
+
+	public:
+		// Empty string backed by a literal.
+		xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false)
+		{
+		}
+
+		// Copies str into allocator memory; an empty str is represented by a literal instead.
+		explicit xpath_string(const char_t* str, xpath_allocator* alloc)
+		{
+			bool empty_ = (*str == 0);
+
+			_buffer = empty_ ? PUGIXML_TEXT("") : duplicate_string(str, alloc);
+			_uses_heap = !empty_;
+		}
+
+		// Adopts str without copying; use_heap states whether it lives in allocator memory.
+		explicit xpath_string(const char_t* str, bool use_heap): _buffer(str), _uses_heap(use_heap)
+		{
+		}
+
+		// Copies the character range [begin, end) into allocator memory; an empty range uses a literal.
+		xpath_string(const char_t* begin, const char_t* end, xpath_allocator* alloc)
+		{
+			assert(begin <= end);
+
+			bool empty_ = (begin == end);
+
+			_buffer = empty_ ? PUGIXML_TEXT("") : duplicate_string(begin, static_cast<size_t>(end - begin), alloc);
+			_uses_heap = !empty_;
+		}
+
+		// Appends o to this string, allocating from alloc when a new buffer is needed.
+		void append(const xpath_string& o, xpath_allocator* alloc)
+		{
+			// skip empty sources
+			if (!*o._buffer) return;
+
+			// fast append for constant empty target and constant source
+			if (!*_buffer && !_uses_heap && !o._uses_heap)
+			{
+				_buffer = o._buffer;
+			}
+			else
+			{
+				// need to make heap copy
+				size_t target_length = strlength(_buffer);
+				size_t source_length = strlength(o._buffer);
+				size_t result_length = target_length + source_length;
+
+				// allocate new buffer
+				// (a borrowed buffer is passed as 0 so that reallocate performs a fresh allocation)
+				char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
+				assert(result);
+
+				// append first string to the new buffer in case there was no reallocation
+				if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
+
+				// append second string to the new buffer
+				memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
+				result[result_length] = 0;
+
+				// finalize
+				_buffer = result;
+				_uses_heap = true;
+			}
+		}
+
+		// Read-only access to the zero-terminated contents.
+		const char_t* c_str() const
+		{
+			return _buffer;
+		}
+
+		// Number of characters before the terminator (recomputed via strlength on every call).
+		size_t length() const
+		{
+			return strlength(_buffer);
+		}
+		
+		// Returns a writable pointer to the contents, first copying a borrowed buffer into allocator memory.
+		char_t* data(xpath_allocator* alloc)
+		{
+			// make private heap copy
+			if (!_uses_heap)
+			{
+				_buffer = duplicate_string(_buffer, alloc);
+				_uses_heap = true;
+			}
+
+			return const_cast<char_t*>(_buffer);
+		}
+
+		// True when the string has no characters.
+		bool empty() const
+		{
+			return *_buffer == 0;
+		}
+
+		// Content comparison (delegates to strequal).
+		bool operator==(const xpath_string& o) const
+		{
+			return strequal(_buffer, o._buffer);
+		}
+
+		bool operator!=(const xpath_string& o) const
+		{
+			return !strequal(_buffer, o._buffer);
+		}
+
+		// Reports whether the buffer is flagged as living in allocator memory.
+		bool uses_heap() const
+		{
+			return _uses_heap;
+		}
+	};
+
+	// Wraps str in an xpath_string without copying it (the buffer is flagged as not heap-backed),
+	// so str must stay valid for as long as the returned string is used.
+	PUGI__FN xpath_string xpath_string_const(const char_t* str)
+	{
+		return xpath_string(str, false);
+	}
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+	// Returns true when the zero-terminated string begins with the zero-terminated pattern.
+	PUGI__FN bool starts_with(const char_t* string, const char_t* pattern)
+	{
+		// walk both strings until the pattern is exhausted; any difference before that is a mismatch
+		// (the end of string counts as a difference, since its terminator cannot equal a pattern character)
+		for (; *pattern; ++string, ++pattern)
+		{
+			if (*string != *pattern) return false;
+		}
+
+		return true;
+	}
+
+	// Finds the first occurrence of c in the zero-terminated string s using wcschr/strchr
+	// (depending on PUGIXML_WCHAR_MODE) and returns their result.
+	PUGI__FN const char_t* find_char(const char_t* s, char_t c)
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcschr(s, c);
+	#else
+		return strchr(s, c);
+	#endif
+	}
+
+	// Finds the first occurrence of the zero-terminated string p inside s using wcsstr/strstr
+	// (depending on PUGIXML_WCHAR_MODE). In wide mode an empty p yields s itself.
+	PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p)
+	{
+	#ifdef PUGIXML_WCHAR_MODE
+		// MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
+		return (*p == 0) ? s : wcsstr(s, p);
+	#else
+		return strstr(s, p);
+	#endif
+	}
+
+	// Converts symbol to lower case, if it is an ASCII one
+	// Converts symbol to lower case, if it is an ASCII one; every other character is returned unchanged.
+	PUGI__FN char_t tolower_ascii(char_t ch)
+	{
+		// a single unsigned comparison covers 'A'..'Z': anything below 'A' wraps to a large value
+		const bool is_ascii_upper = static_cast<unsigned int>(ch - 'A') < 26;
+
+		if (!is_ascii_upper) return ch;
+
+		// OR-ing with ' ' (0x20) turns an ASCII upper-case letter into its lower-case form
+		return static_cast<char_t>(ch | ' ');
+	}
+
+	// Computes the string value of an XPath node:
+	//  - attribute: its value
+	//  - pcdata / cdata / comment / pi node: its value
+	//  - document / element node: the concatenation of all descendant pcdata and cdata values, in traversal order
+	//  - anything else: an empty string
+	// Values are borrowed without copying where possible; alloc is only used when text has to be concatenated.
+	PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
+	{
+		if (na.attribute())
+			return xpath_string_const(na.attribute().value());
+		else
+		{
+			const xml_node& n = na.node();
+
+			switch (n.type())
+			{
+			case node_pcdata:
+			case node_cdata:
+			case node_comment:
+			case node_pi:
+				return xpath_string_const(n.value());
+			
+			case node_document:
+			case node_element:
+			{
+				xpath_string result;
+
+				// depth-first walk over the subtree below n, without recursion
+				xml_node cur = n.first_child();
+				
+				while (cur && cur != n)
+				{
+					if (cur.type() == node_pcdata || cur.type() == node_cdata)
+						result.append(xpath_string_const(cur.value()), alloc);
+
+					// descend first, then move sideways
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						// climb until an ancestor with a next sibling is found, or we are back at n
+						while (!cur.next_sibling() && cur != n)
+							cur = cur.parent();
+
+						if (cur != n) cur = cur.next_sibling();
+					}
+				}
+				
+				return result;
+			}
+			
+			default:
+				return xpath_string();
+			}
+		}
+	}
+	
+	// Counts the nodes on the path from n up to its topmost ancestor, n included (0 for an empty node).
+	PUGI__FN unsigned int node_height(xml_node n)
+	{
+		unsigned int height = 0;
+
+		for (xml_node cur = n; cur; cur = cur.parent())
+			++height;
+
+		return height;
+	}
+	
+	// Document-order test for two nodes; lh and rh are the heights of ln and rn as computed by node_height.
+	// Returns true when ln comes before rn.
+	PUGI__FN bool node_is_before(xml_node ln, unsigned int lh, xml_node rn, unsigned int rh)
+	{
+		// normalize heights: lift the deeper node until both sit at the same depth
+		if (lh > rh)
+		{
+			for (unsigned int steps = lh - rh; steps > 0; --steps)
+				ln = ln.parent();
+		}
+		else
+		{
+			for (unsigned int steps = rh - lh; steps > 0; --steps)
+				rn = rn.parent();
+		}
+
+		// one node is the ancestor of the other; the ancestor (smaller height) comes first
+		if (ln == rn) return lh < rh;
+
+		// find common ancestor: climb in lockstep until both nodes share a parent
+		while (ln.parent() != rn.parent())
+		{
+			ln = ln.parent();
+			rn = rn.parent();
+		}
+
+		// there is no common ancestor (the shared parent is null), nodes are from different documents
+		if (!ln.parent()) return ln < rn;
+
+		// determine sibling order: ln is first if rn can be reached by walking forward from it
+		xml_node sibling = ln;
+
+		while (sibling)
+		{
+			if (sibling == rn) return true;
+
+			sibling = sibling.next_sibling();
+		}
+
+		return false;
+	}
+
+	// Returns true when parent is a non-empty node that is either node itself or one of its ancestors.
+	PUGI__FN bool node_is_ancestor(xml_node parent, xml_node node)
+	{
+		// climb from node towards the root until parent is met or the chain ends
+		xml_node cur = node;
+
+		while (cur && cur != parent)
+			cur = cur.parent();
+
+		// an empty parent never counts as an ancestor
+		if (!parent) return false;
+
+		return cur == parent;
+	}
+
+	// Returns a pointer usable as a document-order key for xnode, or 0 when no such key is available.
+	// The key is the node's (or attribute's) name or value pointer, taken only when the
+	// corresponding "allocated" bit in the header is clear.
+	// NOTE(review): presumably such pointers point into the parsed buffer, so their addresses follow document order - confirm.
+	PUGI__FN const void* document_order(const xpath_node& xnode)
+	{
+		xml_node_struct* node = xnode.node().internal_object();
+
+		if (node)
+		{
+			const bool name_usable = node->name && (node->header & xml_memory_page_name_allocated_mask) == 0;
+			if (name_usable) return node->name;
+
+			const bool value_usable = node->value && (node->header & xml_memory_page_value_allocated_mask) == 0;
+			if (value_usable) return node->value;
+
+			return 0;
+		}
+
+		xml_attribute_struct* attr = xnode.attribute().internal_object();
+
+		// neither a node nor an attribute
+		if (!attr) return 0;
+
+		// unlike nodes, attribute name/value pointers are not checked for null here
+		if ((attr->header & xml_memory_page_name_allocated_mask) == 0) return attr->name;
+		if ((attr->header & xml_memory_page_value_allocated_mask) == 0) return attr->value;
+
+		return 0;
+	}
+	
+	// Strict ordering of xpath_nodes by document order: returns true when lhs comes before rhs.
+	// A pointer comparison is used when document_order() yields a key for both operands;
+	// otherwise the comparison falls back to walking the tree.
+	struct document_order_comparator
+	{
+		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+		{
+			// optimized document order based check
+			const void* lo = document_order(lhs);
+			const void* ro = document_order(rhs);
+
+			if (lo && ro) return lo < ro;
+
+			// slow comparison
+			xml_node ln = lhs.node(), rn = rhs.node();
+
+			// compare attributes
+			if (lhs.attribute() && rhs.attribute())
+			{
+				// shared parent
+				if (lhs.parent() == rhs.parent())
+				{
+					// determine sibling order
+					// (note: the walk starts at lhs itself, so equal attributes compare as "before")
+					for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
+						if (a == rhs.attribute())
+							return true;
+					
+					return false;
+				}
+				
+				// compare attribute parents
+				ln = lhs.parent();
+				rn = rhs.parent();
+			}
+			else if (lhs.attribute())
+			{
+				// attributes go after the parent element
+				if (lhs.parent() == rhs.node()) return false;
+				
+				ln = lhs.parent();
+			}
+			else if (rhs.attribute())
+			{
+				// attributes go after the parent element
+				if (rhs.parent() == lhs.node()) return true;
+				
+				rn = rhs.parent();
+			}
+
+			// same node on both sides: neither is before the other
+			if (ln == rn) return false;
+			
+			unsigned int lh = node_height(ln);
+			unsigned int rh = node_height(rn);
+			
+			return node_is_before(ln, lh, rn, rh);
+		}
+	};
+
+	// Ordering used to group identical xpath_nodes together: all attribute nodes sort before
+	// all plain nodes, and within each group the handles are compared with operator<.
+	struct duplicate_comparator
+	{
+		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+		{
+			if (lhs.attribute())
+			{
+				// an attribute always precedes a plain node
+				if (!rhs.attribute()) return true;
+
+				return lhs.attribute() < rhs.attribute();
+			}
+
+			// lhs is a plain node here, so it never precedes an attribute
+			if (rhs.attribute()) return false;
+
+			return lhs.node() < rhs.node();
+		}
+	};
+	
+	// Produces a NaN value. When the float format matches the checked IEEE-like parameters,
+	// the bit pattern 0x7fc00000 is written into a float through a union;
+	// otherwise NaN is obtained by dividing zero by zero.
+	PUGI__FN double gen_nan()
+	{
+	#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
+		// the array bound is -1 (a compile error) unless float and uint32_t have the same size
+		union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1];
+		u[0].i = 0x7fc00000;
+		return u[0].f;
+	#else
+		// fallback
+		// NOTE(review): volatile presumably keeps the compiler from folding or diagnosing the division - confirm
+		const volatile double zero = 0.0;
+		return zero / zero;
+	#endif
+	}
+	
+	// Returns true when value is NaN, using _isnan or fpclassify where available
+	// and the "NaN differs from itself" property otherwise.
+	PUGI__FN bool is_nan(double value)
+	{
+	#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+		return !!_isnan(value);
+	#elif defined(fpclassify) && defined(FP_NAN)
+		return fpclassify(value) == FP_NAN;
+	#else
+		// fallback
+		// NOTE(review): volatile presumably prevents the self-comparison from being optimized away - confirm
+		const volatile double v = value;
+		return v != v;
+	#endif
+	}
+	
+	// Returns the fixed string for the special values ("0", "NaN", "Infinity", "-Infinity"),
+	// or 0 when value is an ordinary non-zero finite number that needs regular formatting.
+	// Three implementations are selected at compile time; all follow the same contract.
+	PUGI__FN const char_t* convert_number_to_string_special(double value)
+	{
+	#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+		if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
+		if (_isnan(value)) return PUGIXML_TEXT("NaN");
+		return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+	#elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
+		switch (fpclassify(value))
+		{
+		case FP_NAN:
+			return PUGIXML_TEXT("NaN");
+
+		case FP_INFINITE:
+			return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+
+		case FP_ZERO:
+			return PUGIXML_TEXT("0");
+
+		default:
+			return 0;
+		}
+	#else
+		// fallback
+		const volatile double v = value;
+
+		if (v == 0) return PUGIXML_TEXT("0");
+		if (v != v) return PUGIXML_TEXT("NaN");
+		// a non-zero value that equals its own double is treated as infinite
+		if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+		return 0;
+	#endif
+	}
+	
+	// Converts a number to a boolean: false for zero and NaN, true for everything else.
+	PUGI__FN bool convert_number_to_boolean(double value)
+	{
+		// zero (of either sign) is false
+		if (value == 0) return false;
+
+		// NaN is false as well; any other number is true
+		return !is_nan(value);
+	}
+	
+	// Removes trailing '0' characters from [begin, end) by writing a terminator after the last
+	// remaining character. end itself must be writable, since the terminator lands there
+	// when nothing is trimmed.
+	PUGI__FN void truncate_zeros(char* begin, char* end)
+	{
+		char* tail = end;
+
+		// step back over the run of zeros, but never past the start of the range
+		while (tail != begin && tail[-1] == '0')
+			--tail;
+
+		*tail = 0;
+	}
+
+	// gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
+	// On return *out_mantissa points into buffer at a zero-terminated digit string without
+	// trailing zeros, and *out_exponent is the decimal exponent for the 0.xxxxx form.
+	// Two implementations follow: one based on _ecvt_s for newer MSVC runtimes, one based on sprintf.
+#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+	{
+		// get base values
+		// (sign is filled in by _ecvt_s but not used afterwards)
+		int sign, exponent;
+		_ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
+
+		// truncate redundant zeros
+		truncate_zeros(buffer, buffer + strlen(buffer));
+
+		// fill results
+		*out_mantissa = buffer;
+		*out_exponent = exponent;
+	}
+#else
+	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+	{
+		// get a scientific notation value with IEEE DBL_DIG decimals
+		// NOTE(review): sprintf is unbounded; buffer_size is only checked afterwards by the assert - confirm callers pass a large enough buffer
+		sprintf(buffer, "%.*e", DBL_DIG, value);
+		assert(strlen(buffer) < buffer_size);
+		// references buffer_size so it does not count as unused when asserts are compiled out
+		(void)!buffer_size;
+
+		// get the exponent (possibly negative)
+		char* exponent_string = strchr(buffer, 'e');
+		assert(exponent_string);
+
+		int exponent = atoi(exponent_string + 1);
+
+		// extract mantissa string: skip sign
+		char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
+		// NOTE(review): this expects '.' as the decimal separator, which sprintf takes from the current C locale - confirm
+		assert(mantissa[0] != '0' && mantissa[1] == '.');
+
+		// divide mantissa by 10 to eliminate integer part
+		// (the leading digit overwrites the '.', and the exponent is bumped to compensate)
+		mantissa[1] = mantissa[0];
+		mantissa++;
+		exponent++;
+
+		// remove extra mantissa digits and zero-terminate mantissa
+		truncate_zeros(mantissa, exponent_string);
+
+		// fill results
+		*out_mantissa = mantissa;
+		*out_exponent = exponent;
+	}
+#endif
+
+	// Formats value as a plain decimal string without exponent notation.
+	// Special values come from convert_number_to_string_special; everything else is assembled
+	// from the mantissa digits and decimal exponent and then copied into memory obtained from alloc.
+	PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
+	{
+		// try special number conversion
+		const char_t* special = convert_number_to_string_special(value);
+		if (special) return xpath_string_const(special);
+
+		// get mantissa + exponent form
+		char mantissa_buffer[64];
+
+		char* mantissa;
+		int exponent;
+		convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
+
+		// make the number!
+		char_t result[512];
+		char_t* s = result;
+
+		// sign
+		if (value < 0) *s++ = '-';
+
+		// integer part
+		if (exponent <= 0)
+		{
+			*s++ = '0';
+		}
+		else
+		{
+			// emit exponent digits; once the mantissa runs out, pad with zeros
+			while (exponent > 0)
+			{
+				assert(*mantissa == 0 || static_cast<unsigned int>(*mantissa - '0') <= 9);
+				*s++ = *mantissa ? *mantissa++ : '0';
+				exponent--;
+			}
+		}
+
+		// fractional part
+		if (*mantissa)
+		{
+			// decimal point
+			*s++ = '.';
+
+			// extra zeroes from negative exponent
+			while (exponent < 0)
+			{
+				*s++ = '0';
+				exponent++;
+			}
+
+			// extra mantissa digits
+			while (*mantissa)
+			{
+				assert(static_cast<unsigned int>(*mantissa - '0') <= 9);
+				*s++ = *mantissa++;
+			}
+		}
+
+		// zero-terminate
+		// (the bounds check happens here, after the digits have been written)
+		assert(s < result + sizeof(result) / sizeof(result[0]));
+		*s = 0;
+
+		return xpath_string(result, alloc);
+	}
+	
+	// Checks that string has the shape: optional whitespace, optional '-', digits with an optional
+	// fractional part (or a '.' followed by at least one digit), optional whitespace, end of string.
+	PUGI__FN bool check_string_to_number_format(const char_t* string)
+	{
+		const char_t* cur = string;
+
+		// leading whitespace
+		while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+		// optional minus sign
+		if (*cur == '-') ++cur;
+
+		// nothing left to form a number
+		if (!*cur) return false;
+
+		// if there is no integer part, there should be a decimal part with at least one digit
+		if (!PUGI__IS_CHARTYPEX(cur[0], ctx_digit))
+		{
+			if (cur[0] != '.') return false;
+			if (!PUGI__IS_CHARTYPEX(cur[1], ctx_digit)) return false;
+		}
+
+		// integer part
+		while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) ++cur;
+
+		// decimal part
+		if (*cur == '.')
+		{
+			++cur;
+
+			while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) ++cur;
+		}
+
+		// trailing whitespace
+		while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+		// anything left over makes the string malformed
+		return *cur == 0;
+	}
+
+	// Converts a zero-terminated string to a number. Strings rejected by
+	// check_string_to_number_format yield NaN; accepted ones are parsed with wcstod/atof.
+	// NOTE(review): wcstod/atof follow the current C locale's decimal separator - confirm this is acceptable for callers.
+	PUGI__FN double convert_string_to_number(const char_t* string)
+	{
+		// check string format
+		if (!check_string_to_number_format(string)) return gen_nan();
+
+		// parse string
+	#ifdef PUGIXML_WCHAR_MODE
+		return wcstod(string, 0);
+	#else
+		return atof(string);
+	#endif
+	}
+
+	// Converts the character range [begin, end) to a number and stores it in *out_result.
+	// Returns false only when the temporary copy needed for a long input cannot be allocated;
+	// a malformed number still returns true (the stored result is then NaN).
+	PUGI__FN bool convert_string_to_number(const char_t* begin, const char_t* end, double* out_result)
+	{
+		char_t stack_buffer[32];
+		const size_t stack_capacity = sizeof(stack_buffer) / sizeof(stack_buffer[0]);
+
+		const size_t length = static_cast<size_t>(end - begin);
+
+		// inputs that fit (terminator included) are copied onto the stack ...
+		char_t* copy = stack_buffer;
+
+		// ... longer ones need a dummy on-heap copy
+		if (length >= stack_capacity)
+		{
+			copy = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+
+			if (!copy) return false;
+		}
+
+		// the range is not zero-terminated, the conversion routine needs it to be
+		memcpy(copy, begin, length * sizeof(char_t));
+		copy[length] = 0;
+
+		*out_result = convert_string_to_number(copy);
+
+		// free dummy buffer
+		if (copy != stack_buffer) xml_memory::deallocate(copy);
+
+		return true;
+	}
+	
+	// Rounds to the nearest integer; halves are rounded towards positive infinity.
+	PUGI__FN double round_nearest(double value)
+	{
+		double shifted = value + 0.5;
+
+		return floor(shifted);
+	}
+
+	// Same as round_nearest, except that inputs in [-0.5, -0] produce -0.
+	PUGI__FN double round_nearest_nzero(double value)
+	{
+		// ceil keeps the sign of zero, so it yields -0 for [-0.5, -0] and +0 for +0
+		if (value >= -0.5 && value <= 0) return ceil(value);
+
+		return floor(value + 0.5);
+	}
+	
+	// Returns the name of the attribute if the xpath node holds one, otherwise the name of its xml node.
+	PUGI__FN const char_t* qualified_name(const xpath_node& node)
+	{
+		if (node.attribute()) return node.attribute().name();
+
+		return node.node().name();
+	}
+	
+	// Returns the part of the qualified name after the first ':', or the whole name if there is no ':'.
+	PUGI__FN const char_t* local_name(const xpath_node& node)
+	{
+		const char_t* qname = qualified_name(node);
+		const char_t* colon = find_char(qname, ':');
+
+		if (!colon) return qname;
+
+		return colon + 1;
+	}
+
+	// Attribute predicate that recognises the namespace declaration matching a given element/attribute name:
+	// "xmlns:<prefix>" when the name has a prefix, plain "xmlns" otherwise.
+	struct namespace_uri_predicate
+	{
+		// start of the prefix inside the name passed to the constructor, or 0 when the name has no ':'
+		const char_t* prefix;
+		// number of characters before the ':' (0 when there is no prefix)
+		size_t prefix_length;
+
+		// NOTE: intentionally not explicit; callers initialise the predicate directly from a name
+		namespace_uri_predicate(const char_t* name)
+		{
+			const char_t* colon = find_char(name, ':');
+
+			if (colon)
+			{
+				prefix = name;
+				prefix_length = static_cast<size_t>(colon - name);
+			}
+			else
+			{
+				prefix = 0;
+				prefix_length = 0;
+			}
+		}
+
+		bool operator()(const xml_attribute& a) const
+		{
+			const char_t* attr_name = a.name();
+
+			// only "xmlns..." attributes can declare namespaces
+			if (!starts_with(attr_name, PUGIXML_TEXT("xmlns"))) return false;
+
+			// no prefix: match the default namespace declaration, i.e. exactly "xmlns"
+			if (!prefix) return attr_name[5] == 0;
+
+			// prefix: match "xmlns:" followed by exactly the prefix
+			return attr_name[5] == ':' && strequalrange(attr_name + 6, prefix, prefix_length);
+		}
+	};
+
+	// Looks up the namespace URI for a node by searching the node and then its ancestors
+	// for the matching namespace declaration attribute; returns an empty string if none is found.
+	PUGI__FN const char_t* namespace_uri(const xml_node& node)
+	{
+		namespace_uri_predicate pred = node.name();
+
+		// walk from the node itself up through its parents
+		for (xml_node scope = node; scope; scope = scope.parent())
+		{
+			xml_attribute declaration = scope.find_attribute(pred);
+
+			if (declaration) return declaration.value();
+		}
+
+		return PUGIXML_TEXT("");
+	}
+
+	// Looks up the namespace URI for an attribute, searching its parent element and then the ancestors
+	// for the matching "xmlns:<prefix>" declaration; returns an empty string if none is found.
+	PUGI__FN const char_t* namespace_uri(const xml_attribute& attr, const xml_node& parent)
+	{
+		namespace_uri_predicate pred = attr.name();
+
+		// Default namespace does not apply to attributes
+		if (!pred.prefix) return PUGIXML_TEXT("");
+
+		// walk from the owning element up through its parents
+		for (xml_node scope = parent; scope; scope = scope.parent())
+		{
+			xml_attribute declaration = scope.find_attribute(pred);
+
+			if (declaration) return declaration.value();
+		}
+
+		return PUGIXML_TEXT("");
+	}
+
+	// Dispatches to the attribute or the node overload depending on what the xpath node holds.
+	PUGI__FN const char_t* namespace_uri(const xpath_node& node)
+	{
+		if (node.attribute()) return namespace_uri(node.attribute(), node.parent());
+
+		return namespace_uri(node.node());
+	}
+
+	// Normalizes whitespace in place: leading and trailing whitespace is removed and every
+	// internal whitespace run is collapsed to a single ' '. The buffer must be zero-terminated.
+	PUGI__FN void normalize_space(char_t* buffer)
+	{
+		char_t* out = buffer;
+		const char_t* in = buffer;
+
+		while (*in)
+		{
+			char_t current = *in;
+			++in;
+
+			if (!PUGI__IS_CHARTYPE(current, ct_space))
+			{
+				*out++ = current;
+				continue;
+			}
+
+			// swallow the rest of the whitespace run
+			while (PUGI__IS_CHARTYPE(*in, ct_space)) ++in;
+
+			// a run at the very start produces nothing; otherwise it becomes one space
+			if (out != buffer) *out++ = ' ';
+		}
+
+		// a whitespace run at the end left a single trailing space behind; drop it
+		if (out != buffer && PUGI__IS_CHARTYPE(out[-1], ct_space)) --out;
+
+		// zero-terminate
+		*out = 0;
+	}
+
+	// Translates the zero-terminated buffer in place: a character found in 'from' at index i is
+	// replaced by to[i], or removed when 'to' has no character at that index; other characters are kept.
+	PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to)
+	{
+		const size_t to_count = strlength(to);
+
+		char_t* out = buffer;
+
+		// 'out' never gets ahead of 'in', so reading and writing the same buffer is safe
+		for (char_t* in = buffer; *in; ++in)
+		{
+			PUGI__DMC_VOLATILE char_t ch = *in;
+
+			const char_t* hit = find_char(from, ch);
+
+			if (!hit)
+			{
+				// not listed in 'from': copy through unchanged
+				*out++ = ch;
+				continue;
+			}
+
+			size_t index = static_cast<size_t>(hit - from);
+
+			// listed with a counterpart: replace; listed without a counterpart: drop
+			if (index < to_count) *out++ = to[index];
+		}
+
+		// zero-terminate
+		*out = 0;
+	}
+
+	// Boolean-valued XPath variable; the value starts out as false.
+	struct xpath_variable_boolean: xpath_variable
+	{
+		xpath_variable_boolean(): value(false)
+		{
+		}
+
+		bool value;
+		// variable name; new_xpath_variable allocates extra characters past the struct and copies the full name here
+		char_t name[1];
+	};
+
+	// Number-valued XPath variable; the value starts out as 0.
+	struct xpath_variable_number: xpath_variable
+	{
+		xpath_variable_number(): value(0)
+		{
+		}
+
+		double value;
+		// variable name; new_xpath_variable allocates extra characters past the struct and copies the full name here
+		char_t name[1];
+	};
+
+	// String-valued XPath variable; the value starts out as a null pointer.
+	struct xpath_variable_string: xpath_variable
+	{
+		xpath_variable_string(): value(0)
+		{
+		}
+
+		// releases the stored string through xml_memory::deallocate, so a non-null value is owned by this object
+		~xpath_variable_string()
+		{
+			if (value) xml_memory::deallocate(value);
+		}
+
+		char_t* value;
+		// variable name; new_xpath_variable allocates extra characters past the struct and copies the full name here
+		char_t name[1];
+	};
+
+	// Node-set-valued XPath variable; the value is a default-constructed xpath_node_set.
+	struct xpath_variable_node_set: xpath_variable
+	{
+		xpath_node_set value;
+		// variable name; new_xpath_variable allocates extra characters past the struct and copies the full name here
+		char_t name[1];
+	};
+
+	// Default-constructed constant node set with internal linkage.
+	// NOTE(review): presumably a shared fallback value where a node set reference must be returned - confirm at use sites.
+	static const xpath_node_set dummy_node_set;
+
+	// Hashes a zero-terminated string with the Jenkins one-at-a-time hash
+	// (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time).
+	PUGI__FN unsigned int hash_string(const char_t* str)
+	{
+		unsigned int hash = 0;
+
+		// mix in one character at a time
+		for (const char_t* p = str; *p; ++p)
+		{
+			hash += static_cast<unsigned int>(*p);
+			hash += hash << 10;
+			hash ^= hash >> 6;
+		}
+
+		// final mixing steps
+		hash += hash << 3;
+		hash ^= hash >> 11;
+		hash += hash << 15;
+
+		return hash;
+	}
+
+	// Allocates and constructs a variable of type T with the given name stored inline after the object.
+	// Returns 0 for an empty name or when the allocation fails.
+	template <typename T> PUGI__FN T* new_xpath_variable(const char_t* name)
+	{
+		const size_t name_length = strlength(name);
+
+		// empty variable names are invalid
+		if (name_length == 0) return 0;
+
+		// $$ offsetof(T, name) can't be used because T is non-POD, so the object size is simply extended by name_length characters;
+		// together with the one character already in T::name this leaves room for the terminator
+		void* storage = xml_memory::allocate(sizeof(T) + name_length * sizeof(char_t));
+
+		if (!storage) return 0;
+
+		T* variable = new (storage) T();
+
+		// copy the name including its terminating zero
+		memcpy(variable->name, name, (name_length + 1) * sizeof(char_t));
+
+		return variable;
+	}
+
+	// Creates a variable of the concrete struct that corresponds to the requested value type.
+	// Returns 0 for an unrecognised type, an empty name, or an allocation failure.
+	PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
+	{
+		if (type == xpath_type_node_set)
+			return new_xpath_variable<xpath_variable_node_set>(name);
+
+		if (type == xpath_type_number)
+			return new_xpath_variable<xpath_variable_number>(name);
+
+		if (type == xpath_type_string)
+			return new_xpath_variable<xpath_variable_string>(name);
+
+		if (type == xpath_type_boolean)
+			return new_xpath_variable<xpath_variable_boolean>(name);
+
+		// no variable struct exists for any other type
+		return 0;
+	}
+
+	// Destroys a variable created by new_xpath_variable<T>: the destructor is invoked explicitly
+	// (the object was built with placement new) and the storage is then returned to xml_memory.
+	template <typename T> PUGI__FN void delete_xpath_variable(T* var)
+	{
+		var->~T();
+		xml_memory::deallocate(var);
+	}
+
+	// Destroys a variable through the concrete struct that corresponds to its value type.
+	// An unrecognised type trips an assertion and the variable is left untouched.
+	PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
+	{
+		if (type == xpath_type_node_set)
+		{
+			delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
+		}
+		else if (type == xpath_type_number)
+		{
+			delete_xpath_variable(static_cast<xpath_variable_number*>(var));
+		}
+		else if (type == xpath_type_string)
+		{
+			delete_xpath_variable(static_cast<xpath_variable_string*>(var));
+		}
+		else if (type == xpath_type_boolean)
+		{
+			delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
+		}
+		else
+		{
+			assert(!"Invalid variable type");
+		}
+	}
+
+	// Looks up the variable whose name is the character range [begin, end) in the given set.
+	// The name is copied to a zero-terminated buffer first (stack for short names, heap otherwise).
+	// Returns the result of set->get(), or 0 when the heap copy cannot be allocated.
+	PUGI__FN xpath_variable* get_variable(xpath_variable_set* set, const char_t* begin, const char_t* end)
+	{
+		char_t local_storage[32];
+
+		const size_t local_capacity = sizeof(local_storage) / sizeof(local_storage[0]);
+		const size_t count = static_cast<size_t>(end - begin);
+
+		// the terminator needs one extra slot, so count == capacity already goes to the heap
+		const bool on_heap = count >= local_capacity;
+
+		char_t* copy = local_storage;
+
+		if (on_heap)
+		{
+			copy = static_cast<char_t*>(xml_memory::allocate((count + 1) * sizeof(char_t)));
+			if (!copy) return 0;
+		}
+
+		// build the zero-terminated name and perform the lookup
+		memcpy(copy, begin, count * sizeof(char_t));
+		copy[count] = 0;
+
+		xpath_variable* found = set->get(copy);
+
+		// release the temporary heap copy, if any
+		if (on_heap) xml_memory::deallocate(copy);
+
+		return found;
+	}
+PUGI__NS_END
+
+// Internal node set class
+PUGI__NS_BEGIN
+	// Brings [begin, end) into document order (or reverse document order when rev is true),
+	// given the current ordering 'type'. Returns the resulting ordering.
+	PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
+	{
+		const bool needs_sort = (type == xpath_node_set::type_unsorted);
+
+		// an unsorted range is first put into forward document order
+		if (needs_sort) sort(begin, end, document_order_comparator());
+
+		xpath_node_set::type_t current = needs_sort ? xpath_node_set::type_sorted : type;
+		xpath_node_set::type_t wanted = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+		// a sorted range in the opposite direction only needs to be reversed
+		if (current != wanted) reverse(begin, end);
+
+		return wanted;
+	}
+
+	// Returns the first node in document order from [begin, end) given the range ordering 'type';
+	// an empty range (or an invalid type) yields a default-constructed xpath_node.
+	PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
+	{
+		if (begin == end) return xpath_node();
+
+		// forward order: the first element is the answer
+		if (type == xpath_node_set::type_sorted) return *begin;
+
+		// reverse order: the last element is the answer
+		if (type == xpath_node_set::type_sorted_reverse) return *(end - 1);
+
+		// no order: search for the minimum in document order
+		if (type == xpath_node_set::type_unsorted) return *min_element(begin, end, document_order_comparator());
+
+		assert(!"Invalid node set type");
+		return xpath_node();
+	}
+
+	// Growable array of xpath_node values used while evaluating queries.
+	// Storage is obtained from the xpath_allocator passed to the growing operations; the class never frees it itself.
+	// Elements are copied with plain assignment/memcpy, and the set remembers how its contents are ordered.
+	class xpath_node_set_raw
+	{
+		// ordering of the stored nodes (sorted / sorted_reverse / unsorted)
+		xpath_node_set::type_t _type;
+
+		// [_begin, _end) are the stored nodes, [_begin, _eos) is the allocated capacity
+		xpath_node* _begin;
+		xpath_node* _end;
+		xpath_node* _eos;
+
+	public:
+		// creates an empty, unsorted set without any storage
+		xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
+		{
+		}
+
+		xpath_node* begin() const
+		{
+			return _begin;
+		}
+
+		xpath_node* end() const
+		{
+			return _end;
+		}
+
+		bool empty() const
+		{
+			return _begin == _end;
+		}
+
+		size_t size() const
+		{
+			return static_cast<size_t>(_end - _begin);
+		}
+
+		// first node in document order (default-constructed node for an empty set)
+		xpath_node first() const
+		{
+			return xpath_first(_begin, _end, _type);
+		}
+
+		// appends one node, growing the storage through alloc when it is full
+		void push_back(const xpath_node& node, xpath_allocator* alloc)
+		{
+			if (_end == _eos)
+			{
+				size_t capacity = static_cast<size_t>(_eos - _begin);
+
+				// get new capacity (1.5x rule)
+				size_t new_capacity = capacity + capacity / 2 + 1;
+
+				// reallocate the old array or allocate a new one
+				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
+				// NOTE(review): failure is only checked by assert; presumably the allocator reports out-of-memory itself - confirm
+				assert(data);
+
+				// finalize
+				_begin = data;
+				_end = data + capacity;
+				_eos = data + new_capacity;
+			}
+
+			*_end++ = node;
+		}
+
+		// appends the nodes in [begin_, end_), growing the storage through alloc to exactly fit when needed
+		void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc)
+		{
+			// nothing to copy; returning early also avoids calling memcpy and doing pointer arithmetic
+			// on null pointers when neither this set nor the source range has storage
+			if (begin_ == end_) return;
+
+			size_t size_ = static_cast<size_t>(_end - _begin);
+			size_t capacity = static_cast<size_t>(_eos - _begin);
+			size_t count = static_cast<size_t>(end_ - begin_);
+
+			if (size_ + count > capacity)
+			{
+				// reallocate the old array or allocate a new one
+				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
+				// NOTE(review): failure is only checked by assert; presumably the allocator reports out-of-memory itself - confirm
+				assert(data);
+
+				// finalize
+				_begin = data;
+				_end = data + size_;
+				_eos = data + size_ + count;
+			}
+
+			memcpy(_end, begin_, count * sizeof(xpath_node));
+			_end += count;
+		}
+
+		// puts the nodes into forward document order
+		void sort_do()
+		{
+			_type = xpath_sort(_begin, _end, _type, false);
+		}
+
+		// drops every node from pos onwards; pos must lie within [begin(), end()]
+		void truncate(xpath_node* pos)
+		{
+			assert(_begin <= pos && pos <= _end);
+
+			_end = pos;
+		}
+
+		// removes duplicate nodes; an unsorted set is first sorted with duplicate_comparator
+		// so that equal nodes become adjacent (the recorded ordering type is not changed)
+		void remove_duplicates()
+		{
+			if (_type == xpath_node_set::type_unsorted)
+				sort(_begin, _end, duplicate_comparator());
+		
+			_end = unique(_begin, _end);
+		}
+
+		xpath_node_set::type_t type() const
+		{
+			return _type;
+		}
+
+		// overrides the recorded ordering; the caller is responsible for it matching the contents
+		void set_type(xpath_node_set::type_t value)
+		{
+			_type = value;
+		}
+	};
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+	// Evaluation context handed to the eval_* functions: the context node plus a position and a size.
+	// apply_predicate fills position with the 1-based index of the node within the filtered range
+	// and size with the number of nodes in that range.
+	struct xpath_context
+	{
+		xpath_node n;
+		size_t position, size;
+
+		xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_)
+		{
+		}
+	};
+
+	// Token kinds produced by xpath_lexer; the trailing comments show the source text that produces each one.
+	enum lexeme_t
+	{
+		lex_none = 0,				// unrecognised or malformed input
+		lex_equal,					// =
+		lex_not_equal,				// !=
+		lex_less,					// <
+		lex_greater,				// >
+		lex_less_or_equal,			// <=
+		lex_greater_or_equal,		// >=
+		lex_plus,					// +
+		lex_minus,					// -
+		lex_multiply,				// *
+		lex_union,					// |
+		lex_var_ref,				// $name or $prefix:name (contents exclude the '$')
+		lex_open_brace,				// (
+		lex_close_brace,			// )
+		lex_quoted_string,			// "..." or '...' (contents exclude the quotes)
+		lex_number,					// digits[.digits] or .digits
+		lex_slash,					// /
+		lex_double_slash,			// //
+		lex_open_square_brace,		// [
+		lex_close_square_brace,		// ]
+		lex_string,					// name, prefix:name or prefix:*
+		lex_comma,					// ,
+		lex_axis_attribute,			// @
+		lex_dot,					// .
+		lex_double_dot,				// ..
+		lex_double_colon,			// ::
+		lex_eof						// end of the query string
+	};
+
+	// Non-owning view of a lexeme's text as the half-open character range [begin, end).
+	struct xpath_lexer_string
+	{
+		const char_t* begin;
+		const char_t* end;
+
+		// starts out as an empty range of null pointers
+		xpath_lexer_string(): begin(0), end(0)
+		{
+		}
+
+		// compares the viewed range against a zero-terminated string
+		bool operator==(const char_t* other) const
+		{
+			return strequalrange(other, begin, static_cast<size_t>(end - begin));
+		}
+	};
+
+	// Tokenizer for XPath query strings.
+	// The constructor reads the first lexeme; next() advances to the following one.
+	// current() returns the kind of the current lexeme, current_pos() its start in the query,
+	// and contents() its text for the lexeme kinds that carry text.
+	class xpath_lexer
+	{
+		// position in the query right after the current lexeme (where the next scan starts)
+		const char_t* _cur;
+		// position where the current lexeme starts (after skipped whitespace)
+		const char_t* _cur_lexeme_pos;
+		// text range of the current lexeme; only set for var_ref, number, string and quoted_string lexemes
+		xpath_lexer_string _cur_lexeme_contents;
+
+		lexeme_t _cur_lexeme;
+
+	public:
+		// query must be zero-terminated and must outlive the lexer (only pointers into it are stored)
+		explicit xpath_lexer(const char_t* query): _cur(query)
+		{
+			next();
+		}
+		
+		// returns the position right after the current lexeme
+		const char_t* state() const
+		{
+			return _cur;
+		}
+		
+		// scans the next lexeme starting at _cur; unrecognised input produces lex_none
+		void next()
+		{
+			const char_t* cur = _cur;
+
+			while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+			// save lexeme position for error reporting
+			_cur_lexeme_pos = cur;
+
+			switch (*cur)
+			{
+			case 0:
+				// end of query; cur is not advanced, so repeated calls keep returning lex_eof
+				_cur_lexeme = lex_eof;
+				break;
+			
+			case '>':
+				if (*(cur+1) == '=')
+				{
+					cur += 2;
+					_cur_lexeme = lex_greater_or_equal;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_greater;
+				}
+				break;
+
+			case '<':
+				if (*(cur+1) == '=')
+				{
+					cur += 2;
+					_cur_lexeme = lex_less_or_equal;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_less;
+				}
+				break;
+
+			case '!':
+				// '!' is only valid as part of '!='; a lone '!' is an error and is not consumed
+				if (*(cur+1) == '=')
+				{
+					cur += 2;
+					_cur_lexeme = lex_not_equal;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+				break;
+
+			case '=':
+				cur += 1;
+				_cur_lexeme = lex_equal;
+
+				break;
+			
+			case '+':
+				cur += 1;
+				_cur_lexeme = lex_plus;
+
+				break;
+
+			case '-':
+				cur += 1;
+				_cur_lexeme = lex_minus;
+
+				break;
+
+			case '*':
+				cur += 1;
+				_cur_lexeme = lex_multiply;
+
+				break;
+
+			case '|':
+				cur += 1;
+				_cur_lexeme = lex_union;
+
+				break;
+			
+			case '$':
+				// variable reference: '$' followed by a name with an optional ':name' part;
+				// the contents exclude the '$' itself
+				cur += 1;
+
+				if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+				{
+					_cur_lexeme_contents.begin = cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+					if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
+					{
+						cur++; // :
+
+						while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+					}
+
+					_cur_lexeme_contents.end = cur;
+				
+					_cur_lexeme = lex_var_ref;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+
+				break;
+
+			case '(':
+				cur += 1;
+				_cur_lexeme = lex_open_brace;
+
+				break;
+
+			case ')':
+				cur += 1;
+				_cur_lexeme = lex_close_brace;
+
+				break;
+			
+			case '[':
+				cur += 1;
+				_cur_lexeme = lex_open_square_brace;
+
+				break;
+
+			case ']':
+				cur += 1;
+				_cur_lexeme = lex_close_square_brace;
+
+				break;
+
+			case ',':
+				cur += 1;
+				_cur_lexeme = lex_comma;
+
+				break;
+
+			case '/':
+				if (*(cur+1) == '/')
+				{
+					cur += 2;
+					_cur_lexeme = lex_double_slash;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_slash;
+				}
+				break;
+		
+			case '.':
+				// '..', a number that starts with the decimal point, or a single '.'
+				if (*(cur+1) == '.')
+				{
+					cur += 2;
+					_cur_lexeme = lex_double_dot;
+				}
+				else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit))
+				{
+					_cur_lexeme_contents.begin = cur; // .
+
+					++cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+
+					_cur_lexeme_contents.end = cur;
+					
+					_cur_lexeme = lex_number;
+				}
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_dot;
+				}
+				break;
+
+			case '@':
+				cur += 1;
+				_cur_lexeme = lex_axis_attribute;
+
+				break;
+
+			case '"':
+			case '\'':
+			{
+				// quoted string: runs up to the matching quote character; there is no escape handling,
+				// and the contents exclude both quotes
+				char_t terminator = *cur;
+
+				++cur;
+
+				_cur_lexeme_contents.begin = cur;
+				while (*cur && *cur != terminator) cur++;
+				_cur_lexeme_contents.end = cur;
+				
+				// reaching the end of the query without the closing quote is an error
+				if (!*cur)
+					_cur_lexeme = lex_none;
+				else
+				{
+					cur += 1;
+					_cur_lexeme = lex_quoted_string;
+				}
+
+				break;
+			}
+
+			case ':':
+				// ':' is only valid as part of '::' here; a lone ':' is an error and is not consumed
+				if (*(cur+1) == ':')
+				{
+					cur += 2;
+					_cur_lexeme = lex_double_colon;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+				break;
+
+			default:
+				if (PUGI__IS_CHARTYPEX(*cur, ctx_digit))
+				{
+					// number: digits with an optional '.' and optional fraction digits
+					_cur_lexeme_contents.begin = cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+				
+					if (*cur == '.')
+					{
+						cur++;
+
+						while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+					}
+
+					_cur_lexeme_contents.end = cur;
+
+					_cur_lexeme = lex_number;
+				}
+				else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+				{
+					// name: a plain name, optionally followed by ':*' or ':name'
+					_cur_lexeme_contents.begin = cur;
+
+					while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+					if (cur[0] == ':')
+					{
+						if (cur[1] == '*') // namespace test ncname:*
+						{
+							cur += 2; // :*
+						}
+						else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
+						{
+							cur++; // :
+
+							while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+						}
+					}
+
+					_cur_lexeme_contents.end = cur;
+				
+					_cur_lexeme = lex_string;
+				}
+				else
+				{
+					_cur_lexeme = lex_none;
+				}
+			}
+
+			_cur = cur;
+		}
+
+		// kind of the current lexeme
+		lexeme_t current() const
+		{
+			return _cur_lexeme;
+		}
+
+		// start of the current lexeme within the query string
+		const char_t* current_pos() const
+		{
+			return _cur_lexeme_pos;
+		}
+
+		// text of the current lexeme; may only be called for lexeme kinds that carry text
+		const xpath_lexer_string& contents() const
+		{
+			assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);
+
+			return _cur_lexeme_contents;
+		}
+	};
+
+	// Kinds of nodes in the parsed XPath expression tree; the trailing comments show the corresponding XPath syntax.
+	enum ast_type_t
+	{
+		ast_op_or,						// left or right
+		ast_op_and,						// left and right
+		ast_op_equal,					// left = right
+		ast_op_not_equal,				// left != right
+		ast_op_less,					// left < right
+		ast_op_greater,					// left > right
+		ast_op_less_or_equal,			// left <= right
+		ast_op_greater_or_equal,		// left >= right
+		ast_op_add,						// left + right
+		ast_op_subtract,				// left - right
+		ast_op_multiply,				// left * right
+		ast_op_divide,					// left div right
+		ast_op_mod,						// left mod right
+		ast_op_negate,					// -left (unary minus)
+		ast_op_union,					// left | right
+		ast_predicate,					// apply predicate to set; next points to next predicate
+		ast_filter,						// select * from left where right
+		ast_filter_posinv,				// select * from left where right; proximity position invariant
+		ast_string_constant,			// string constant
+		ast_number_constant,			// number constant
+		ast_variable,					// variable
+		ast_func_last,					// last()
+		ast_func_position,				// position()
+		ast_func_count,					// count(left)
+		ast_func_id,					// id(left)
+		ast_func_local_name_0,			// local-name()
+		ast_func_local_name_1,			// local-name(left)
+		ast_func_namespace_uri_0,		// namespace-uri()
+		ast_func_namespace_uri_1,		// namespace-uri(left)
+		ast_func_name_0,				// name()
+		ast_func_name_1,				// name(left)
+		ast_func_string_0,				// string()
+		ast_func_string_1,				// string(left)
+		ast_func_concat,				// concat(left, right, siblings)
+		ast_func_starts_with,			// starts-with(left, right)
+		ast_func_contains,				// contains(left, right)
+		ast_func_substring_before,		// substring-before(left, right)
+		ast_func_substring_after,		// substring-after(left, right)
+		ast_func_substring_2,			// substring(left, right)
+		ast_func_substring_3,			// substring(left, right, third)
+		ast_func_string_length_0,		// string-length()
+		ast_func_string_length_1,		// string-length(left)
+		ast_func_normalize_space_0,		// normalize-space()
+		ast_func_normalize_space_1,		// normalize-space(left)
+		ast_func_translate,				// translate(left, right, third)
+		ast_func_boolean,				// boolean(left)
+		ast_func_not,					// not(left)
+		ast_func_true,					// true()
+		ast_func_false,					// false()
+		ast_func_lang,					// lang(left)
+		ast_func_number_0,				// number()
+		ast_func_number_1,				// number(left)
+		ast_func_sum,					// sum(left)
+		ast_func_floor,					// floor(left)
+		ast_func_ceiling,				// ceiling(left)
+		ast_func_round,					// round(left)
+		ast_step,						// process set left with step
+		ast_step_root					// select root node
+	};
+
+	// Axes of a location step; each enumerator is named after the XPath axis it stands for.
+	enum axis_t
+	{
+		axis_ancestor,
+		axis_ancestor_or_self,
+		axis_attribute,
+		axis_child,
+		axis_descendant,
+		axis_descendant_or_self,
+		axis_following,
+		axis_following_sibling,
+		axis_namespace,
+		axis_parent,
+		axis_preceding,
+		axis_preceding_sibling,
+		axis_self
+	};
+	
+	// Node tests of a location step, as interpreted by step_push.
+	enum nodetest_t
+	{
+		nodetest_none,
+		nodetest_name,					// element/attribute whose name equals the stored node test string
+		nodetest_type_node,				// any node
+		nodetest_type_comment,			// comment nodes
+		nodetest_type_pi,				// processing instruction nodes
+		nodetest_type_text,				// pcdata and cdata nodes
+		nodetest_pi,					// processing instruction whose name equals the stored node test string
+		nodetest_all,					// any element (node overload) or any attribute (attribute overload)
+		nodetest_all_in_namespace		// element/attribute whose name starts with the stored node test string
+	};
+
+	// Maps an axis_t value to a distinct type so that the axis can be passed as a template argument
+	// (step_fill and step_do read it back through T::axis).
+	template <axis_t N> struct axis_to_type
+	{
+		static const axis_t axis;
+	};
+
+	// out-of-class definition of the static member; its value is the template argument
+	template <axis_t N> const axis_t axis_to_type<N>::axis = N;
+		
+	class xpath_ast_node
+	{
+	private:
+		// node type
+		char _type;
+		char _rettype;
+
+		// for ast_step / ast_predicate
+		char _axis;
+		char _test;
+
+		// tree node structure
+		xpath_ast_node* _left;
+		xpath_ast_node* _right;
+		xpath_ast_node* _next;
+
+		union
+		{
+			// value for ast_string_constant
+			const char_t* string;
+			// value for ast_number_constant
+			double number;
+			// variable for ast_variable
+			xpath_variable* variable;
+			// node test for ast_step (node name/namespace/node type/pi target)
+			const char_t* nodetest;
+		} _data;
+
+		xpath_ast_node(const xpath_ast_node&);
+		xpath_ast_node& operator=(const xpath_ast_node&);
+
+		// Evaluates an equality-style comparison (comp) between two expressions, choosing the comparison
+		// domain from their static return types:
+		// - neither is a node set: compare as booleans if either is boolean, else as numbers if either is a number, else as strings;
+		// - both are node sets: true if comp holds for the string values of any pair of nodes;
+		// - exactly one is a node set: compare the other value against the node set (as boolean), or against each node's
+		//   string value (converted to a number when the other value is a number); true if comp holds for any node.
+		// NOTE(review): xpath_allocator_capture presumably releases the temporaries allocated from stack.result on scope exit - confirm.
+		template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+		{
+			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+			{
+				if (lt == xpath_type_boolean || rt == xpath_type_boolean)
+					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+				else if (lt == xpath_type_number || rt == xpath_type_number)
+					return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+				else if (lt == xpath_type_string || rt == xpath_type_string)
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					xpath_string ls = lhs->eval_string(c, stack);
+					xpath_string rs = rhs->eval_string(c, stack);
+
+					return comp(ls, rs);
+				}
+			}
+			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
+				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+				// compare every pair of nodes by string value
+				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture cri(stack.result);
+
+						if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
+							return true;
+					}
+
+				return false;
+			}
+			else
+			{
+				// exactly one operand is a node set; normalise so that it is always rhs.
+				// After the swap comp receives the operands in reversed order.
+				if (lt == xpath_type_node_set)
+				{
+					swap(lhs, rhs);
+					swap(lt, rt);
+				}
+
+				if (lt == xpath_type_boolean)
+					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+				else if (lt == xpath_type_number)
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					double l = lhs->eval_number(c, stack);
+					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture cri(stack.result);
+
+						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+							return true;
+					}
+
+					return false;
+				}
+				else if (lt == xpath_type_string)
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					xpath_string l = lhs->eval_string(c, stack);
+					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture cri(stack.result);
+
+						if (comp(l, string_value(*ri, stack.result)))
+							return true;
+					}
+
+					return false;
+				}
+			}
+
+			// reached only when a return type is none of boolean/number/string/node set
+			assert(!"Wrong types");
+			return false;
+		}
+
+		// Evaluates a relational comparison (comp) between two expressions. All comparisons are numeric:
+		// non-node-set operands are evaluated as numbers, and each node of a node-set operand contributes
+		// its string value converted to a number. With node sets involved the result is true if comp holds
+		// for any node (or any pair of nodes). Operand order is preserved when calling comp.
+		// NOTE(review): xpath_allocator_capture presumably releases the temporaries allocated from stack.result on scope exit - confirm.
+		template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+		{
+			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+				return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
+				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					// the left number is converted once per left node and reused for every right node
+					double l = convert_string_to_number(string_value(*li, stack.result).c_str());
+
+					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+					{
+						xpath_allocator_capture crii(stack.result);
+
+						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+							return true;
+					}
+				}
+
+				return false;
+			}
+			else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				double l = lhs->eval_number(c, stack);
+				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
+
+				for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+						return true;
+				}
+
+				return false;
+			}
+			else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
+				double r = rhs->eval_number(c, stack);
+
+				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
+						return true;
+				}
+
+				return false;
+			}
+			else
+			{
+				// the four branches above cover every combination, so this is not expected to be reached
+				assert(!"Wrong types");
+				return false;
+			}
+		}
+
+		// Filters the nodes of ns starting at index 'first' in place with a single predicate expression.
+		// A number-typed predicate keeps the node whose 1-based position equals the evaluated number;
+		// any other predicate keeps the nodes for which it evaluates to true.
+		void apply_predicate(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
+		{
+			assert(ns.size() >= first);
+
+			// number of nodes being filtered; this is the context size seen by the predicate
+			size_t size = ns.size() - first;
+
+			// surviving nodes are compacted towards the start of the filtered range
+			xpath_node* write = ns.begin() + first;
+
+			size_t position = 1;
+
+			for (xpath_node* read = write; read != ns.end(); ++read, ++position)
+			{
+				xpath_context c(*read, position, size);
+
+				bool keep;
+
+				if (expr->rettype() == xpath_type_number)
+					keep = (expr->eval_number(c, stack) == position);
+				else
+					keep = expr->eval_boolean(c, stack);
+
+				if (keep) *write++ = *read;
+			}
+
+			// drop everything after the last surviving node
+			ns.truncate(write);
+		}
+
+		// Applies the predicates chained from _right (linked through _next), in order,
+		// to the nodes of ns starting at index 'first'.
+		void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack)
+		{
+			// nothing to filter
+			if (ns.size() == first) return;
+
+			xpath_ast_node* pred = _right;
+
+			while (pred)
+			{
+				apply_predicate(ns, first, pred->_left, stack);
+
+				pred = pred->_next;
+			}
+		}
+
+		// Appends attribute 'a' (owned by 'parent') to ns if it passes this step's node test.
+		// Null attributes and namespace declarations are never added.
+		void step_push(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& parent, xpath_allocator* alloc)
+		{
+			if (!a) return;
+
+			const char_t* name = a.name();
+
+			// There are no attribute nodes corresponding to attributes that declare namespaces
+			// That is, "xmlns:..." or "xmlns"
+			if (starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')) return;
+
+			bool accepted;
+
+			switch (_test)
+			{
+			case nodetest_name:
+				accepted = strequal(name, _data.nodetest);
+				break;
+
+			case nodetest_type_node:
+			case nodetest_all:
+				accepted = true;
+				break;
+
+			case nodetest_all_in_namespace:
+				accepted = starts_with(name, _data.nodetest);
+				break;
+
+			default:
+				// the remaining node tests never match an attribute
+				accepted = false;
+			}
+
+			if (accepted) ns.push_back(xpath_node(a, parent), alloc);
+		}
+		
+		// Appends node 'n' to ns if it passes this step's node test. Null nodes are never added.
+		void step_push(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc)
+		{
+			if (!n) return;
+
+			bool accepted = false;
+
+			switch (_test)
+			{
+			case nodetest_name:
+				accepted = n.type() == node_element && strequal(n.name(), _data.nodetest);
+				break;
+
+			case nodetest_type_node:
+				accepted = true;
+				break;
+
+			case nodetest_type_comment:
+				accepted = n.type() == node_comment;
+				break;
+
+			case nodetest_type_text:
+				accepted = n.type() == node_pcdata || n.type() == node_cdata;
+				break;
+
+			case nodetest_type_pi:
+				accepted = n.type() == node_pi;
+				break;
+
+			case nodetest_pi:
+				accepted = n.type() == node_pi && strequal(n.name(), _data.nodetest);
+				break;
+
+			case nodetest_all:
+				accepted = n.type() == node_element;
+				break;
+
+			case nodetest_all_in_namespace:
+				accepted = n.type() == node_element && starts_with(n.name(), _data.nodetest);
+				break;
+
+			default:
+				// an unexpected node test value; nothing is added
+				assert(!"Unknown axis");
+			}
+
+			if (accepted) ns.push_back(n, alloc);
+		}
+
+		// Collects into ns the nodes reachable from context node n along the axis given by T::axis.
+		// Every candidate is passed through step_push, which applies this step's node test.
+		// The last (unnamed) parameter only carries the axis type for template deduction.
+		template <class T> void step_fill(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc, T)
+		{
+			const axis_t axis = T::axis;
+
+			switch (axis)
+			{
+			case axis_attribute:
+			{
+				for (xml_attribute a = n.first_attribute(); a; a = a.next_attribute())
+					step_push(ns, a, n, alloc);
+				
+				break;
+			}
+			
+			case axis_child:
+			{
+				for (xml_node c = n.first_child(); c; c = c.next_sibling())
+					step_push(ns, c, alloc);
+					
+				break;
+			}
+			
+			case axis_descendant:
+			case axis_descendant_or_self:
+			{
+				if (axis == axis_descendant_or_self)
+					step_push(ns, n, alloc);
+					
+				// iterative pre-order walk of the subtree below n
+				xml_node cur = n.first_child();
+				
+				while (cur && cur != n)
+				{
+					step_push(ns, cur, alloc);
+					
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						// climb until a node with a next sibling is found, stopping at n
+						while (!cur.next_sibling() && cur != n)
+							cur = cur.parent();
+					
+						if (cur != n) cur = cur.next_sibling();
+					}
+				}
+				
+				break;
+			}
+			
+			case axis_following_sibling:
+			{
+				for (xml_node c = n.next_sibling(); c; c = c.next_sibling())
+					step_push(ns, c, alloc);
+				
+				break;
+			}
+			
+			case axis_preceding_sibling:
+			{
+				// siblings are visited from nearest to farthest, i.e. in reverse document order
+				for (xml_node c = n.previous_sibling(); c; c = c.previous_sibling())
+					step_push(ns, c, alloc);
+				
+				break;
+			}
+			
+			case axis_following:
+			{
+				xml_node cur = n;
+
+				// exit from this node so that we don't include descendants
+				while (cur && !cur.next_sibling()) cur = cur.parent();
+				cur = cur.next_sibling();
+
+				// pre-order walk over everything after n; cur may be null on the first pass,
+				// in which case step_push ignores it and the loop ends via the break below
+				for (;;)
+				{
+					step_push(ns, cur, alloc);
+
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (cur && !cur.next_sibling()) cur = cur.parent();
+						cur = cur.next_sibling();
+
+						if (!cur) break;
+					}
+				}
+
+				break;
+			}
+
+			case axis_preceding:
+			{
+				xml_node cur = n;
+
+				// move to the nearest previous sibling of n or of one of its ancestors
+				while (cur && !cur.previous_sibling()) cur = cur.parent();
+				cur = cur.previous_sibling();
+
+				// reverse document order walk: descend to the last leaf first, then move backwards
+				for (;;)
+				{
+					if (cur.last_child())
+						cur = cur.last_child();
+					else
+					{
+						// leaf node, can't be ancestor
+						step_push(ns, cur, alloc);
+
+						if (cur.previous_sibling())
+							cur = cur.previous_sibling();
+						else
+						{
+							// climb up, adding each parent unless it is an ancestor of n
+							do 
+							{
+								cur = cur.parent();
+								if (!cur) break;
+
+								if (!node_is_ancestor(cur, n)) step_push(ns, cur, alloc);
+							}
+							while (!cur.previous_sibling());
+
+							cur = cur.previous_sibling();
+
+							if (!cur) break;
+						}
+					}
+				}
+
+				break;
+			}
+			
+			case axis_ancestor:
+			case axis_ancestor_or_self:
+			{
+				if (axis == axis_ancestor_or_self)
+					step_push(ns, n, alloc);
+
+				// parents are visited from nearest to farthest, i.e. in reverse document order
+				xml_node cur = n.parent();
+				
+				while (cur)
+				{
+					step_push(ns, cur, alloc);
+					
+					cur = cur.parent();
+				}
+				
+				break;
+			}
+
+			case axis_self:
+			{
+				step_push(ns, n, alloc);
+
+				break;
+			}
+
+			case axis_parent:
+			{
+				if (n.parent()) step_push(ns, n.parent(), alloc);
+
+				break;
+			}
+				
+			default:
+				// axis_namespace is not handled here
+				assert(!"Unimplemented axis");
+			}
+		}
+		
+		// Collects into ns the nodes reachable from context attribute a (owned by element p) along the axis
+		// given by T::axis. Only the axes listed in the switch are handled; any other axis trips the assertion.
+		// v carries the axis type and is forwarded when the node overload is reused.
+		template <class T> void step_fill(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& p, xpath_allocator* alloc, T v)
+		{
+			const axis_t axis = T::axis;
+
+			switch (axis)
+			{
+			case axis_ancestor:
+			case axis_ancestor_or_self:
+			{
+				if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
+					step_push(ns, a, p, alloc);
+
+				// the owning element and its parents, from nearest to farthest
+				xml_node cur = p;
+				
+				while (cur)
+				{
+					step_push(ns, cur, alloc);
+					
+					cur = cur.parent();
+				}
+				
+				break;
+			}
+
+			case axis_descendant_or_self:
+			case axis_self:
+			{
+				// only the attribute itself can be selected here
+				if (_test == nodetest_type_node) // reject attributes based on principal node type test
+					step_push(ns, a, p, alloc);
+
+				break;
+			}
+
+			case axis_following:
+			{
+				xml_node cur = p;
+				
+				// pre-order walk starting inside the owning element: cur is advanced before it is pushed,
+				// so p itself is skipped while its descendants and everything after it are visited
+				for (;;)
+				{
+					if (cur.first_child())
+						cur = cur.first_child();
+					else if (cur.next_sibling())
+						cur = cur.next_sibling();
+					else
+					{
+						while (cur && !cur.next_sibling()) cur = cur.parent();
+						cur = cur.next_sibling();
+						
+						if (!cur) break;
+					}
+
+					step_push(ns, cur, alloc);
+				}
+
+				break;
+			}
+
+			case axis_parent:
+			{
+				step_push(ns, p, alloc);
+
+				break;
+			}
+
+			case axis_preceding:
+			{
+				// preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
+				step_fill(ns, p, alloc, v);
+				break;
+			}
+			
+			default:
+				assert(!"Unimplemented axis");
+			}
+		}
+		
+		// Evaluates one location step for the axis given by T::axis and returns the resulting raw node set.
+		// When _left is set, the step is applied to every node of the set _left evaluates to;
+		// otherwise it is applied to the context node c.n. Predicates are applied through
+		// apply_predicates after each fill, and duplicates are removed at the end when needed.
+		template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, T v)
+		{
+			const axis_t axis = T::axis;
+			// axes for which an attribute context node is processed at all; for the remaining axes an attribute contributes nothing
+			bool attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self);
+
+			xpath_node_set_raw ns;
+			// ancestor and preceding style axes are tagged as reverse-sorted, all others as sorted
+			ns.set_type((axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling) ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted);
+
+			if (_left)
+			{
+				xpath_node_set_raw s = _left->eval_node_set(c, stack);
+
+				// self axis preserves the original order
+				if (axis == axis_self) ns.set_type(s.type());
+
+				for (const xpath_node* it = s.begin(); it != s.end(); ++it)
+				{
+					// remember how many nodes were present before this source node contributes
+					size_t size = ns.size();
+
+					// in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
+					if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
+					
+					if (it->node())
+						step_fill(ns, it->node(), stack.result, v);
+					else if (attributes)
+						step_fill(ns, it->attribute(), it->parent(), stack.result, v);
+						
+					// the pre-fill size is passed as the start offset (presumably so that only the newly added nodes are filtered)
+					apply_predicates(ns, size, stack);
+				}
+			}
+			else
+			{
+				if (c.n.node())
+					step_fill(ns, c.n.node(), stack.result, v);
+				else if (attributes)
+					step_fill(ns, c.n.attribute(), c.n.parent(), stack.result, v);
+				
+				apply_predicates(ns, 0, stack);
+			}
+
+			// child, attribute and self axes always generate unique set of nodes
+			// for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
+			if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
+				ns.remove_duplicates();
+
+			return ns;
+		}
+		
+	public:
+		// String-constant node: keeps the given pointer in _data.string (no copy is made here).
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value):
+			_type(static_cast<char>(type)),
+			_rettype(static_cast<char>(rettype_)),
+			_axis(0),
+			_test(0),
+			_left(0),
+			_right(0),
+			_next(0)
+		{
+			// this overload is reserved for string constants
+			assert(type == ast_string_constant);
+
+			_data.string = value;
+		}
+
+		// Number-constant node: stores the literal value in _data.number.
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value):
+			_type(static_cast<char>(type)),
+			_rettype(static_cast<char>(rettype_)),
+			_axis(0),
+			_test(0),
+			_left(0),
+			_right(0),
+			_next(0)
+		{
+			// this overload is reserved for number constants
+			assert(type == ast_number_constant);
+
+			_data.number = value;
+		}
+		
+		// Variable-reference node: keeps a pointer to the referenced variable in _data.variable.
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value):
+			_type(static_cast<char>(type)),
+			_rettype(static_cast<char>(rettype_)),
+			_axis(0),
+			_test(0),
+			_left(0),
+			_right(0),
+			_next(0)
+		{
+			// this overload is reserved for variable references
+			assert(type == ast_variable);
+
+			_data.variable = value;
+		}
+		
+		// Generic operator/function node with up to two operands; _data is left untouched.
+		xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0):
+			_type(static_cast<char>(type)),
+			_rettype(static_cast<char>(rettype_)),
+			_axis(0),
+			_test(0),
+			_left(left),
+			_right(right),
+			_next(0)
+		{
+		}
+
+		// Location-step node: the return type is always a node set; contents is kept in _data.nodetest.
+		xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
+			_type(static_cast<char>(type)),
+			_rettype(xpath_type_node_set),
+			_axis(static_cast<char>(axis)),
+			_test(static_cast<char>(test)),
+			_left(left),
+			_right(0),
+			_next(0)
+		{
+			_data.nodetest = contents;
+		}
+
+		// Replaces the node reachable through _next (the parser uses it to chain extra function arguments).
+		void set_next(xpath_ast_node* value)
+		{
+			this->_next = value;
+		}
+
+		// Replaces the right-hand child of this node.
+		void set_right(xpath_ast_node* value)
+		{
+			this->_right = value;
+		}
+
+		// Evaluates this node in context c and returns the result as a boolean.
+		// Node types that are boolean by nature are handled directly; every other node type is
+		// evaluated according to _rettype and converted (non-zero number test via
+		// convert_number_to_boolean, non-empty string, non-empty node set).
+		bool eval_boolean(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_op_or:
+				return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack);
+				
+			case ast_op_and:
+				return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack);
+				
+			case ast_op_equal:
+				return compare_eq(_left, _right, c, stack, equal_to());
+
+			case ast_op_not_equal:
+				return compare_eq(_left, _right, c, stack, not_equal_to());
+	
+			case ast_op_less:
+				return compare_rel(_left, _right, c, stack, less());
+			
+			// a > b is evaluated as b < a by swapping the operands
+			case ast_op_greater:
+				return compare_rel(_right, _left, c, stack, less());
+
+			case ast_op_less_or_equal:
+				return compare_rel(_left, _right, c, stack, less_equal());
+			
+			// a >= b is evaluated as b <= a by swapping the operands
+			case ast_op_greater_or_equal:
+				return compare_rel(_right, _left, c, stack, less_equal());
+
+			case ast_func_starts_with:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_string lr = _left->eval_string(c, stack);
+				xpath_string rr = _right->eval_string(c, stack);
+
+				return starts_with(lr.c_str(), rr.c_str());
+			}
+
+			case ast_func_contains:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_string lr = _left->eval_string(c, stack);
+				xpath_string rr = _right->eval_string(c, stack);
+
+				return find_substring(lr.c_str(), rr.c_str()) != 0;
+			}
+
+			case ast_func_boolean:
+				return _left->eval_boolean(c, stack);
+				
+			case ast_func_not:
+				return !_left->eval_boolean(c, stack);
+				
+			case ast_func_true:
+				return true;
+				
+			case ast_func_false:
+				return false;
+
+			case ast_func_lang:
+			{
+				// lang() is always false when the context node is an attribute
+				if (c.n.attribute()) return false;
+				
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_string lang = _left->eval_string(c, stack);
+				
+				// the nearest xml:lang attribute on the context node or one of its ancestors decides the result
+				for (xml_node n = c.n.node(); n; n = n.parent())
+				{
+					xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
+					
+					if (a)
+					{
+						const char_t* value = a.value();
+						
+						// strnicmp / strncasecmp is not portable
+						for (const char_t* lit = lang.c_str(); *lit; ++lit)
+						{
+							if (tolower_ascii(*lit) != tolower_ascii(*value)) return false;
+							++value;
+						}
+						
+						// the requested language must match the whole value or a prefix that ends at '-'
+						return *value == 0 || *value == '-';
+					}
+				}
+				
+				return false;
+			}
+
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_boolean)
+					return _data.variable->get_boolean();
+
+				// fallthrough to type conversion
+			}
+
+			default:
+			{
+				// not a boolean expression: evaluate with the node's own type and convert the result
+				switch (_rettype)
+				{
+				case xpath_type_number:
+					return convert_number_to_boolean(eval_number(c, stack));
+					
+				case xpath_type_string:
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return !eval_string(c, stack).empty();
+				}
+					
+				case xpath_type_node_set:				
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return !eval_node_set(c, stack).empty();
+				}
+
+				default:
+					assert(!"Wrong expression for return type boolean");
+					return false;
+				}
+			}
+			}
+		}
+
+		// Evaluates this node in context c and returns the result as a double.
+		// Arithmetic operators and number-valued functions are handled directly; every other node
+		// type is evaluated according to _rettype and converted (boolean to 1/0, string and node
+		// set through convert_string_to_number on their string value).
+		double eval_number(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_op_add:
+				return _left->eval_number(c, stack) + _right->eval_number(c, stack);
+				
+			case ast_op_subtract:
+				return _left->eval_number(c, stack) - _right->eval_number(c, stack);
+
+			case ast_op_multiply:
+				return _left->eval_number(c, stack) * _right->eval_number(c, stack);
+
+			case ast_op_divide:
+				return _left->eval_number(c, stack) / _right->eval_number(c, stack);
+
+			case ast_op_mod:
+				return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
+
+			case ast_op_negate:
+				return -_left->eval_number(c, stack);
+
+			case ast_number_constant:
+				return _data.number;
+
+			// last() and position() read the context size and position directly
+			case ast_func_last:
+				return static_cast<double>(c.size);
+			
+			case ast_func_position:
+				return static_cast<double>(c.position);
+
+			case ast_func_count:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return static_cast<double>(_left->eval_node_set(c, stack).size());
+			}
+			
+			// the _0 variants take no argument and operate on the context node c.n
+			case ast_func_string_length_0:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return static_cast<double>(string_value(c.n, stack.result).length());
+			}
+			
+			case ast_func_string_length_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return static_cast<double>(_left->eval_string(c, stack).length());
+			}
+			
+			case ast_func_number_0:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				return convert_string_to_number(string_value(c.n, stack.result).c_str());
+			}
+			
+			case ast_func_number_1:
+				return _left->eval_number(c, stack);
+
+			case ast_func_sum:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				double r = 0;
+				
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				
+				// add up the numeric value of every node's string value
+				for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
+				{
+					xpath_allocator_capture cri(stack.result);
+
+					r += convert_string_to_number(string_value(*it, stack.result).c_str());
+				}
+			
+				return r;
+			}
+
+			case ast_func_floor:
+			{
+				double r = _left->eval_number(c, stack);
+				
+				// r == r is false only for NaN, which is returned unchanged
+				return r == r ? floor(r) : r;
+			}
+
+			case ast_func_ceiling:
+			{
+				double r = _left->eval_number(c, stack);
+				
+				// r == r is false only for NaN, which is returned unchanged
+				return r == r ? ceil(r) : r;
+			}
+
+			case ast_func_round:
+				return round_nearest_nzero(_left->eval_number(c, stack));
+			
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_number)
+					return _data.variable->get_number();
+
+				// fallthrough to type conversion
+			}
+
+			default:
+			{
+				// not a number expression: evaluate with the node's own type and convert the result
+				switch (_rettype)
+				{
+				case xpath_type_boolean:
+					return eval_boolean(c, stack) ? 1 : 0;
+					
+				case xpath_type_string:
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return convert_string_to_number(eval_string(c, stack).c_str());
+				}
+					
+				case xpath_type_node_set:
+				{
+					xpath_allocator_capture cr(stack.result);
+
+					return convert_string_to_number(eval_string(c, stack).c_str());
+				}
+					
+				default:
+					assert(!"Wrong expression for return type number");
+					return 0;
+				}
+				
+			}
+			}
+		}
+		
+		// Evaluates concat(): the string values of _left and of every node on the _right/_next
+		// chain are joined, in that order, into one string allocated from stack.result.
+		xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack)
+		{
+			assert(_type == ast_func_concat);
+
+			// guard object bound to the temp allocator for the duration of this call
+			xpath_allocator_capture temp_guard(stack.temp);
+
+			// one piece for _left plus one per node on the _right chain
+			size_t piece_count = 1;
+
+			for (xpath_ast_node* arg = _right; arg; arg = arg->_next)
+				++piece_count;
+
+			// short argument lists use a local array, longer ones take storage from the temp allocator
+			xpath_string local_pieces[4];
+			const size_t local_capacity = sizeof(local_pieces) / sizeof(local_pieces[0]);
+
+			xpath_string* pieces = local_pieces;
+
+			if (piece_count > local_capacity)
+			{
+				pieces = static_cast<xpath_string*>(stack.temp->allocate(piece_count * sizeof(xpath_string)));
+				assert(pieces);
+			}
+
+			// arguments are evaluated with the two allocators exchanged, so their strings land in temp memory
+			xpath_stack swapped_stack = {stack.temp, stack.result};
+
+			pieces[0] = _left->eval_string(c, swapped_stack);
+
+			size_t filled = 1;
+
+			for (xpath_ast_node* arg = _right; arg; arg = arg->_next)
+			{
+				pieces[filled] = arg->eval_string(c, swapped_stack);
+				++filled;
+			}
+
+			assert(filled == piece_count);
+
+			// size of the joined string, terminator excluded
+			size_t total = 0;
+
+			for (size_t i = 0; i < piece_count; ++i)
+				total += pieces[i].length();
+
+			// the final buffer comes from the result allocator
+			char_t* result = static_cast<char_t*>(stack.result->allocate((total + 1) * sizeof(char_t)));
+			assert(result);
+
+			char_t* out = result;
+
+			for (size_t i = 0; i < piece_count; ++i)
+			{
+				const char_t* in = pieces[i].c_str();
+
+				while (*in) *out++ = *in++;
+			}
+
+			*out = 0;
+
+			return xpath_string(result, true);
+		}
+
+		// Evaluates this node in context c and returns the result as an xpath_string.
+		// String-valued functions are handled directly; every other node type is evaluated
+		// according to _rettype and converted to a string.
+		// Several branches evaluate their operands with a "swapped" stack (result and temp
+		// allocators exchanged) so that intermediate strings are taken from stack.temp while the
+		// value that is returned is built from stack.result.
+		xpath_string eval_string(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_string_constant:
+				return xpath_string_const(_data.string);
+			
+			// the *_0 variants take no argument and use the context node; the *_1 variants use the first node of the argument set
+			case ast_func_local_name_0:
+			{
+				xpath_node na = c.n;
+				
+				return xpath_string_const(local_name(na));
+			}
+
+			case ast_func_local_name_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				xpath_node na = ns.first();
+				
+				return xpath_string_const(local_name(na));
+			}
+
+			case ast_func_name_0:
+			{
+				xpath_node na = c.n;
+				
+				return xpath_string_const(qualified_name(na));
+			}
+
+			case ast_func_name_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				xpath_node na = ns.first();
+				
+				return xpath_string_const(qualified_name(na));
+			}
+
+			case ast_func_namespace_uri_0:
+			{
+				xpath_node na = c.n;
+				
+				return xpath_string_const(namespace_uri(na));
+			}
+
+			case ast_func_namespace_uri_1:
+			{
+				xpath_allocator_capture cr(stack.result);
+
+				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
+				xpath_node na = ns.first();
+				
+				return xpath_string_const(namespace_uri(na));
+			}
+
+			case ast_func_string_0:
+				return string_value(c.n, stack.result);
+
+			case ast_func_string_1:
+				return _left->eval_string(c, stack);
+
+			case ast_func_concat:
+				return eval_string_concat(c, stack);
+
+			case ast_func_substring_before:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				xpath_string p = _right->eval_string(c, swapped_stack);
+
+				const char_t* pos = find_substring(s.c_str(), p.c_str());
+				
+				// the part of s before the match is built from the result allocator; no match gives an empty string
+				return pos ? xpath_string(s.c_str(), pos, stack.result) : xpath_string();
+			}
+			
+			case ast_func_substring_after:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				xpath_string p = _right->eval_string(c, swapped_stack);
+				
+				const char_t* pos = find_substring(s.c_str(), p.c_str());
+				if (!pos) return xpath_string();
+
+				// the tail starts right behind the matched pattern
+				const char_t* result = pos + p.length();
+
+				// when s reports uses_heap() the tail is rebuilt with stack.result; otherwise the tail pointer is wrapped directly
+				return s.uses_heap() ? xpath_string(result, stack.result) : xpath_string_const(result);
+			}
+
+			case ast_func_substring_2:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				size_t s_length = s.length();
+
+				double first = round_nearest(_right->eval_number(c, stack));
+				
+				if (is_nan(first)) return xpath_string(); // NaN
+				else if (first >= s_length + 1) return xpath_string();
+				
+				// positions are 1-based; a start before the first character is clamped to 1
+				size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+				assert(1 <= pos && pos <= s_length + 1);
+
+				const char_t* rbegin = s.c_str() + (pos - 1);
+				
+				return s.uses_heap() ? xpath_string(rbegin, stack.result) : xpath_string_const(rbegin);
+			}
+			
+			case ast_func_substring_3:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				xpath_string s = _left->eval_string(c, swapped_stack);
+				size_t s_length = s.length();
+
+				// the third argument is a length and is reached through _right->_next; last is one past the final position
+				double first = round_nearest(_right->eval_number(c, stack));
+				double last = first + round_nearest(_right->_next->eval_number(c, stack));
+				
+				if (is_nan(first) || is_nan(last)) return xpath_string();
+				else if (first >= s_length + 1) return xpath_string();
+				else if (first >= last) return xpath_string();
+				else if (last < 1) return xpath_string();
+				
+				// clamp the 1-based range [pos, end) to the string
+				size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+				size_t end = last >= s_length + 1 ? s_length + 1 : static_cast<size_t>(last);
+
+				assert(1 <= pos && pos <= end && end <= s_length + 1);
+				const char_t* rbegin = s.c_str() + (pos - 1);
+				const char_t* rend = s.c_str() + (end - 1);
+
+				// a range that runs to the end of a string that does not report uses_heap() is wrapped directly; anything else is built with stack.result
+				return (end == s_length + 1 && !s.uses_heap()) ? xpath_string_const(rbegin) : xpath_string(rbegin, rend, stack.result);
+			}
+
+			case ast_func_normalize_space_0:
+			{
+				xpath_string s = string_value(c.n, stack.result);
+
+				// normalization is done in place on the buffer obtained from s.data()
+				normalize_space(s.data(stack.result));
+
+				return s;
+			}
+
+			case ast_func_normalize_space_1:
+			{
+				xpath_string s = _left->eval_string(c, stack);
+
+				normalize_space(s.data(stack.result));
+			
+				return s;
+			}
+
+			case ast_func_translate:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				// s is the value that gets returned, so it uses the regular stack; from/to use the swapped one
+				xpath_string s = _left->eval_string(c, stack);
+				xpath_string from = _right->eval_string(c, swapped_stack);
+				xpath_string to = _right->_next->eval_string(c, swapped_stack);
+
+				translate(s.data(stack.result), from.c_str(), to.c_str());
+
+				return s;
+			}
+
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_string)
+					return xpath_string_const(_data.variable->get_string());
+
+				// fallthrough to type conversion
+			}
+
+			default:
+			{
+				// not a string expression: evaluate with the node's own type and convert the result
+				switch (_rettype)
+				{
+				case xpath_type_boolean:
+					return xpath_string_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+					
+				case xpath_type_number:
+					return convert_number_to_string(eval_number(c, stack), stack.result);
+					
+				case xpath_type_node_set:
+				{
+					xpath_allocator_capture cr(stack.temp);
+
+					xpath_stack swapped_stack = {stack.temp, stack.result};
+
+					// a node set converts to the string value of its first() node, or to an empty string when the set is empty
+					xpath_node_set_raw ns = eval_node_set(c, swapped_stack);
+					return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result);
+				}
+				
+				default:
+					assert(!"Wrong expression for return type string");
+					return xpath_string();
+				}
+			}
+			}
+		}
+
+		// Evaluates this node in context c and returns the result as a raw node set.
+		// Handles unions, filter expressions, location steps (dispatched to step_do by axis),
+		// the root step and node-set variables. Any other node type is a usage error: it
+		// asserts and yields an empty set.
+		xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack)
+		{
+			switch (_type)
+			{
+			case ast_op_union:
+			{
+				xpath_allocator_capture cr(stack.temp);
+
+				xpath_stack swapped_stack = {stack.temp, stack.result};
+
+				// the left operand is evaluated with the swapped stack, the right one with the regular stack; the left nodes are then appended to the right set
+				xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack);
+				xpath_node_set_raw rs = _right->eval_node_set(c, stack);
+				
+				// we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
+				rs.set_type(xpath_node_set::type_unsorted);
+
+				rs.append(ls.begin(), ls.end(), stack.result);
+				rs.remove_duplicates();
+				
+				return rs;
+			}
+
+			case ast_filter:
+			case ast_filter_posinv:
+			{
+				xpath_node_set_raw set = _left->eval_node_set(c, stack);
+
+				// either expression is a number or it contains position() call; sort by document order
+				if (_type == ast_filter) set.sort_do();
+
+				apply_predicate(set, 0, _right, stack);
+			
+				return set;
+			}
+			
+			// id() always produces an empty node set in this implementation
+			case ast_func_id:
+				return xpath_node_set_raw();
+			
+			case ast_step:
+			{
+				// the axis is turned into a compile-time tag so that step_do/step_fill are instantiated once per axis
+				switch (_axis)
+				{
+				case axis_ancestor:
+					return step_do(c, stack, axis_to_type<axis_ancestor>());
+					
+				case axis_ancestor_or_self:
+					return step_do(c, stack, axis_to_type<axis_ancestor_or_self>());
+
+				case axis_attribute:
+					return step_do(c, stack, axis_to_type<axis_attribute>());
+
+				case axis_child:
+					return step_do(c, stack, axis_to_type<axis_child>());
+				
+				case axis_descendant:
+					return step_do(c, stack, axis_to_type<axis_descendant>());
+
+				case axis_descendant_or_self:
+					return step_do(c, stack, axis_to_type<axis_descendant_or_self>());
+
+				case axis_following:
+					return step_do(c, stack, axis_to_type<axis_following>());
+				
+				case axis_following_sibling:
+					return step_do(c, stack, axis_to_type<axis_following_sibling>());
+				
+				case axis_namespace:
+					// namespaced axis is not supported
+					return xpath_node_set_raw();
+				
+				case axis_parent:
+					return step_do(c, stack, axis_to_type<axis_parent>());
+				
+				case axis_preceding:
+					return step_do(c, stack, axis_to_type<axis_preceding>());
+
+				case axis_preceding_sibling:
+					return step_do(c, stack, axis_to_type<axis_preceding_sibling>());
+				
+				case axis_self:
+					return step_do(c, stack, axis_to_type<axis_self>());
+
+				default:
+					assert(!"Unknown axis");
+					return xpath_node_set_raw();
+				}
+			}
+
+			case ast_step_root:
+			{
+				assert(!_right); // root step can't have any predicates
+
+				xpath_node_set_raw ns;
+
+				ns.set_type(xpath_node_set::type_sorted);
+
+				// the root is looked up from the context node, or from the owning element when the context is an attribute
+				if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
+				else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
+
+				return ns;
+			}
+
+			case ast_variable:
+			{
+				assert(_rettype == _data.variable->type());
+
+				if (_rettype == xpath_type_node_set)
+				{
+					const xpath_node_set& s = _data.variable->get_node_set();
+
+					// the variable's nodes are copied into a fresh raw set that keeps the variable's ordering type
+					xpath_node_set_raw ns;
+
+					ns.set_type(s.type());
+					ns.append(s.begin(), s.end(), stack.result);
+
+					return ns;
+				}
+
+				// fallthrough to type conversion
+			}
+
+			default:
+				assert(!"Wrong expression for return type node set");
+				return xpath_node_set_raw();
+			}
+		}
+		
+		// Reports whether this expression can be treated as position-invariant.
+		// position() is reported as position-dependent; constants, variables, steps, predicates
+		// and filters are reported as invariant; any other node is invariant only when _left and
+		// every node on the _right/_next chain are.
+		bool is_posinv()
+		{
+			switch (_type)
+			{
+			case ast_func_position:
+				return false;
+
+			case ast_string_constant:
+			case ast_number_constant:
+			case ast_variable:
+			case ast_step:
+			case ast_step_root:
+			case ast_predicate:
+			case ast_filter:
+			case ast_filter_posinv:
+				return true;
+
+			default:
+				break;
+			}
+
+			// composite node: the left operand is checked first, then the argument chain
+			if (_left)
+			{
+				if (!_left->is_posinv()) return false;
+			}
+
+			xpath_ast_node* arg = _right;
+
+			while (arg)
+			{
+				if (!arg->is_posinv()) return false;
+
+				arg = arg->_next;
+			}
+
+			return true;
+		}
+
+		// Returns the value type of this expression, widened back from the compact char it is stored in.
+		xpath_value_type rettype() const
+		{
+			const xpath_value_type stored = static_cast<xpath_value_type>(_rettype);
+
+			return stored;
+		}
+	};
+
+	struct xpath_parser
+	{
+		xpath_allocator* _alloc;
+		xpath_lexer _lexer;
+
+		const char_t* _query;
+		xpath_variable_set* _variables;
+
+		xpath_parse_result* _result;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		jmp_buf _error_handler;
+	#endif
+
+		// Records message and the current lexer offset (relative to _query) in _result, then
+		// leaves the parser: via longjmp when PUGIXML_NO_EXCEPTIONS is defined, otherwise by
+		// throwing xpath_exception.
+		void throw_error(const char* message)
+		{
+			_result->error = message;
+
+			const char_t* here = _lexer.current_pos();
+			_result->offset = here - _query;
+
+		#ifndef PUGIXML_NO_EXCEPTIONS
+			throw xpath_exception(*_result);
+		#else
+			longjmp(_error_handler, 1);
+		#endif
+		}
+
+		// Signals an allocation failure: std::bad_alloc when exceptions are available,
+		// otherwise a regular parse error carrying the text "Out of memory".
+		void throw_error_oom()
+		{
+		#ifndef PUGIXML_NO_EXCEPTIONS
+			throw std::bad_alloc();
+		#else
+			throw_error("Out of memory");
+		#endif
+		}
+
+		// Obtains raw storage for one xpath_ast_node from _alloc; reports out-of-memory through throw_error_oom().
+		void* alloc_node()
+		{
+			void* memory = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+
+			if (memory) return memory;
+
+			throw_error_oom();
+
+			return memory;
+		}
+
+		// Copies the lexer range [value.begin, value.end) into a zero-terminated buffer taken
+		// from _alloc. Returns 0 when value.begin is null; reports out-of-memory through
+		// throw_error_oom().
+		const char_t* alloc_string(const xpath_lexer_string& value)
+		{
+			if (!value.begin) return 0;
+
+			size_t char_count = static_cast<size_t>(value.end - value.begin);
+
+			// one extra character for the terminator
+			char_t* copy = static_cast<char_t*>(_alloc->allocate_nothrow((char_count + 1) * sizeof(char_t)));
+			if (!copy) throw_error_oom();
+
+			memcpy(copy, value.begin, char_count * sizeof(char_t));
+			copy[char_count] = 0;
+
+			return copy;
+		}
+
+		// Builds the node for a string-valued function that takes either no argument (type0) or a
+		// single node-set argument (type1). A single argument of any other type is a parse error.
+		xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
+		{
+			assert(argc <= 1);
+
+			if (argc == 1)
+			{
+				if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+			}
+
+			ast_type_t chosen = (argc != 0) ? type1 : type0;
+
+			return new (alloc_node()) xpath_ast_node(chosen, xpath_type_string, args[0]);
+		}
+
+		// Creates the AST node for a call to the core function called name.
+		//   name - function name as lexed; it is dispatched on its first character and then compared in full
+		//   argc - number of arguments that were parsed
+		//   args - the first two argument nodes (unused slots are 0); further arguments are chained through _next by the caller
+		// Returns the newly allocated node. An unknown name or an argument count that does not
+		// fit the function is reported through throw_error(), as is a non-node-set argument to
+		// count()/sum() and to the functions built by parse_function_helper().
+		// The value type given to each node is the function's XPath 1.0 return type; the
+		// eval_* conversion fallbacks rely on it.
+		xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
+		{
+			switch (name.begin[0])
+			{
+			case 'b':
+				if (name == PUGIXML_TEXT("boolean") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
+					
+				break;
+			
+			case 'c':
+				if (name == PUGIXML_TEXT("count") && argc == 1)
+				{
+					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+					return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
+				}
+				else if (name == PUGIXML_TEXT("contains") && argc == 2)
+					// contains() is a boolean function; typing it as a string made string()/number() conversions of its result fail
+					return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("concat") && argc >= 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
+					
+				break;
+			
+			case 'f':
+				if (name == PUGIXML_TEXT("false") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+				else if (name == PUGIXML_TEXT("floor") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
+					
+				break;
+			
+			case 'i':
+				if (name == PUGIXML_TEXT("id") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
+					
+				break;
+			
+			case 'l':
+				if (name == PUGIXML_TEXT("last") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+				else if (name == PUGIXML_TEXT("lang") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
+				else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
+					return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+			
+				break;
+			
+			case 'n':
+				if (name == PUGIXML_TEXT("name") && argc <= 1)
+					return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+				else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
+					return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+				else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("not") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
+				else if (name == PUGIXML_TEXT("number") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
+			
+				break;
+			
+			case 'p':
+				if (name == PUGIXML_TEXT("position") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+				
+				break;
+			
+			case 'r':
+				if (name == PUGIXML_TEXT("round") && argc == 1)
+					return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
+
+				break;
+			
+			case 's':
+				if (name == PUGIXML_TEXT("string") && argc <= 1)
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
+				else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
+					// string-length() is a number function; typing it as a string made string()/boolean() conversions of its result fail
+					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]);
+				else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
+					return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
+					return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("sum") && argc == 1)
+				{
+					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+					return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
+				}
+
+				break;
+			
+			case 't':
+				if (name == PUGIXML_TEXT("translate") && argc == 3)
+					return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
+				else if (name == PUGIXML_TEXT("true") && argc == 0)
+					return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+					
+				break;
+
+			default:
+				break;
+			}
+
+			throw_error("Unrecognized function or wrong parameter count");
+
+			// not reached: throw_error() leaves the parser through longjmp or an exception
+			return 0;
+		}
+
+		// Maps an axis name to its axis_t value. specified is set to true when name is one of
+		// the known axis names; otherwise it is set to false and axis_child is returned.
+		axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
+		{
+			// every axis spelling the parser knows, paired with its enumerator
+			static const struct { const char_t* text; axis_t axis; } known_axes[] =
+			{
+				{PUGIXML_TEXT("ancestor"), axis_ancestor},
+				{PUGIXML_TEXT("ancestor-or-self"), axis_ancestor_or_self},
+				{PUGIXML_TEXT("attribute"), axis_attribute},
+				{PUGIXML_TEXT("child"), axis_child},
+				{PUGIXML_TEXT("descendant"), axis_descendant},
+				{PUGIXML_TEXT("descendant-or-self"), axis_descendant_or_self},
+				{PUGIXML_TEXT("following"), axis_following},
+				{PUGIXML_TEXT("following-sibling"), axis_following_sibling},
+				{PUGIXML_TEXT("namespace"), axis_namespace},
+				{PUGIXML_TEXT("parent"), axis_parent},
+				{PUGIXML_TEXT("preceding"), axis_preceding},
+				{PUGIXML_TEXT("preceding-sibling"), axis_preceding_sibling},
+				{PUGIXML_TEXT("self"), axis_self}
+			};
+
+			for (size_t i = 0; i < sizeof(known_axes) / sizeof(known_axes[0]); ++i)
+			{
+				if (name == known_axes[i].text)
+				{
+					specified = true;
+					return known_axes[i].axis;
+				}
+			}
+
+			// not an axis name: report that and fall back to the child axis
+			specified = false;
+			return axis_child;
+		}
+
+		// Maps a node type test name (comment, node, processing-instruction, text) to its
+		// nodetest_t value; any other name yields nodetest_none.
+		nodetest_t parse_node_test_type(const xpath_lexer_string& name)
+		{
+			if (name == PUGIXML_TEXT("comment")) return nodetest_type_comment;
+			if (name == PUGIXML_TEXT("node")) return nodetest_type_node;
+			if (name == PUGIXML_TEXT("processing-instruction")) return nodetest_type_pi;
+			if (name == PUGIXML_TEXT("text")) return nodetest_type_text;
+
+			return nodetest_none;
+		}
+
+		// PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
+		// Parses one primary expression starting at the current lexeme and returns its AST node.
+		// The lexer is left on the first lexeme after the expression. Every failure is reported
+		// through throw_error()/throw_error_oom().
+		xpath_ast_node* parse_primary_expression()
+		{
+			switch (_lexer.current())
+			{
+			case lex_var_ref:
+			{
+				xpath_lexer_string name = _lexer.contents();
+
+				if (!_variables)
+					throw_error("Unknown variable: variable set is not provided");
+
+				xpath_variable* var = get_variable(_variables, name.begin, name.end);
+
+				if (!var)
+					throw_error("Unknown variable: variable set does not contain the given name");
+
+				_lexer.next();
+
+				// the node takes its value type from the variable
+				return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
+			}
+
+			case lex_open_brace:
+			{
+				_lexer.next();
+
+				// a parenthesised expression adds no node of its own; the inner expression is returned as is
+				xpath_ast_node* n = parse_expression();
+
+				if (_lexer.current() != lex_close_brace)
+					throw_error("Unmatched braces");
+
+				_lexer.next();
+
+				return n;
+			}
+
+			case lex_quoted_string:
+			{
+				// the literal is copied with alloc_string before the lexer moves on
+				const char_t* value = alloc_string(_lexer.contents());
+
+				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
+				_lexer.next();
+
+				return n;
+			}
+
+			case lex_number:
+			{
+				double value = 0;
+
+				// a failed conversion is reported as out of memory
+				if (!convert_string_to_number(_lexer.contents().begin, _lexer.contents().end, &value))
+					throw_error_oom();
+
+				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
+				_lexer.next();
+
+				return n;
+			}
+
+			case lex_string:
+			{
+				// the first two arguments go into args; later ones are chained through set_next()
+				xpath_ast_node* args[2] = {0};
+				size_t argc = 0;
+				
+				xpath_lexer_string function = _lexer.contents();
+				_lexer.next();
+				
+				xpath_ast_node* last_arg = 0;
+				
+				if (_lexer.current() != lex_open_brace)
+					throw_error("Unrecognized function call");
+				_lexer.next();
+
+				// first argument, if the argument list is not empty
+				if (_lexer.current() != lex_close_brace)
+					args[argc++] = parse_expression();
+
+				// remaining arguments, each preceded by a comma
+				while (_lexer.current() != lex_close_brace)
+				{
+					if (_lexer.current() != lex_comma)
+						throw_error("No comma between function arguments");
+					_lexer.next();
+					
+					xpath_ast_node* n = parse_expression();
+					
+					// the second argument fills args[1]; from the third on, each one is linked behind its predecessor
+					if (argc < 2) args[argc] = n;
+					else last_arg->set_next(n);
+
+					argc++;
+					last_arg = n;
+				}
+				
+				_lexer.next();
+
+				return parse_function(function, argc, args);
+			}
+
+			default:
+				throw_error("Unrecognizable primary expression");
+
+				return 0;
+			}
+		}
+		
+		// FilterExpr ::= PrimaryExpr | FilterExpr Predicate
+		// Predicate ::= '[' PredicateExpr ']'
+		// PredicateExpr ::= Expr
+		//
+		// Parses a primary expression followed by any number of '[...]' predicates; each predicate wraps
+		// the expression parsed so far in a new filter node.
+		xpath_ast_node* parse_filter_expression()
+		{
+			xpath_ast_node* filtered = parse_primary_expression();
+
+			for (;;)
+			{
+				if (_lexer.current() != lex_open_square_brace) break;
+
+				_lexer.next();
+
+				xpath_ast_node* predicate = parse_expression();
+
+				if (filtered->rettype() != xpath_type_node_set)
+					throw_error("Predicate has to be applied to node set");
+
+				// a numeric predicate is never treated as position-invariant
+				ast_type_t filter_type = ast_filter;
+
+				if (predicate->rettype() != xpath_type_number && predicate->is_posinv())
+					filter_type = ast_filter_posinv;
+
+				filtered = new (alloc_node()) xpath_ast_node(filter_type, xpath_type_node_set, filtered, predicate);
+
+				if (_lexer.current() != lex_close_square_brace)
+					throw_error("Unmatched square brace");
+
+				_lexer.next();
+			}
+
+			return filtered;
+		}
+		
+		// Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
+		// AxisSpecifier ::= AxisName '::' | '@'?
+		// NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
+		// NameTest ::= '*' | NCName ':' '*' | QName
+		// AbbreviatedStep ::= '.' | '..'
+		//
+		// Parses one location step applied to 'set' (may be 0) and returns the resulting ast_step node.
+		// Predicates following the node test are chained: the first via set_right(), the rest via set_next().
+		xpath_ast_node* parse_step(xpath_ast_node* set)
+		{
+			if (set && set->rettype() != xpath_type_node_set)
+				throw_error("Step has to be applied to node set");
+
+			bool axis_specified = false;
+			axis_t axis = axis_child; // implied child axis
+
+			if (_lexer.current() == lex_axis_attribute)
+			{
+				axis = axis_attribute;
+				axis_specified = true;
+				
+				_lexer.next();
+			}
+			else if (_lexer.current() == lex_dot)
+			{
+				_lexer.next();
+				
+				// abbreviated step '.': self axis with a node() test; returned without parsing predicates
+				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+			}
+			else if (_lexer.current() == lex_double_dot)
+			{
+				_lexer.next();
+				
+				// abbreviated step '..': parent axis with a node() test; returned without parsing predicates
+				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+			}
+		
+			nodetest_t nt_type = nodetest_none;
+			xpath_lexer_string nt_name;
+			
+			if (_lexer.current() == lex_string)
+			{
+				// node name test
+				nt_name = _lexer.contents();
+				_lexer.next();
+
+				// was it an axis name?
+				if (_lexer.current() == lex_double_colon)
+				{
+					// parse axis name
+					if (axis_specified) throw_error("Two axis specifiers in one step");
+
+					// parse_axis_name reports through axis_specified whether the name was recognized
+					axis = parse_axis_name(nt_name, axis_specified);
+
+					if (!axis_specified) throw_error("Unknown axis");
+
+					// read actual node test
+					_lexer.next();
+
+					if (_lexer.current() == lex_multiply)
+					{
+						nt_type = nodetest_all;
+						nt_name = xpath_lexer_string();
+						_lexer.next();
+					}
+					else if (_lexer.current() == lex_string)
+					{
+						nt_name = _lexer.contents();
+						_lexer.next();
+					}
+					else throw_error("Unrecognized node test");
+				}
+				
+				// nt_type is still nodetest_none unless 'axis::*' was parsed above
+				if (nt_type == nodetest_none)
+				{
+					// node type test or processing-instruction
+					if (_lexer.current() == lex_open_brace)
+					{
+						_lexer.next();
+						
+						if (_lexer.current() == lex_close_brace)
+						{
+							_lexer.next();
+
+							nt_type = parse_node_test_type(nt_name);
+
+							if (nt_type == nodetest_none) throw_error("Unrecognized node type");
+							
+							// a node type test carries no name
+							nt_name = xpath_lexer_string();
+						}
+						else if (nt_name == PUGIXML_TEXT("processing-instruction"))
+						{
+							if (_lexer.current() != lex_quoted_string)
+								throw_error("Only literals are allowed as arguments to processing-instruction()");
+						
+							// the literal argument becomes the name the test is matched against
+							nt_type = nodetest_pi;
+							nt_name = _lexer.contents();
+							_lexer.next();
+							
+							if (_lexer.current() != lex_close_brace)
+								throw_error("Unmatched brace near processing-instruction()");
+							_lexer.next();
+						}
+						else
+							throw_error("Unmatched brace near node type test");
+
+					}
+					// QName or NCName:*
+					else
+					{
+						if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
+						{
+							nt_name.end--; // erase *
+							
+							nt_type = nodetest_all_in_namespace;
+						}
+						else nt_type = nodetest_name;
+					}
+				}
+			}
+			else if (_lexer.current() == lex_multiply)
+			{
+				nt_type = nodetest_all;
+				_lexer.next();
+			}
+			else throw_error("Unrecognized node test");
+			
+			xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
+			
+			// tail of the predicate chain; 0 until the first predicate has been attached
+			xpath_ast_node* last = 0;
+			
+			while (_lexer.current() == lex_open_square_brace)
+			{
+				_lexer.next();
+				
+				xpath_ast_node* expr = parse_expression();
+
+				xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, xpath_type_node_set, expr);
+				
+				if (_lexer.current() != lex_close_square_brace)
+					throw_error("Unmatched square brace");
+				_lexer.next();
+				
+				if (last) last->set_next(pred);
+				else n->set_right(pred);
+				
+				last = pred;
+			}
+			
+			return n;
+		}
+		
+		// RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
+		//
+		// Parses a chain of steps separated by '/' or '//', starting from 'set', and returns the last step node.
+		xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
+		{
+			xpath_ast_node* path = parse_step(set);
+
+			for (;;)
+			{
+				const lexeme_t separator = _lexer.current();
+
+				if (separator != lex_slash && separator != lex_double_slash) break;
+
+				_lexer.next();
+
+				// '//' inserts an implicit descendant-or-self::node() step before the next step
+				if (separator == lex_double_slash)
+					path = new (alloc_node()) xpath_ast_node(ast_step, path, axis_descendant_or_self, nodetest_type_node, 0);
+
+				path = parse_step(path);
+			}
+
+			return path;
+		}
+		
+		// LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
+		// AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
+		//
+		// Parses an absolute or relative location path and returns the node of its last step.
+		xpath_ast_node* parse_location_path()
+		{
+			const lexeme_t leading = _lexer.current();
+
+			// no leading '/' or '//': plain relative path without a context set
+			if (leading != lex_slash && leading != lex_double_slash)
+				return parse_relative_location_path(0);
+
+			_lexer.next();
+
+			xpath_ast_node* root = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+
+			if (leading == lex_double_slash)
+			{
+				// '//' requires a relative path and implies a descendant-or-self::node() step from the root
+				root = new (alloc_node()) xpath_ast_node(ast_step, root, axis_descendant_or_self, nodetest_type_node, 0);
+
+				return parse_relative_location_path(root);
+			}
+
+			// after a single '/', a relative location path can start from axis_attribute, dot, double_dot,
+			// multiply and string lexemes; any other lexeme means standalone root path
+			const lexeme_t following = _lexer.current();
+
+			const bool path_follows =
+				following == lex_string || following == lex_axis_attribute || following == lex_dot ||
+				following == lex_double_dot || following == lex_multiply;
+
+			if (!path_follows) return root;
+
+			return parse_relative_location_path(root);
+		}
+		
+		// PathExpr ::= LocationPath
+		//				| FilterExpr
+		//				| FilterExpr '/' RelativeLocationPath
+		//				| FilterExpr '//' RelativeLocationPath
+		//
+		// Decides between a location path and a filter expression by looking at the current lexeme
+		// (and, for names, at the next non-space character of the query).
+		xpath_ast_node* parse_path_expression()
+		{
+			// A filter expression starts with a primary expression, i.e. with a variable reference ('$'),
+			// an opening brace, a string literal, a number or a function call (a name).
+			// Everything else has to be a location path.
+			const lexeme_t first = _lexer.current();
+
+			const bool may_be_primary =
+				first == lex_var_ref || first == lex_open_brace || first == lex_quoted_string ||
+				first == lex_number || first == lex_string;
+
+			if (!may_be_primary) return parse_location_path();
+
+			if (first == lex_string)
+			{
+				// A name is a function call only when it is followed by '(' (whitespace allowed in between)
+				const char_t* lookahead = _lexer.state();
+
+				while (PUGI__IS_CHARTYPE(*lookahead, ct_space)) ++lookahead;
+
+				if (*lookahead != '(') return parse_location_path();
+
+				// name '(' can still be a node type test such as node() or text(); those belong to location paths
+				if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path();
+			}
+
+			xpath_ast_node* filter = parse_filter_expression();
+
+			const lexeme_t separator = _lexer.current();
+
+			// no path tail: the filter expression is the whole path expression
+			if (separator != lex_slash && separator != lex_double_slash) return filter;
+
+			_lexer.next();
+
+			if (separator == lex_double_slash)
+			{
+				if (filter->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set");
+
+				filter = new (alloc_node()) xpath_ast_node(ast_step, filter, axis_descendant_or_self, nodetest_type_node, 0);
+			}
+
+			// the remaining steps select from the filter expression result
+			return parse_relative_location_path(filter);
+		}
+
+		// UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
+		//
+		// Parses path expressions joined by '|'; both operands of every union have to be node sets.
+		xpath_ast_node* parse_union_expression()
+		{
+			xpath_ast_node* result = parse_path_expression();
+
+			while (_lexer.current() == lex_union)
+			{
+				_lexer.next();
+
+				// the right-hand side is parsed recursively, so it consumes any further '|' operators itself
+				xpath_ast_node* rhs = parse_union_expression();
+
+				const bool both_node_sets =
+					result->rettype() == xpath_type_node_set && rhs->rettype() == xpath_type_node_set;
+
+				if (!both_node_sets)
+					throw_error("Union operator has to be applied to node sets");
+
+				result = new (alloc_node()) xpath_ast_node(ast_op_union, xpath_type_node_set, result, rhs);
+			}
+
+			return result;
+		}
+
+		// UnaryExpr ::= UnionExpr | '-' UnaryExpr
+		//
+		// Parses an optionally negated union expression; each leading '-' adds one negate node.
+		xpath_ast_node* parse_unary_expression()
+		{
+			if (_lexer.current() != lex_minus)
+				return parse_union_expression();
+
+			_lexer.next();
+
+			xpath_ast_node* operand = parse_unary_expression();
+
+			return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, operand);
+		}
+		
+		// MultiplicativeExpr ::= UnaryExpr
+		//						  | MultiplicativeExpr '*' UnaryExpr
+		//						  | MultiplicativeExpr 'div' UnaryExpr
+		//						  | MultiplicativeExpr 'mod' UnaryExpr
+		//
+		// Parses a left-associative chain of '*', 'div' and 'mod' operations.
+		xpath_ast_node* parse_multiplicative_expression()
+		{
+			xpath_ast_node* result = parse_unary_expression();
+
+			for (;;)
+			{
+				ast_type_t op = ast_op_multiply;
+
+				if (_lexer.current() == lex_multiply)
+					op = ast_op_multiply;
+				else if (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("div"))
+					op = ast_op_divide;
+				else if (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("mod"))
+					op = ast_op_mod;
+				else
+					break; // not a multiplicative operator
+
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_unary_expression();
+
+				result = new (alloc_node()) xpath_ast_node(op, xpath_type_number, result, rhs);
+			}
+
+			return result;
+		}
+
+		// AdditiveExpr ::= MultiplicativeExpr
+		//					| AdditiveExpr '+' MultiplicativeExpr
+		//					| AdditiveExpr '-' MultiplicativeExpr
+		//
+		// Parses a left-associative chain of '+' and '-' operations.
+		xpath_ast_node* parse_additive_expression()
+		{
+			xpath_ast_node* result = parse_multiplicative_expression();
+
+			for (;;)
+			{
+				const lexeme_t token = _lexer.current();
+
+				if (token != lex_plus && token != lex_minus) break;
+
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_multiplicative_expression();
+
+				const ast_type_t op = (token == lex_plus) ? ast_op_add : ast_op_subtract;
+
+				result = new (alloc_node()) xpath_ast_node(op, xpath_type_number, result, rhs);
+			}
+
+			return result;
+		}
+
+		// RelationalExpr ::= AdditiveExpr
+		//					  | RelationalExpr '<' AdditiveExpr
+		//					  | RelationalExpr '>' AdditiveExpr
+		//					  | RelationalExpr '<=' AdditiveExpr
+		//					  | RelationalExpr '>=' AdditiveExpr
+		//
+		// Parses a left-associative chain of ordering comparisons; every comparison node is boolean.
+		xpath_ast_node* parse_relational_expression()
+		{
+			xpath_ast_node* result = parse_additive_expression();
+
+			for (;;)
+			{
+				const lexeme_t token = _lexer.current();
+
+				ast_type_t op = ast_op_less;
+
+				if (token == lex_less) op = ast_op_less;
+				else if (token == lex_greater) op = ast_op_greater;
+				else if (token == lex_less_or_equal) op = ast_op_less_or_equal;
+				else if (token == lex_greater_or_equal) op = ast_op_greater_or_equal;
+				else break; // not a relational operator
+
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_additive_expression();
+
+				result = new (alloc_node()) xpath_ast_node(op, xpath_type_boolean, result, rhs);
+			}
+
+			return result;
+		}
+		
+		// EqualityExpr ::= RelationalExpr
+		//					| EqualityExpr '=' RelationalExpr
+		//					| EqualityExpr '!=' RelationalExpr
+		//
+		// Parses a left-associative chain of '=' and '!=' comparisons; every comparison node is boolean.
+		xpath_ast_node* parse_equality_expression()
+		{
+			xpath_ast_node* result = parse_relational_expression();
+
+			for (;;)
+			{
+				const lexeme_t token = _lexer.current();
+
+				if (token != lex_equal && token != lex_not_equal) break;
+
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_relational_expression();
+
+				const ast_type_t op = (token == lex_equal) ? ast_op_equal : ast_op_not_equal;
+
+				result = new (alloc_node()) xpath_ast_node(op, xpath_type_boolean, result, rhs);
+			}
+
+			return result;
+		}
+		
+		// AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
+		//
+		// Parses a left-associative chain of 'and' operations; 'and' arrives from the lexer as a plain name.
+		xpath_ast_node* parse_and_expression()
+		{
+			xpath_ast_node* result = parse_equality_expression();
+
+			for (;;)
+			{
+				const bool is_and = _lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("and");
+
+				if (!is_and) break;
+
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_equality_expression();
+
+				result = new (alloc_node()) xpath_ast_node(ast_op_and, xpath_type_boolean, result, rhs);
+			}
+
+			return result;
+		}
+
+		// OrExpr ::= AndExpr | OrExpr 'or' AndExpr
+		//
+		// Parses a left-associative chain of 'or' operations; 'or' arrives from the lexer as a plain name.
+		xpath_ast_node* parse_or_expression()
+		{
+			xpath_ast_node* result = parse_and_expression();
+
+			for (;;)
+			{
+				const bool is_or = _lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("or");
+
+				if (!is_or) break;
+
+				_lexer.next();
+
+				xpath_ast_node* rhs = parse_and_expression();
+
+				result = new (alloc_node()) xpath_ast_node(ast_op_or, xpath_type_boolean, result, rhs);
+			}
+
+			return result;
+		}
+		
+		// Expr ::= OrExpr
+		// Entry point of the expression grammar; simply forwards to parse_or_expression().
+		xpath_ast_node* parse_expression()
+		{
+			return parse_or_expression();
+		}
+
+		// Stores the allocator, variable set and result sink, and initializes the lexer with the query text.
+		xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result):
+			_alloc(alloc),
+			_lexer(query),
+			_query(query),
+			_variables(variables),
+			_result(result)
+		{
+		}
+
+		// Parses the whole query and returns the root of its AST.
+		// Tokens left over after the top-level expression are reported as "Incorrect query".
+		xpath_ast_node* parse()
+		{
+			xpath_ast_node* root = parse_expression();
+
+			// the expression must consume the entire input
+			if (_lexer.current() == lex_eof) return root;
+
+			throw_error("Incorrect query");
+
+			return root;
+		}
+
+		// Convenience entry point: builds a parser for 'query' and runs it, returning the AST root.
+		// With PUGIXML_NO_EXCEPTIONS defined, a parse error arrives here through parser._error_handler
+		// and the function returns 0; otherwise errors propagate out of parser.parse().
+		static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
+		{
+			xpath_parser parser(query, variables, alloc, result);
+
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			// setjmp returns 0 on the initial call and non-zero when re-entered after an error
+			int error = setjmp(parser._error_handler);
+
+			return (error == 0) ? parser.parse() : 0;
+		#else
+			return parser.parse();
+		#endif
+		}
+	};
+
+	// Backing object of a compiled XPath query: the AST root plus the allocator that stores the AST.
+	// The first allocator page ('block') is embedded in the object itself.
+	struct xpath_query_impl
+	{
+		// Allocates and constructs an implementation object.
+		// Returns 0 when xml_memory::allocate fails; callers are expected to check for that.
+		static xpath_query_impl* create()
+		{
+			void* memory = xml_memory::allocate(sizeof(xpath_query_impl));
+
+			// never hand a null pointer to placement new: the constructor would run on address 0
+			if (!memory) return 0;
+
+			return new (memory) xpath_query_impl();
+		}
+
+		// Releases an object obtained from create(); accepts a null pointer.
+		static void destroy(void* ptr)
+		{
+			if (!ptr) return;
+			
+			// free all allocated pages
+			static_cast<xpath_query_impl*>(ptr)->alloc.release();
+
+			// free allocator memory (with the first page)
+			xml_memory::deallocate(ptr);
+		}
+
+		// Starts with no AST and an allocator whose first page is the embedded block.
+		xpath_query_impl(): root(0), alloc(&block)
+		{
+			block.next = 0;
+		}
+
+		xpath_ast_node* root; // root of the parsed query, 0 until parsing succeeds
+		xpath_allocator alloc; // allocator used for AST nodes
+		xpath_memory_block block; // first allocator page, lives inside this object
+	};
+
+	// Evaluates the query as a string for context node 'n', using the evaluation stacks in 'sd'.
+	// Returns an empty xpath_string when 'impl' is null or, with PUGIXML_NO_EXCEPTIONS defined,
+	// when evaluation jumps back to sd.error_handler.
+	PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
+	{
+		if (!impl) return xpath_string();
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return xpath_string();
+	#endif
+
+		// context for the evaluation: node n with the two numeric arguments set to 1
+		xpath_context c(n, 1, 1);
+
+		return impl->root->eval_string(c, sd.stack);
+	}
+PUGI__NS_END
+
+namespace pugi
+{
+#ifndef PUGIXML_NO_EXCEPTIONS
+	// Wraps a parse result in an exception; the result must describe an error (error != 0).
+	PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_)
+	{
+		assert(_result.error);
+	}
+	
+	// Returns the error message stored in the wrapped parse result.
+	PUGI__FN const char* xpath_exception::what() const throw()
+	{
+		return _result.error;
+	}
+
+	// Gives access to the complete parse result carried by the exception.
+	PUGI__FN const xpath_parse_result& xpath_exception::result() const
+	{
+		return _result;
+	}
+#endif
+	
+	// Constructs an XPath node with default-constructed node and attribute handles.
+	PUGI__FN xpath_node::xpath_node()
+	{
+	}
+		
+	// Constructs an XPath node that refers to an XML node; the attribute handle is default-constructed.
+	PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_)
+	{
+	}
+		
+	// Constructs an XPath node that refers to an attribute and its parent element.
+	// The parent is only stored when attribute_ tests true; otherwise a default xml_node is stored.
+	PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? parent_ : xml_node()), _attribute(attribute_)
+	{
+	}
+
+	// Returns the XML node this XPath node refers to, or a default xml_node when it refers to an attribute.
+	PUGI__FN xml_node xpath_node::node() const
+	{
+		// for attribute nodes _node holds the parent, which must not be exposed here
+		if (_attribute) return xml_node();
+
+		return _node;
+	}
+		
+	// Returns the stored attribute handle.
+	PUGI__FN xml_attribute xpath_node::attribute() const
+	{
+		return _attribute;
+	}
+	
+	// Returns the parent: the stored node for attributes, the stored node's parent otherwise.
+	PUGI__FN xml_node xpath_node::parent() const
+	{
+		if (_attribute) return _node;
+
+		return _node.parent();
+	}
+
+	// Empty function whose address serves as the non-null value of xpath_node's unspecified_bool_type conversion.
+	PUGI__FN static void unspecified_bool_xpath_node(xpath_node***)
+	{
+	}
+
+	// Boolean conversion: non-null when either the node or the attribute handle tests true.
+	PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const
+	{
+		return (_node || _attribute) ? unspecified_bool_xpath_node : 0;
+	}
+	
+	// True when neither the node nor the attribute handle tests true.
+	PUGI__FN bool xpath_node::operator!() const
+	{
+		return !(_node || _attribute);
+	}
+
+	// Two XPath nodes are equal when both their node and attribute handles compare equal.
+	PUGI__FN bool xpath_node::operator==(const xpath_node& n) const
+	{
+		return _node == n._node && _attribute == n._attribute;
+	}
+	
+	// Two XPath nodes differ when their node handles or their attribute handles differ.
+	PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const
+	{
+		return _node != n._node || _attribute != n._attribute;
+	}
+
+#ifdef __BORLANDC__
+	// Compiled for Borland C++ only: logical AND of an xpath_node (converted to bool) and a bool.
+	// NOTE(review): being an overloaded operator, this evaluates both operands (no short-circuit).
+	PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs)
+	{
+		return (bool)lhs && rhs;
+	}
+
+	// Compiled for Borland C++ only: logical OR of an xpath_node (converted to bool) and a bool.
+	// NOTE(review): being an overloaded operator, this evaluates both operands (no short-circuit).
+	PUGI__FN bool operator||(const xpath_node& lhs, bool rhs)
+	{
+		return (bool)lhs || rhs;
+	}
+#endif
+
+	// Replaces the contents of the set with a copy of [begin_, end_).
+	// Up to one node is kept in the inline _storage slot; larger ranges are copied to a heap buffer.
+	// On allocation failure the set is left unchanged (and std::bad_alloc is thrown unless
+	// PUGIXML_NO_EXCEPTIONS is defined).
+	PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_)
+	{
+		assert(begin_ <= end_);
+
+		const size_t count = static_cast<size_t>(end_ - begin_);
+
+		if (count > 1)
+		{
+			// build the heap copy before the old buffer is touched
+			void* memory = impl::xml_memory::allocate(count * sizeof(xpath_node));
+
+			if (!memory)
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				return;
+			#else
+				throw std::bad_alloc();
+			#endif
+			}
+
+			xpath_node* heap_copy = static_cast<xpath_node*>(memory);
+
+			memcpy(heap_copy, begin_, count * sizeof(xpath_node));
+
+			// the previous heap buffer (if any) is no longer needed
+			if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+			_begin = heap_copy;
+			_end = heap_copy + count;
+
+			return;
+		}
+
+		// zero or one node: the inline slot is sufficient, so drop any heap buffer
+		if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+		if (count == 1) _storage = *begin_;
+
+		_begin = &_storage;
+		_end = &_storage + count;
+	}
+
+	// Constructs an empty, unsorted set whose range points at the inline storage slot.
+	PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage)
+	{
+	}
+
+	// Constructs a set holding a copy of [begin_, end_) and records the given ordering type.
+	PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_), _begin(&_storage), _end(&_storage)
+	{
+		_assign(begin_, end_);
+	}
+
+	// Frees the heap buffer, if one is in use; the inline storage slot needs no cleanup.
+	PUGI__FN xpath_node_set::~xpath_node_set()
+	{
+		if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+	}
+		
+	// Copy constructor: starts out empty, then copies the nodes of ns via _assign.
+	PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage)
+	{
+		_assign(ns._begin, ns._end);
+	}
+	
+	// Copy assignment: takes over the ordering type and a copy of the nodes of ns.
+	PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
+	{
+		// self-assignment leaves the set as it is
+		if (this != &ns)
+		{
+			_type = ns._type;
+			_assign(ns._begin, ns._end);
+		}
+
+		return *this;
+	}
+
+	// Returns the ordering type recorded for this set.
+	PUGI__FN xpath_node_set::type_t xpath_node_set::type() const
+	{
+		return _type;
+	}
+		
+	// Returns the number of nodes in the set.
+	PUGI__FN size_t xpath_node_set::size() const
+	{
+		return _end - _begin;
+	}
+		
+	// True when the set holds no nodes.
+	PUGI__FN bool xpath_node_set::empty() const
+	{
+		return _begin == _end;
+	}
+		
+	// Returns the node at the given position; the index is only checked by an assertion.
+	PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const
+	{
+		assert(index < size());
+		return _begin[index];
+	}
+
+	// Returns an iterator to the first node of the set.
+	PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const
+	{
+		return _begin;
+	}
+		
+	// Returns the past-the-end iterator of the set.
+	PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const
+	{
+		return _end;
+	}
+	
+	// Sorts the set through impl::xpath_sort and stores the ordering type it reports.
+	PUGI__FN void xpath_node_set::sort(bool reverse)
+	{
+		_type = impl::xpath_sort(_begin, _end, _type, reverse);
+	}
+
+	// Returns the node selected by impl::xpath_first for the current range and ordering type.
+	PUGI__FN xpath_node xpath_node_set::first() const
+	{
+		return impl::xpath_first(_begin, _end, _type);
+	}
+
+	// A fresh result starts in the error state ("Internal error") at offset 0;
+	// the error field has to be reset to 0 to mark success.
+	PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0)
+	{
+	}
+
+	// True when no error message is set.
+	PUGI__FN xpath_parse_result::operator bool() const
+	{
+		return error == 0;
+	}
+
+	PUGI__FN const char* xpath_parse_result::description() const
+	{
+		return error ? error : "No error";
+	}
+
+	// Leaves all members untouched; xpath_variable_set::add fills in _type and _next after creation.
+	PUGI__FN xpath_variable::xpath_variable()
+	{
+	}
+
+	// Returns the variable name, which is stored in the type-specific derived object.
+	// An unknown type triggers an assertion and yields 0.
+	PUGI__FN const char_t* xpath_variable::name() const
+	{
+		if (_type == xpath_type_node_set)
+			return static_cast<const impl::xpath_variable_node_set*>(this)->name;
+
+		if (_type == xpath_type_number)
+			return static_cast<const impl::xpath_variable_number*>(this)->name;
+
+		if (_type == xpath_type_string)
+			return static_cast<const impl::xpath_variable_string*>(this)->name;
+
+		if (_type == xpath_type_boolean)
+			return static_cast<const impl::xpath_variable_boolean*>(this)->name;
+
+		assert(!"Invalid variable type");
+		return 0;
+	}
+
+	// Returns the value type of the variable.
+	PUGI__FN xpath_value_type xpath_variable::type() const
+	{
+		return _type;
+	}
+
+	// Returns the stored value of a boolean variable; any other variable type yields false.
+	PUGI__FN bool xpath_variable::get_boolean() const
+	{
+		if (_type != xpath_type_boolean) return false;
+
+		return static_cast<const impl::xpath_variable_boolean*>(this)->value;
+	}
+
+	// Returns the stored value of a number variable; any other variable type yields impl::gen_nan().
+	PUGI__FN double xpath_variable::get_number() const
+	{
+		if (_type != xpath_type_number) return impl::gen_nan();
+
+		return static_cast<const impl::xpath_variable_number*>(this)->value;
+	}
+
+	// Returns the stored value of a string variable; an unset value or any other variable type
+	// yields an empty string (never a null pointer).
+	PUGI__FN const char_t* xpath_variable::get_string() const
+	{
+		if (_type == xpath_type_string)
+		{
+			const char_t* stored = static_cast<const impl::xpath_variable_string*>(this)->value;
+
+			if (stored) return stored;
+		}
+
+		return PUGIXML_TEXT("");
+	}
+
+	// Returns the stored value of a node set variable; any other variable type yields impl::dummy_node_set.
+	PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const
+	{
+		if (_type != xpath_type_node_set) return impl::dummy_node_set;
+
+		return static_cast<const impl::xpath_variable_node_set*>(this)->value;
+	}
+
+	// Stores a boolean value; returns false (and changes nothing) when the variable is not boolean.
+	PUGI__FN bool xpath_variable::set(bool value)
+	{
+		const bool type_matches = (_type == xpath_type_boolean);
+
+		if (type_matches)
+			static_cast<impl::xpath_variable_boolean*>(this)->value = value;
+
+		return type_matches;
+	}
+
+	// Stores a numeric value; returns false (and changes nothing) when the variable is not a number.
+	PUGI__FN bool xpath_variable::set(double value)
+	{
+		const bool type_matches = (_type == xpath_type_number);
+
+		if (type_matches)
+			static_cast<impl::xpath_variable_number*>(this)->value = value;
+
+		return type_matches;
+	}
+
+	// Stores a private copy of the given string.
+	// Returns false when the variable is not a string or the copy cannot be allocated;
+	// in both cases the previous value is kept.
+	PUGI__FN bool xpath_variable::set(const char_t* value)
+	{
+		if (_type != xpath_type_string) return false;
+
+		// size of the copy in bytes, terminator included
+		const size_t byte_count = (impl::strlength(value) + 1) * sizeof(char_t);
+
+		void* memory = impl::xml_memory::allocate(byte_count);
+
+		if (!memory) return false;
+
+		char_t* duplicate = static_cast<char_t*>(memory);
+
+		memcpy(duplicate, value, byte_count);
+
+		// release the old string only after the new one is ready
+		impl::xpath_variable_string* self = static_cast<impl::xpath_variable_string*>(this);
+
+		if (self->value) impl::xml_memory::deallocate(self->value);
+
+		self->value = duplicate;
+
+		return true;
+	}
+
+	// Stores a copy of the node set; returns false (and changes nothing) when the variable is not a node set.
+	PUGI__FN bool xpath_variable::set(const xpath_node_set& value)
+	{
+		const bool type_matches = (_type == xpath_type_node_set);
+
+		if (type_matches)
+			static_cast<impl::xpath_variable_node_set*>(this)->value = value;
+
+		return type_matches;
+	}
+
+	// Constructs an empty variable set: every hash bucket starts out as a null list head.
+	PUGI__FN xpath_variable_set::xpath_variable_set()
+	{
+		const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+
+		for (size_t bucket = 0; bucket != bucket_count; ++bucket)
+		{
+			_data[bucket] = 0;
+		}
+	}
+
+	// Destroys every variable in every bucket list.
+	PUGI__FN xpath_variable_set::~xpath_variable_set()
+	{
+		const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+
+		for (size_t bucket = 0; bucket < bucket_count; ++bucket)
+		{
+			xpath_variable* current = _data[bucket];
+
+			while (current)
+			{
+				// remember the successor before the current variable is deleted
+				xpath_variable* successor = current->_next;
+
+				impl::delete_xpath_variable(current->_type, current);
+
+				current = successor;
+			}
+		}
+	}
+
+	// Looks up a variable by name; returns 0 when the set has no variable with that name.
+	PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const
+	{
+		const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+		const size_t bucket = impl::hash_string(name) % bucket_count;
+
+		// walk the bucket list until the name matches or the list ends
+		xpath_variable* candidate = _data[bucket];
+
+		while (candidate && !impl::strequal(candidate->name(), name))
+			candidate = candidate->_next;
+
+		return candidate;
+	}
+
+	// Returns the variable with the given name and type, creating it when it does not exist yet.
+	// Returns 0 when a variable with that name exists with a different type, or when creation fails.
+	PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
+	{
+		const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+		const size_t bucket = impl::hash_string(name) % bucket_count;
+
+		// reuse an existing variable of the same name if its type agrees
+		xpath_variable* existing = _data[bucket];
+
+		while (existing && !impl::strequal(existing->name(), name))
+			existing = existing->_next;
+
+		if (existing)
+			return (existing->type() == type) ? existing : 0;
+
+		// otherwise create a new variable and link it in at the head of the bucket list
+		xpath_variable* created = impl::new_xpath_variable(type, name);
+
+		if (!created) return 0;
+
+		created->_type = type;
+		created->_next = _data[bucket];
+
+		_data[bucket] = created;
+
+		return created;
+	}
+
+	// Sets (creating if necessary) the boolean variable 'name'; returns false when add() or the assignment fails.
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value)
+	{
+		xpath_variable* target = add(name, xpath_type_boolean);
+
+		if (!target) return false;
+
+		return target->set(value);
+	}
+
+	// Sets (creating if necessary) the number variable 'name'; returns false when add() or the assignment fails.
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, double value)
+	{
+		xpath_variable* target = add(name, xpath_type_number);
+
+		if (!target) return false;
+
+		return target->set(value);
+	}
+
+	// Sets (creating if necessary) the string variable 'name'; returns false when add() or the assignment fails.
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value)
+	{
+		xpath_variable* target = add(name, xpath_type_string);
+
+		if (!target) return false;
+
+		return target->set(value);
+	}
+
+	// Sets (creating if necessary) the node set variable 'name'; returns false when add() or the assignment fails.
+	PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
+	{
+		xpath_variable* target = add(name, xpath_type_node_set);
+
+		if (!target) return false;
+
+		return target->set(value);
+	}
+
+	// Returns the variable with the given name, or 0 if there is none (forwards to find()).
+	PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name)
+	{
+		return find(name);
+	}
+
+	// Const overload: returns the variable with the given name, or 0 if there is none (forwards to find()).
+	PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const
+	{
+		return find(name);
+	}
+
+	// Compiles 'query' against the optional variable set.
+	// On success _impl owns the compiled query and _result.error is 0. On a parse failure _impl stays 0
+	// and _result is left as filled in by the parser. Failure to allocate the implementation object
+	// throws std::bad_alloc, or sets the error to "Out of memory" with PUGIXML_NO_EXCEPTIONS defined.
+	PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
+	{
+		impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create();
+
+		if (!qimpl)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			_result.error = "Out of memory";
+			return;
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+
+		// destroys qimpl on every exit path unless ownership is released below
+		impl::buffer_holder guard(qimpl, impl::xpath_query_impl::destroy);
+
+		qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result);
+
+		if (!qimpl->root) return;
+
+		_impl = static_cast<impl::xpath_query_impl*>(guard.release());
+		_result.error = 0;
+	}
+
+	// Releases the compiled query; destroy() accepts a null _impl.
+	PUGI__FN xpath_query::~xpath_query()
+	{
+		impl::xpath_query_impl::destroy(_impl);
+	}
+
+	// Returns the value type of the query's root expression, or xpath_type_none when the query has no compiled form.
+	PUGI__FN xpath_value_type xpath_query::return_type() const
+	{
+		impl::xpath_query_impl* compiled = static_cast<impl::xpath_query_impl*>(_impl);
+
+		return compiled ? compiled->root->rettype() : xpath_type_none;
+	}
+
+	// Evaluates the query as a boolean with 'n' as context node.
+	// Returns false when the query has no compiled form or, with PUGIXML_NO_EXCEPTIONS defined,
+	// when evaluation jumps back to sd.error_handler.
+	PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const
+	{
+		if (!_impl) return false;
+		
+		impl::xpath_context c(n, 1, 1);
+		impl::xpath_stack_data sd;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return false;
+	#endif
+		
+		return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+	}
+	
+	// Evaluates the query as a number with 'n' as context node.
+	// Returns impl::gen_nan() when the query has no compiled form or, with PUGIXML_NO_EXCEPTIONS
+	// defined, when evaluation jumps back to sd.error_handler.
+	PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
+	{
+		if (!_impl) return impl::gen_nan();
+		
+		impl::xpath_context c(n, 1, 1);
+		impl::xpath_stack_data sd;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return impl::gen_nan();
+	#endif
+
+		return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+	}
+
+#ifndef PUGIXML_NO_STL
+	// Evaluates the query as a string with 'n' as context node and returns it as a string_t.
+	// A null _impl is handled inside evaluate_string_impl, which then yields an empty string.
+	PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
+	{
+		impl::xpath_stack_data sd;
+
+		return impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd).c_str();
+	}
+#endif
+
+	// Evaluates the query as a string and copies at most capacity - 1 characters plus a terminator into 'buffer'.
+	// Returns the full size of the result, terminator included, regardless of how much was copied;
+	// with capacity == 0 the buffer is not touched.
+	PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
+	{
+		impl::xpath_stack_data sd;
+
+		impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+		const size_t required = r.length() + 1;
+
+		if (capacity == 0) return required;
+
+		// number of characters written, terminator included: the whole result or a truncated prefix
+		size_t written = capacity;
+
+		if (required < capacity) written = required;
+
+		assert(written > 0);
+
+		memcpy(buffer, r.c_str(), (written - 1) * sizeof(char_t));
+		buffer[written - 1] = 0;
+
+		return required;
+	}
+
+	// Evaluates the query as a node set with 'n' as context node.
+	// Returns an empty set when the query has no compiled form. If the root expression is not a node set,
+	// an xpath_exception is thrown (or an empty set is returned with PUGIXML_NO_EXCEPTIONS defined).
+	PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
+	{
+		if (!_impl) return xpath_node_set();
+
+		impl::xpath_ast_node* root = static_cast<impl::xpath_query_impl*>(_impl)->root;
+
+		if (root->rettype() != xpath_type_node_set)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return xpath_node_set();
+		#else
+			xpath_parse_result res;
+			res.error = "Expression does not evaluate to node set";
+
+			throw xpath_exception(res);
+		#endif
+		}
+		
+		impl::xpath_context c(n, 1, 1);
+		impl::xpath_stack_data sd;
+
+	#ifdef PUGIXML_NO_EXCEPTIONS
+		if (setjmp(sd.error_handler)) return xpath_node_set();
+	#endif
+
+		impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack);
+
+		// copy the raw result into a public node set, keeping its ordering type
+		return xpath_node_set(r.begin(), r.end(), r.type());
+	}
+
+	// Returns the parse result recorded when the query was constructed.
+	PUGI__FN const xpath_parse_result& xpath_query::result() const
+	{
+		return _result;
+	}
+
+	// Empty function whose address serves as the non-null value of xpath_query's unspecified_bool_type conversion.
+	PUGI__FN static void unspecified_bool_xpath_query(xpath_query***)
+	{
+	}
+
+	// Boolean conversion: non-null when the query has a compiled form (_impl is set).
+	PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const
+	{
+		return _impl ? unspecified_bool_xpath_query : 0;
+	}
+
+	// True when the query has no compiled form.
+	PUGI__FN bool xpath_query::operator!() const
+	{
+		return !_impl;
+	}
+
+	// Compiles 'query' into a temporary xpath_query and forwards to the xpath_query overload.
+	PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
+	{
+		xpath_query q(query, variables);
+		return select_single_node(q);
+	}
+
+	// Evaluates the query with this node as context and returns the set's first() node,
+	// or a default xpath_node when the resulting set is empty.
+	PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
+	{
+		xpath_node_set matches = query.evaluate_node_set(*this);
+
+		if (matches.empty()) return xpath_node();
+
+		return matches.first();
+	}
+
+	// Compiles 'query' into a temporary xpath_query and forwards to the xpath_query overload.
+	PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
+	{
+		xpath_query q(query, variables);
+		return select_nodes(q);
+	}
+
+	// Evaluates the query with this node as context and returns the resulting node set.
+	PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
+	{
+		return query.evaluate_node_set(*this);
+	}
+}
+
+#endif
+
+#ifdef __BORLANDC__
+#	pragma option pop
+#endif
+
+// Intel C++ does not properly keep warning state for function templates,
+// so popping warning state at the end of translation unit leads to warnings in the middle.
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#	pragma warning(pop)
+#endif
+
+// Undefine all local macros (makes sure we're not leaking macros in header-only mode)
+#undef PUGI__NO_INLINE
+#undef PUGI__STATIC_ASSERT
+#undef PUGI__DMC_VOLATILE
+#undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__NS_BEGIN
+#undef PUGI__NS_END
+#undef PUGI__FN
+#undef PUGI__FN_NO_INLINE
+#undef PUGI__IS_CHARTYPE_IMPL
+#undef PUGI__IS_CHARTYPE
+#undef PUGI__IS_CHARTYPEX
+#undef PUGI__SKIPWS
+#undef PUGI__OPTSET
+#undef PUGI__PUSHNODE
+#undef PUGI__POPNODE
+#undef PUGI__SCANFOR
+#undef PUGI__SCANWHILE
+#undef PUGI__ENDSEG
+#undef PUGI__THROW_ERROR
+#undef PUGI__CHECK_ERROR
+
+#endif
+
+/**
+ * Copyright (c) 2006-2012 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/gadgets/octave/pugixml.hpp b/gadgets/octave/pugixml.hpp
new file mode 100644
index 0000000..77b4dcf
--- /dev/null
+++ b/gadgets/octave/pugixml.hpp
@@ -0,0 +1,1265 @@
+/**
+ * pugixml parser - version 1.2
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
+ */
+
+#ifndef PUGIXML_VERSION
+// Define version macro; evaluates to major * 100 + minor * 10 (so version 1.2 is 120), which makes it safe to use in less-than comparisons
+#	define PUGIXML_VERSION 120
+#endif
+
+// Include user configuration file (this can define various configuration macros)
+#include "pugiconfig.hpp"
+
+#ifndef HEADER_PUGIXML_HPP
+#define HEADER_PUGIXML_HPP
+
+// Include stddef.h for size_t and ptrdiff_t
+#include <stddef.h>
+
+// Include exception header for XPath
+#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
+#	include <exception>
+#endif
+
+// Include STL headers
+#ifndef PUGIXML_NO_STL
+#	include <iterator>
+#	include <iosfwd>
+#	include <string>
+#endif
+
+// Macro for deprecated features
+#ifndef PUGIXML_DEPRECATED
+#	if defined(__GNUC__)
+#		define PUGIXML_DEPRECATED __attribute__((deprecated))
+#	elif defined(_MSC_VER) && _MSC_VER >= 1300
+#		define PUGIXML_DEPRECATED __declspec(deprecated)
+#	else
+#		define PUGIXML_DEPRECATED
+#	endif
+#endif
+
+// If no API is defined, assume default
+#ifndef PUGIXML_API
+#	define PUGIXML_API
+#endif
+
+// If no API for classes is defined, assume default
+#ifndef PUGIXML_CLASS
+#	define PUGIXML_CLASS PUGIXML_API
+#endif
+
+// If no API for functions is defined, assume default
+#ifndef PUGIXML_FUNCTION
+#	define PUGIXML_FUNCTION PUGIXML_API
+#endif
+
+// Character interface macros
+#ifdef PUGIXML_WCHAR_MODE
+#	define PUGIXML_TEXT(t) L ## t
+#	define PUGIXML_CHAR wchar_t
+#else
+#	define PUGIXML_TEXT(t) t
+#	define PUGIXML_CHAR char
+#endif
+
+namespace pugi
+{
+	// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
+	typedef PUGIXML_CHAR char_t;
+
+#ifndef PUGIXML_NO_STL
+	// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
+	typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+#endif
+}
+
+// The PugiXML namespace
+namespace pugi
+{
+	// Tree node types
+	enum xml_node_type
+	{
+		node_null,			// Empty (null) node handle
+		node_document,		// A document tree's absolute root
+		node_element,		// Element tag, i.e. '<node/>'
+		node_pcdata,		// Plain character data, i.e. 'text'
+		node_cdata,			// Character data, i.e. '<![CDATA[text]]>'
+		node_comment,		// Comment tag, i.e. '<!-- text -->'
+		node_pi,			// Processing instruction, i.e. '<?name?>'
+		node_declaration,	// Document declaration, i.e. '<?xml version="1.0"?>'
+		node_doctype		// Document type declaration, i.e. '<!DOCTYPE doc>'
+	};
+
+	// Parsing options
+
+	// Minimal parsing mode (equivalent to turning all other flags off).
+	// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+	const unsigned int parse_minimal = 0x0000;
+
+	// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+	const unsigned int parse_pi = 0x0001;
+
+	// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+	const unsigned int parse_comments = 0x0002;
+
+	// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+	const unsigned int parse_cdata = 0x0004;
+
+	// This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
+	// This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+	const unsigned int parse_ws_pcdata = 0x0008;
+
+	// This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+	const unsigned int parse_escapes = 0x0010;
+
+	// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+	const unsigned int parse_eol = 0x0020;
+	
+	// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+	const unsigned int parse_wconv_attribute = 0x0040;
+
+	// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+	const unsigned int parse_wnorm_attribute = 0x0080;
+	
+	// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+	const unsigned int parse_declaration = 0x0100;
+
+	// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+	const unsigned int parse_doctype = 0x0200;
+
+	// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+	// of whitespace is added to the DOM tree.
+	// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+	const unsigned int parse_ws_pcdata_single = 0x0400;
+
+	// The default parsing mode.
+	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+	const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
+
+	// The full parsing mode.
+	// Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+	const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
+
+	// These flags determine the encoding of input data for XML document (the same type selects the output encoding in xml_node::print and xml_document::save)
+	enum xml_encoding
+	{
+		encoding_auto,		// Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+		encoding_utf8,		// UTF8 encoding
+		encoding_utf16_le,	// Little-endian UTF16
+		encoding_utf16_be,	// Big-endian UTF16
+		encoding_utf16,		// UTF16 with native endianness
+		encoding_utf32_le,	// Little-endian UTF32
+		encoding_utf32_be,	// Big-endian UTF32
+		encoding_utf32,		// UTF32 with native endianness
+		encoding_wchar,		// The same encoding wchar_t has (either UTF16 or UTF32)
+		encoding_latin1		// Latin-1 (ISO-8859-1) encoding
+	};
+
+	// Formatting flags
+	
+	// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
+	const unsigned int format_indent = 0x01;
+	
+	// Write encoding-specific BOM to the output stream. This flag is off by default.
+	const unsigned int format_write_bom = 0x02;
+
+	// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+	const unsigned int format_raw = 0x04;
+	
+	// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+	const unsigned int format_no_declaration = 0x08;
+
+	// Don't escape attribute values and PCDATA contents. This flag is off by default.
+	const unsigned int format_no_escapes = 0x10;
+
+	// Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+	const unsigned int format_save_file_text = 0x20;
+
+	// The default set of formatting flags.
+	// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+	const unsigned int format_default = format_indent;
+		
+	// Forward declarations
+	struct xml_attribute_struct;
+	struct xml_node_struct;
+
+	class xml_node_iterator;
+	class xml_attribute_iterator;
+	class xml_named_node_iterator;
+
+	class xml_tree_walker;
+
+	class xml_node;
+
+	class xml_text;
+	
+	#ifndef PUGIXML_NO_XPATH
+	class xpath_node;
+	class xpath_node_set;
+	class xpath_query;
+	class xpath_variable_set;
+	#endif
+
+	// Range-based for loop support: a begin/end iterator pair, returned by xml_node::children() and xml_node::attributes()
+	template <typename It> class xml_object_range
+	{
+	public:
+		typedef It const_iterator;
+
+		xml_object_range(It b, It e): _begin(b), _end(e)
+		{
+		}
+
+		It begin() const { return _begin; } // Copy of the iterator passed as b
+		It end() const { return _end; } // Copy of the iterator passed as e
+
+	private:
+		It _begin, _end;
+	};
+
+	// Abstract writer interface for node printing (see xml_node::print and xml_document::save); implementations override write()
+	class PUGIXML_CLASS xml_writer
+	{
+	public:
+		virtual ~xml_writer() {}
+
+		// Write memory chunk into stream/file/whatever (NOTE(review): size is presumably a byte count - confirm)
+		virtual void write(const void* data, size_t size) = 0;
+	};
+
+	// xml_writer implementation for FILE* (no destructor is declared, so this class does not close the file; the caller keeps ownership)
+	class PUGIXML_CLASS xml_writer_file: public xml_writer
+	{
+	public:
+		// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
+		xml_writer_file(void* file);
+
+		virtual void write(const void* data, size_t size);
+
+	private:
+		void* file; // NOTE(review): presumably the FILE* given to the constructor - confirm in pugixml.cpp
+	};
+
+	#ifndef PUGIXML_NO_STL
+	// xml_writer implementation for streams (only available when STL support is enabled)
+	class PUGIXML_CLASS xml_writer_stream: public xml_writer
+	{
+	public:
+		// Construct writer from an output stream object (narrow or wide); the stream is taken by reference and is not owned
+		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+		virtual void write(const void* data, size_t size);
+
+	private:
+		std::basic_ostream<char, std::char_traits<char> >* narrow_stream; // NOTE(review): presumably set only by the char constructor - confirm
+		std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream; // NOTE(review): presumably set only by the wchar_t constructor - confirm
+	};
+	#endif
+
+	// A light-weight handle for manipulating attributes in DOM tree
+	class PUGIXML_CLASS xml_attribute
+	{
+		friend class xml_attribute_iterator;
+		friend class xml_node;
+
+	private:
+		xml_attribute_struct* _attr;
+	
+		typedef void (*unspecified_bool_type)(xml_attribute***);
+
+	public:
+		// Default constructor. Constructs an empty attribute.
+		xml_attribute();
+		
+		// Constructs attribute from internal pointer
+		explicit xml_attribute(xml_attribute_struct* attr);
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+
+		// Comparison operators (compares wrapped attribute pointers)
+		bool operator==(const xml_attribute& r) const;
+		bool operator!=(const xml_attribute& r) const;
+		bool operator<(const xml_attribute& r) const;
+		bool operator>(const xml_attribute& r) const;
+		bool operator<=(const xml_attribute& r) const;
+		bool operator>=(const xml_attribute& r) const;
+
+		// Check if attribute is empty
+		bool empty() const;
+
+		// Get attribute name/value, or "" if attribute is empty
+		const char_t* name() const;
+		const char_t* value() const;
+
+		// Get attribute value, or the default value if attribute is empty
+		const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+		// Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+		int as_int(int def = 0) const;
+		unsigned int as_uint(unsigned int def = 0) const;
+		double as_double(double def = 0) const;
+		float as_float(float def = 0) const;
+
+		// Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+		bool as_bool(bool def = false) const;
+
+		// Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+		bool set_name(const char_t* rhs);
+		bool set_value(const char_t* rhs);
+
+		// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+		bool set_value(int rhs);
+		bool set_value(unsigned int rhs);
+		bool set_value(double rhs);
+		bool set_value(bool rhs);
+
+		// Set attribute value (equivalent to set_value without error checking)
+		xml_attribute& operator=(const char_t* rhs);
+		xml_attribute& operator=(int rhs);
+		xml_attribute& operator=(unsigned int rhs);
+		xml_attribute& operator=(double rhs);
+		xml_attribute& operator=(bool rhs);
+
+		// Get next/previous attribute in the attribute list of the parent node
+		xml_attribute next_attribute() const;
+		xml_attribute previous_attribute() const;
+
+		// Get hash value (unique for handles to the same object)
+		size_t hash_value() const;
+
+		// Get internal pointer
+		xml_attribute_struct* internal_object() const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+#endif
+
+	// A light-weight handle for manipulating nodes in DOM tree
+	class PUGIXML_CLASS xml_node
+	{
+		friend class xml_attribute_iterator;
+		friend class xml_node_iterator;
+		friend class xml_named_node_iterator;
+
+	protected:
+		xml_node_struct* _root;
+
+		typedef void (*unspecified_bool_type)(xml_node***);
+
+	public:
+		// Default constructor. Constructs an empty node.
+		xml_node();
+
+		// Constructs node from internal pointer
+		explicit xml_node(xml_node_struct* p);
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+	
+		// Comparison operators (compares wrapped node pointers)
+		bool operator==(const xml_node& r) const;
+		bool operator!=(const xml_node& r) const;
+		bool operator<(const xml_node& r) const;
+		bool operator>(const xml_node& r) const;
+		bool operator<=(const xml_node& r) const;
+		bool operator>=(const xml_node& r) const;
+
+		// Check if node is empty.
+		bool empty() const;
+
+		// Get node type
+		xml_node_type type() const;
+
+		// Get node name/value, or "" if node is empty or it has no name/value
+		const char_t* name() const;
+		const char_t* value() const;
+	
+		// Get attribute list
+		xml_attribute first_attribute() const;
+		xml_attribute last_attribute() const;
+
+		// Get children list
+		xml_node first_child() const;
+		xml_node last_child() const;
+
+		// Get next/previous sibling in the children list of the parent node
+		xml_node next_sibling() const;
+		xml_node previous_sibling() const;
+		
+		// Get parent node
+		xml_node parent() const;
+
+		// Get root of DOM tree this node belongs to
+		xml_node root() const;
+
+		// Get text object for the current node
+		xml_text text() const;
+
+		// Get child, attribute or next/previous sibling with the specified name
+		xml_node child(const char_t* name) const;
+		xml_attribute attribute(const char_t* name) const;
+		xml_node next_sibling(const char_t* name) const;
+		xml_node previous_sibling(const char_t* name) const;
+
+		// Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+		const char_t* child_value() const;
+
+		// Get child value of child with specified name. Equivalent to child(name).child_value().
+		const char_t* child_value(const char_t* name) const;
+
+		// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+		bool set_name(const char_t* rhs);
+		bool set_value(const char_t* rhs);
+		
+		// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+		xml_attribute append_attribute(const char_t* name);
+		xml_attribute prepend_attribute(const char_t* name);
+		xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+		xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+		// Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+		xml_attribute append_copy(const xml_attribute& proto);
+		xml_attribute prepend_copy(const xml_attribute& proto);
+		xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+		xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+		// Add child node with specified type. Returns added node, or empty node on errors.
+		xml_node append_child(xml_node_type type = node_element);
+		xml_node prepend_child(xml_node_type type = node_element);
+		xml_node insert_child_after(xml_node_type type, const xml_node& node);
+		xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+		// Add child element with specified name. Returns added node, or empty node on errors.
+		xml_node append_child(const char_t* name);
+		xml_node prepend_child(const char_t* name);
+		xml_node insert_child_after(const char_t* name, const xml_node& node);
+		xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+		// Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+		xml_node append_copy(const xml_node& proto);
+		xml_node prepend_copy(const xml_node& proto);
+		xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+		xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+		// Remove specified attribute
+		bool remove_attribute(const xml_attribute& a);
+		bool remove_attribute(const char_t* name);
+
+		// Remove specified child
+		bool remove_child(const xml_node& n);
+		bool remove_child(const char_t* name);
+
+		// Find attribute using predicate. Returns first attribute for which predicate returned true.
+		template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
+		{
+			if (!_root) return xml_attribute();
+			
+			for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+				if (pred(attrib))
+					return attrib;
+		
+			return xml_attribute();
+		}
+
+		// Find child node using predicate. Returns first child for which predicate returned true.
+		template <typename Predicate> xml_node find_child(Predicate pred) const
+		{
+			if (!_root) return xml_node();
+	
+			for (xml_node node = first_child(); node; node = node.next_sibling())
+				if (pred(node))
+					return node;
+		
+			return xml_node();
+		}
+
+		// Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
+		template <typename Predicate> xml_node find_node(Predicate pred) const
+		{
+			if (!_root) return xml_node();
+
+			xml_node cur = first_child();
+			
+			while (cur._root && cur._root != _root)
+			{
+				if (pred(cur)) return cur;
+
+				if (cur.first_child()) cur = cur.first_child();
+				else if (cur.next_sibling()) cur = cur.next_sibling();
+				else
+				{
+					while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
+
+					if (cur._root != _root) cur = cur.next_sibling();
+				}
+			}
+
+			return xml_node();
+		}
+
+		// Find child node by attribute name/value
+		xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+		xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+	#ifndef PUGIXML_NO_STL
+		// Get the absolute node path from root as a text string.
+		string_t path(char_t delimiter = '/') const;
+	#endif
+
+		// Search for a node by path consisting of node names and . or .. elements.
+		xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+		// Recursively traverse subtree with xml_tree_walker
+		bool traverse(xml_tree_walker& walker);
+	
+	#ifndef PUGIXML_NO_XPATH
+		// Select single node by evaluating XPath query. Returns first node from the resulting node set.
+		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+		xpath_node select_single_node(const xpath_query& query) const;
+
+		// Select node set by evaluating XPath query
+		xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+		xpath_node_set select_nodes(const xpath_query& query) const;
+	#endif
+		
+		// Print subtree using a writer object
+		void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+	#ifndef PUGIXML_NO_STL
+		// Print subtree to stream
+		void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+		void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+	#endif
+
+		// Child nodes iterators
+		typedef xml_node_iterator iterator;
+
+		iterator begin() const;
+		iterator end() const;
+
+		// Attribute iterators
+		typedef xml_attribute_iterator attribute_iterator;
+
+		attribute_iterator attributes_begin() const;
+		attribute_iterator attributes_end() const;
+
+		// Range-based for support
+		xml_object_range<xml_node_iterator> children() const;
+		xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+		xml_object_range<xml_attribute_iterator> attributes() const;
+
+		// Get node offset in parsed file/string (in char_t units) for debugging purposes
+		ptrdiff_t offset_debug() const;
+
+		// Get hash value (unique for handles to the same object)
+		size_t hash_value() const;
+
+		// Get internal pointer
+		xml_node_struct* internal_object() const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+#endif
+
+	// A helper for working with text inside PCDATA nodes
+	class PUGIXML_CLASS xml_text
+	{
+		friend class xml_node;
+
+		xml_node_struct* _root;
+
+		typedef void (*unspecified_bool_type)(xml_text***);
+
+		explicit xml_text(xml_node_struct* root);
+
+		xml_node_struct* _data_new();
+		xml_node_struct* _data() const;
+
+	public:
+		// Default constructor. Constructs an empty object.
+		xml_text();
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+
+		// Check if text object is empty
+		bool empty() const;
+
+		// Get text, or "" if object is empty
+		const char_t* get() const;
+
+		// Get text, or the default value if object is empty
+		const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+		// Get text as a number, or the default value if conversion did not succeed or object is empty
+		int as_int(int def = 0) const;
+		unsigned int as_uint(unsigned int def = 0) const;
+		double as_double(double def = 0) const;
+		float as_float(float def = 0) const;
+
+		// Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+		bool as_bool(bool def = false) const;
+
+		// Set text (returns false if object is empty or there is not enough memory)
+		bool set(const char_t* rhs);
+
+		// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+		bool set(int rhs);
+		bool set(unsigned int rhs);
+		bool set(double rhs);
+		bool set(bool rhs);
+
+		// Set text (equivalent to set without error checking)
+		xml_text& operator=(const char_t* rhs);
+		xml_text& operator=(int rhs);
+		xml_text& operator=(unsigned int rhs);
+		xml_text& operator=(double rhs);
+		xml_text& operator=(bool rhs);
+
+		// Get the data node (node_pcdata or node_cdata) for this object
+		xml_node data() const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+#endif
+
+	// Child node iterator (a bidirectional iterator over a collection of xml_node)
+	class PUGIXML_CLASS xml_node_iterator
+	{
+		friend class xml_node;
+
+	private:
+		mutable xml_node _wrap;
+		xml_node _parent;
+
+		xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
+
+	public:
+		// Iterator traits
+		typedef ptrdiff_t difference_type;
+		typedef xml_node value_type;
+		typedef xml_node* pointer;
+		typedef xml_node& reference;
+
+	#ifndef PUGIXML_NO_STL
+		typedef std::bidirectional_iterator_tag iterator_category;
+	#endif
+
+		// Default constructor
+		xml_node_iterator();
+
+		// Construct an iterator which points to the specified node
+		xml_node_iterator(const xml_node& node);
+
+		// Iterator operators
+		bool operator==(const xml_node_iterator& rhs) const;
+		bool operator!=(const xml_node_iterator& rhs) const;
+
+		xml_node& operator*() const;
+		xml_node* operator->() const;
+
+		const xml_node_iterator& operator++();
+		xml_node_iterator operator++(int);
+
+		const xml_node_iterator& operator--();
+		xml_node_iterator operator--(int);
+	};
+
+	// Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
+	class PUGIXML_CLASS xml_attribute_iterator
+	{
+		friend class xml_node;
+
+	private:
+		mutable xml_attribute _wrap;
+		xml_node _parent;
+
+		xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
+
+	public:
+		// Iterator traits
+		typedef ptrdiff_t difference_type;
+		typedef xml_attribute value_type;
+		typedef xml_attribute* pointer;
+		typedef xml_attribute& reference;
+
+	#ifndef PUGIXML_NO_STL
+		typedef std::bidirectional_iterator_tag iterator_category;
+	#endif
+
+		// Default constructor
+		xml_attribute_iterator();
+
+		// Construct an iterator which points to the specified attribute
+		xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+		// Iterator operators
+		bool operator==(const xml_attribute_iterator& rhs) const;
+		bool operator!=(const xml_attribute_iterator& rhs) const;
+
+		xml_attribute& operator*() const;
+		xml_attribute* operator->() const;
+
+		const xml_attribute_iterator& operator++();
+		xml_attribute_iterator operator++(int);
+
+		const xml_attribute_iterator& operator--();
+		xml_attribute_iterator operator--(int);
+	};
+
+	// Named node range helper (a forward iterator over xml_node; see xml_node::children(name), which returns a range of these)
+	class PUGIXML_CLASS xml_named_node_iterator // Exported like the other iterator classes, since its members are defined out of line
+	{
+	public:
+		// Iterator traits
+		typedef ptrdiff_t difference_type;
+		typedef xml_node value_type;
+		typedef xml_node* pointer;
+		typedef xml_node& reference;
+
+	#ifndef PUGIXML_NO_STL
+		typedef std::forward_iterator_tag iterator_category;
+	#endif
+
+		// Default constructor
+		xml_named_node_iterator();
+
+		// Construct an iterator which points to the specified node
+		xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+		// Iterator operators
+		bool operator==(const xml_named_node_iterator& rhs) const;
+		bool operator!=(const xml_named_node_iterator& rhs) const;
+
+		xml_node& operator*() const;
+		xml_node* operator->() const;
+
+		const xml_named_node_iterator& operator++();
+		xml_named_node_iterator operator++(int);
+
+	private:
+		mutable xml_node _node; // NOTE(review): mutable presumably so the const operator*/operator-> can return non-const access - confirm
+		const char_t* _name; // NOTE(review): raw pointer, not a copy; presumably the name string must outlive the iterator - confirm
+	};
+
+	// Abstract tree walker class (see xml_node::traverse); derive and implement for_each(), optionally overriding begin()/end()
+	class PUGIXML_CLASS xml_tree_walker
+	{
+		friend class xml_node;
+
+	private:
+		int _depth; // NOTE(review): presumably maintained by xml_node::traverse (hence the friend declaration) - confirm
+	
+	protected:
+		// Get current traversal depth
+		int depth() const;
+	
+	public:
+		xml_tree_walker();
+		virtual ~xml_tree_walker();
+
+		// Callback that is called when traversal begins (NOTE(review): bool result presumably tells traverse() whether to continue - confirm)
+		virtual bool begin(xml_node& node);
+
+		// Callback that is called for each node traversed; pure virtual, so every walker must implement it
+		virtual bool for_each(xml_node& node) = 0;
+
+		// Callback that is called when traversal ends
+		virtual bool end(xml_node& node);
+	};
+
+	// Parsing status, returned as part of xml_parse_result object
+	enum xml_parse_status
+	{
+		status_ok = 0,				// No error
+
+		status_file_not_found,		// File was not found during load_file()
+		status_io_error,			// Error reading from file/stream
+		status_out_of_memory,		// Could not allocate memory
+		status_internal_error,		// Internal error occurred
+
+		status_unrecognized_tag,	// Parser could not determine tag type
+
+		status_bad_pi,				// Parsing error occurred while parsing document declaration/processing instruction
+		status_bad_comment,			// Parsing error occurred while parsing comment
+		status_bad_cdata,			// Parsing error occurred while parsing CDATA section
+		status_bad_doctype,			// Parsing error occurred while parsing document type declaration
+		status_bad_pcdata,			// Parsing error occurred while parsing PCDATA section
+		status_bad_start_element,	// Parsing error occurred while parsing start element tag
+		status_bad_attribute,		// Parsing error occurred while parsing element attribute
+		status_bad_end_element,		// Parsing error occurred while parsing end element tag
+		status_end_element_mismatch // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+	};
+
+	// Parsing result (returned by the xml_document load functions)
+	struct PUGIXML_CLASS xml_parse_result
+	{
+		// Parsing status (see xml_parse_status)
+		xml_parse_status status;
+
+		// Last parsed offset (in char_t units from start of input data)
+		ptrdiff_t offset;
+
+		// Source document encoding
+		xml_encoding encoding;
+
+		// Default constructor, initializes object to failed state
+		xml_parse_result();
+
+		// Cast to bool operator (NOTE(review): presumably true only when status is status_ok - confirm)
+		operator bool() const;
+
+		// Get error description
+		const char* description() const;
+	};
+
+	// Document class (DOM tree root); derives from xml_node, so node operations are available directly on the document
+	class PUGIXML_CLASS xml_document: public xml_node
+	{
+	private:
+		char_t* _buffer; // NOTE(review): presumably the parse buffer owned by the document - confirm in pugixml.cpp
+
+		char _memory[192]; // NOTE(review): presumably in-object storage used by create(); the size 192 is not explained here - confirm
+		
+		// Non-copyable semantics (copy constructor and assignment are private; use reset(const xml_document&) to copy contents)
+		xml_document(const xml_document&);
+		const xml_document& operator=(const xml_document&);
+
+		void create();
+		void destroy();
+
+		xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own);
+
+	public:
+		// Default constructor, makes empty document
+		xml_document();
+
+		// Destructor, invalidates all node/attribute handles to this document
+		~xml_document();
+
+		// Removes all nodes, leaving the empty document
+		void reset();
+
+		// Removes all nodes, then copies the entire contents of the specified document
+		void reset(const xml_document& proto);
+
+	#ifndef PUGIXML_NO_STL
+		// Load document from stream.
+		xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+		xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+	#endif
+
+		// Load document from zero-terminated string. No encoding conversions are applied.
+		xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+		// Load document from file
+		xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+		xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+		xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+		// You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+		xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+		// You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+		xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+		// Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+		void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+	#ifndef PUGIXML_NO_STL
+		// Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+		void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+		void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+	#endif
+
+		// Save XML to file
+		bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+		bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+		// Get document element
+		xml_node document_element() const;
+	};
+
+#ifndef PUGIXML_NO_XPATH
+	// Type of value an XPath query evaluates to (xpath_query::return_type) or an XPath variable holds (xpath_variable::type)
+	enum xpath_value_type
+	{
+		xpath_type_none,	  // Unknown type (query failed to compile)
+		xpath_type_node_set,  // Node set (xpath_node_set)
+		xpath_type_number,	  // Number
+		xpath_type_string,	  // String
+		xpath_type_boolean	  // Boolean
+	};
+
+	// XPath parsing result; kept by xpath_query (see xpath_query::result) and carried by xpath_exception
+	struct PUGIXML_CLASS xpath_parse_result
+	{
+		// Error message (0 if no error)
+		const char* error;
+
+		// Last parsed offset (in char_t units from string start)
+		ptrdiff_t offset;
+
+		// Default constructor, initializes object to failed state
+		xpath_parse_result();
+
+		// Cast to bool operator; NOTE(review): presumably true when error is 0 - confirm in pugixml.cpp
+		operator bool() const;
+
+		// Get error description
+		const char* description() const;
+	};
+
+	// A single XPath variable; obtained through xpath_variable_set (constructors are protected, the set is a friend)
+	class PUGIXML_CLASS xpath_variable
+	{
+		friend class xpath_variable_set;
+
+	protected:
+		xpath_value_type _type;
+		xpath_variable* _next;
+
+		xpath_variable();
+
+		// Non-copyable semantics (copy operations are protected, not part of the public interface)
+		xpath_variable(const xpath_variable&);
+		xpath_variable& operator=(const xpath_variable&);
+		
+	public:
+		// Get variable name
+		const char_t* name() const;
+
+		// Get variable type
+		xpath_value_type type() const;
+
+		// Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+		bool get_boolean() const;
+		double get_number() const;
+		const char_t* get_string() const;
+		const xpath_node_set& get_node_set() const;
+
+		// Set variable value; no type conversion is performed, false is returned on type mismatch error
+		bool set(bool value);
+		bool set(double value);
+		bool set(const char_t* value);
+		bool set(const xpath_node_set& value);
+	};
+
+	// A set of XPath variables; can be passed to the xpath_query constructor
+	class PUGIXML_CLASS xpath_variable_set
+	{
+	private:
+		xpath_variable* _data[64]; // NOTE(review): presumably hash buckets chained via xpath_variable::_next - confirm in pugixml.cpp
+
+		// Non-copyable semantics (copy operations are private)
+		xpath_variable_set(const xpath_variable_set&);
+		xpath_variable_set& operator=(const xpath_variable_set&);
+
+		xpath_variable* find(const char_t* name) const;
+
+	public:
+		// Default constructor/destructor
+		xpath_variable_set();
+		~xpath_variable_set();
+
+		// Add a new variable or get the existing one, if the types match
+		xpath_variable* add(const char_t* name, xpath_value_type type);
+
+		// Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+		bool set(const char_t* name, bool value);
+		bool set(const char_t* name, double value);
+		bool set(const char_t* name, const char_t* value);
+		bool set(const char_t* name, const xpath_node_set& value);
+
+		// Get existing variable by name
+		xpath_variable* get(const char_t* name);
+		const xpath_variable* get(const char_t* name) const;
+	};
+
+	// A compiled XPath query object
+	class PUGIXML_CLASS xpath_query
+	{
+	private:
+		void* _impl;
+		xpath_parse_result _result;
+
+		typedef void (*unspecified_bool_type)(xpath_query***);
+
+		// Non-copyable semantics (copy operations are private)
+		xpath_query(const xpath_query&);
+		xpath_query& operator=(const xpath_query&);
+
+	public:
+		// Construct a compiled object from XPath expression.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+		explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+		// Destructor
+		~xpath_query();
+
+		// Get query expression return type
+		xpath_value_type return_type() const;
+		
+		// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		bool evaluate_boolean(const xpath_node& n) const;
+		
+		// Evaluate expression as double value in the specified context; performs type conversion if necessary.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		double evaluate_number(const xpath_node& n) const;
+		
+	#ifndef PUGIXML_NO_STL
+		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		string_t evaluate_string(const xpath_node& n) const;
+	#endif
+		
+		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
+		// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+		// If PUGIXML_NO_EXCEPTIONS is defined, yields an empty string instead.
+		size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+		// Evaluate expression as node set in the specified context.
+		// If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+		// If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+		xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+		// Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+		const xpath_parse_result& result() const;
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+
+		// Borland C++ workaround
+		bool operator!() const;
+	};
+	
+	#ifndef PUGIXML_NO_EXCEPTIONS
+	// XPath exception class (declared only when PUGIXML_NO_EXCEPTIONS is not defined); wraps an xpath_parse_result
+	class PUGIXML_CLASS xpath_exception: public std::exception
+	{
+	private:
+		xpath_parse_result _result;
+
+	public:
+		// Construct exception from parse result
+		explicit xpath_exception(const xpath_parse_result& result);
+
+		// Get error message
+		virtual const char* what() const throw();
+
+		// Get parse result
+		const xpath_parse_result& result() const;
+	};
+	#endif
+	
+	// XPath node class (either xml_node or xml_attribute); this is the element type of xpath_node_set
+	class PUGIXML_CLASS xpath_node
+	{
+	private:
+		xml_node _node;
+		xml_attribute _attribute;
+	
+		typedef void (*unspecified_bool_type)(xpath_node***);
+
+	public:
+		// Default constructor; constructs empty XPath node
+		xpath_node();
+		
+		// Construct XPath node from XML node, or from an attribute together with its parent node
+		xpath_node(const xml_node& node);
+		xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+		// Get node/attribute, if any
+		xml_node node() const;
+		xml_attribute attribute() const;
+		
+		// Get parent of contained node/attribute
+		xml_node parent() const;
+
+		// Safe bool conversion operator
+		operator unspecified_bool_type() const;
+		
+		// Borland C++ workaround
+		bool operator!() const;
+
+		// Comparison operators
+		bool operator==(const xpath_node& n) const;
+		bool operator!=(const xpath_node& n) const;
+	};
+
+#ifdef __BORLANDC__
+	// Borland C++ workaround
+	bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+	bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+#endif
+
+	// A fixed-size collection of XPath nodes (read-only element access; copyable)
+	class PUGIXML_CLASS xpath_node_set
+	{
+	public:
+		// Collection type
+		enum type_t
+		{
+			type_unsorted,			// Not ordered
+			type_sorted,			// Sorted by document order (ascending)
+			type_sorted_reverse		// Sorted by document order (descending)
+		};
+		
+		// Constant iterator type
+		typedef const xpath_node* const_iterator;
+	
+		// Default constructor. Constructs empty set.
+		xpath_node_set();
+
+		// Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+		xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+		// Destructor
+		~xpath_node_set();
+		
+		// Copy constructor/assignment operator
+		xpath_node_set(const xpath_node_set& ns);
+		xpath_node_set& operator=(const xpath_node_set& ns);
+
+		// Get collection type
+		type_t type() const;
+		
+		// Get collection size
+		size_t size() const;
+
+		// Indexing operator
+		const xpath_node& operator[](size_t index) const;
+		
+		// Collection iterators
+		const_iterator begin() const;
+		const_iterator end() const;
+
+		// Sort the collection in ascending/descending order by document order
+		void sort(bool reverse = false);
+		
+		// Get first node in the collection by document order
+		xpath_node first() const;
+		
+		// Check if collection is empty
+		bool empty() const;
+	
+	private:
+		type_t _type;
+		
+		xpath_node _storage; // NOTE(review): presumably inline storage used for a single-node set - confirm in pugixml.cpp
+		
+		xpath_node* _begin;
+		xpath_node* _end;
+
+		void _assign(const_iterator begin, const_iterator end);
+	};
+#endif
+
+#ifndef PUGIXML_NO_STL
+	// Convert wide string to UTF8
+	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+	
+	// Convert UTF8 to wide string
+	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+#endif
+
+	// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
+	typedef void* (*allocation_function)(size_t size);
+	
+	// Memory deallocation function interface
+	typedef void (*deallocation_function)(void* ptr);
+
+	// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+	void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+	
+	// Get current memory management functions
+	allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+	deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+	std::forward_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+	// Workarounds for (non-standard) iterator category detection
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+	std::forward_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#endif
+
+/**
+ * Copyright (c) 2006-2012 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/gadgets/python/CMakeLists.txt b/gadgets/python/CMakeLists.txt
new file mode 100644
index 0000000..0273d22
--- /dev/null
+++ b/gadgets/python/CMakeLists.txt
@@ -0,0 +1,61 @@
+if(WIN32)
+  add_definitions(-D__BUILD_GADGETRON_PYTHON__)
+endif()
+
+# TODO: the Python gadget sources still call NumPy C-API that
+# predates NumPy 1.7, so building against NumPy >= 1.7 produces
+# compiler warnings. The definition below is kept disabled.
+#add_definitions(-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${PYTHON_INCLUDE_PATH}
+  ${NUMPY_INCLUDE_DIRS}
+)
+
+add_library(GadgetronPythonMRI MODULE GadgetronPythonMRI.cpp GadgetReference.cpp)
+
+add_library(gadgetron_python SHARED
+  PythonGadget.cpp
+  GadgetReference.cpp
+  GadgetronPythonMRI.cpp
+  PythonCommunicator.cpp)
+
+target_link_libraries(gadgetron_python
+  ${ISMRMRD_LIBRARIES}
+  optimized ${ACE_LIBRARIES}
+  debug ${ACE_DEBUG_LIBRARY}
+  ${PYTHON_LIBRARIES}
+  ${Boost_LIBRARIES}
+  ${MKL_LIBRARIES})
+
+target_link_libraries(GadgetronPythonMRI
+  optimized ${ACE_LIBRARIES}
+  debug ${ACE_DEBUG_LIBRARY}
+  ${PYTHON_LIBRARIES}
+  ${Boost_LIBRARIES}
+  ${MKL_LIBRARIES})
+
+if(WIN32)
+  set_target_properties(GadgetronPythonMRI PROPERTIES SUFFIX .pyd)
+  set_target_properties(gadgetron_python PROPERTIES LINK_FLAGS "/LIBPATH:${PYTHON_INCLUDE_DIR}/../libs")
+endif()
+
+set_target_properties(GadgetronPythonMRI PROPERTIES PREFIX "")
+
+install(TARGETS gadgetron_python GadgetronPythonMRI DESTINATION lib)
+
+install(FILES
+  rms_coil_combine.py
+  kspaceandimage.py
+  remove_2x_oversampling.py
+  accumulate_and_recon.py
+  GadgetronXML.py
+  image_viewer.py
+  DESTINATION lib)
+
+install(FILES
+  python.xml python_short.xml
+  DESTINATION config)
diff --git a/gadgets/python/GadgetReference.cpp b/gadgets/python/GadgetReference.cpp
new file mode 100644
index 0000000..217d152
--- /dev/null
+++ b/gadgets/python/GadgetReference.cpp
@@ -0,0 +1,96 @@
+#include "Gadgetron.h"
+#include "Gadget.h"
+#include "GadgetReference.h"
+#include "GadgetContainerMessage.h"
+#include "hoNDArray.h"
+#include "ismrmrd.h"
+#include <boost/preprocessor/stringize.hpp>
+#include <boost/python.hpp>
+#include <numpy/numpyconfig.h>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/ndarrayobject.h>
+
+#include <complex>
+
+namespace Gadgetron{
+
+  // Starts without an output gadget; gadget_ stays null until set_gadget() is called.
+  GadgetReference::GadgetReference() : gadget_(0)
+  {
+    //_import_array();
+  }
+
+  // Nothing to clean up: gadget_ is not deleted here.
+  GadgetReference::~GadgetReference()
+  {
+  }
+
+  // Wrap a header and the complex64 NumPy array returned from Python in a new message pair
+  // (header message with the array message chained behind it) and enqueue it on the next
+  // gadget. Returns GADGET_OK on success (or when no gadget is set), GADGET_FAIL otherwise.
+  template<class T>
+  int GadgetReference::return_data(T header, boost::python::object arr)
+  {
+    // New reference (or NULL if the copy failed); dropped again once the data is copied out.
+    PyArrayObject* arrPtr = PyArray_GETCONTIGUOUS((PyArrayObject*)arr.ptr());
+    if (!arrPtr) {
+      GADGET_DEBUG1("Unable to obtain a contiguous array from Python\n");
+      return GADGET_FAIL;
+    }
+    // The payload is memcpy'ed as std::complex<float>; any other dtype would be misread.
+    if (PyArray_TYPE(arrPtr) != NPY_COMPLEX64) {
+      GADGET_DEBUG1("Data returned from Python is not complex64\n");
+      Py_DECREF(arrPtr);
+      return GADGET_FAIL;
+    }
+    int ndims = PyArray_NDIM(arrPtr);
+    npy_intp* dims = PyArray_DIMS(arrPtr);
+    // The NumPy dimensions are stored in reverse order in the hoNDArray dimension vector.
+    std::vector<size_t> dimensions(ndims);
+    for (int i = 0; i < ndims; i++) dimensions[ndims-i-1] = static_cast<size_t>(dims[i]);
+
+    GadgetContainerMessage< T >*         m1 = new GadgetContainerMessage< T >;
+    memcpy(m1->getObjectPtr(), &header, sizeof(T));
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2 = new GadgetContainerMessage< hoNDArray< std::complex<float> > >;
+    m1->cont(m2);
+
+    try{m2->getObjectPtr()->create(&dimensions);}
+    catch (std::runtime_error &err){
+      GADGET_DEBUG_EXCEPTION(err,"Failed to create data storage for data returning from Python");
+      m1->release(); // m2 is chained to m1 through cont() above
+      Py_DECREF(arrPtr);
+      return GADGET_FAIL;
+    }
+
+    memcpy(m2->getObjectPtr()->get_data_ptr(), PyArray_DATA(arrPtr), m2->getObjectPtr()->get_number_of_elements()*sizeof(std::complex<float>));
+    Py_DECREF(arrPtr);
+
+    if (!gadget_) {
+      GADGET_DEBUG1("Data received from python, but no Gadget registered for output\n");
+      m1->release();
+      return GADGET_OK;
+    }
+
+    // The deadline passed to putq() is the current time, i.e. do not wait for queue space.
+    ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+    if (gadget_->next()->putq(m1,&nowait) == -1) {
+      m1->release();
+      return GADGET_FAIL;
+    }
+    return GADGET_OK;
+  }
+
+  // Non-template entry point for acquisitions: T is deduced from acq and the call is
+  // forwarded unchanged to return_data().
+  int GadgetReference::return_acquisition(ISMRMRD::AcquisitionHeader acq, boost::python::object arr)
+  { return this->return_data(acq, arr); }
+
+  // Non-template entry point for images: T is deduced from img and the call is
+  // forwarded unchanged to return_data().
+  int GadgetReference::return_image(ISMRMRD::ImageHeader img, boost::python::object arr)
+  { return this->return_data(img, arr); }
+
+  template int GadgetReference::return_data<ISMRMRD::AcquisitionHeader>(ISMRMRD::AcquisitionHeader, boost::python::object);
+  template int GadgetReference::return_data<ISMRMRD::ImageHeader>(ISMRMRD::ImageHeader, boost::python::object);
+}
diff --git a/gadgets/python/GadgetReference.h b/gadgets/python/GadgetReference.h
new file mode 100644
index 0000000..0584b63
--- /dev/null
+++ b/gadgets/python/GadgetReference.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "gadgetronpython_export.h"
+
+#include <ismrmrd.h>
+#include <boost/python.hpp>
+#include <boost/python/tuple.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETSPYTHON GadgetReference
+  {
+    // Exposed to Python (see GadgetronPythonMRI.cpp) so Python code can hand data back to a Gadget.
+  public:
+    GadgetReference();
+    ~GadgetReference();
+    // Remember the gadget whose next() receives returned data; always returns 0.
+    int set_gadget(Gadget* g)
+    {
+      gadget_ = g;
+      return 0;
+    }
+    // Return a header plus its data array; the definitions live in GadgetReference.cpp.
+    template<class T> int return_data(T header, boost::python::object arr);
+    int return_acquisition(ISMRMRD::AcquisitionHeader acq, boost::python::object arr);
+    int return_image(ISMRMRD::ImageHeader img, boost::python::object arr);
+    // gadget_ is not deleted by this class: the destructor in GadgetReference.cpp is empty.
+  protected:
+    Gadget* gadget_;  
+  };
+}
diff --git a/gadgets/python/GadgetronPythonMRI.cpp b/gadgets/python/GadgetronPythonMRI.cpp
new file mode 100644
index 0000000..40595df
--- /dev/null
+++ b/gadgets/python/GadgetronPythonMRI.cpp
@@ -0,0 +1,442 @@
+#include "GadgetReference.h"
+#include <boost/python.hpp>
+//#include <numpy/arrayobject.h>
+
+#include "../mri_core/GadgetMRIHeaders.h"
+
+#include <ismrmrd.h>
+
+using namespace boost::python;
+
+// Store v in physiology_time_stamp[i]; only indices 0..2 are accepted, others are ignored.
+void acq_set_physiology_time_stamp(ISMRMRD::AcquisitionHeader &h, unsigned short i, uint32_t v)
+{
+	if (i >= 3) return;
+	h.physiology_time_stamp[i] = v;
+}
+
+// Read physiology_time_stamp[i]; only indices 0..2 are accepted.
+uint32_t acq_get_physiology_time_stamp(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.physiology_time_stamp[i] : 0;
+}
+
+// Store v in channel_mask[i]; only indices 0..15 are accepted, others are ignored.
+void acq_set_channel_mask(ISMRMRD::AcquisitionHeader &h, unsigned short i, uint64_t v)
+{
+	if (i >= 16) return;
+	h.channel_mask[i] = v;
+}
+
+// Read channel_mask[i]; only indices 0..15 are accepted.
+uint64_t acq_get_channel_mask(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 16;
+	return in_range ? h.channel_mask[i] : 0;
+}
+
+// Store v in position[i]; only indices 0..2 are accepted, others are ignored.
+void acq_set_position(ISMRMRD::AcquisitionHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.position[i] = v;
+}
+
+// Read position[i]; only indices 0..2 are accepted.
+float acq_get_position(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.position[i] : 0.0f;
+}
+
+// Store v in read_dir[i]; only indices 0..2 are accepted, others are ignored.
+void acq_set_read_dir(ISMRMRD::AcquisitionHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.read_dir[i] = v;
+}
+
+// Read read_dir[i]; only indices 0..2 are accepted.
+float acq_get_read_dir(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.read_dir[i] : 0.0f;
+}
+
+// Store v in phase_dir[i]; only indices 0..2 are accepted, others are ignored.
+void acq_set_phase_dir(ISMRMRD::AcquisitionHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.phase_dir[i] = v;
+}
+
+// Read phase_dir[i]; only indices 0..2 are accepted.
+float acq_get_phase_dir(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.phase_dir[i] : 0.0f;
+}
+
+// Store v in slice_dir[i]; only indices 0..2 are accepted, others are ignored.
+void acq_set_slice_dir(ISMRMRD::AcquisitionHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.slice_dir[i] = v;
+}
+
+// Read slice_dir[i]; only indices 0..2 are accepted.
+float acq_get_slice_dir(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.slice_dir[i] : 0.0f;
+}
+
+// Store v in patient_table_position[i]; only indices 0..2 are accepted, others are ignored.
+void acq_set_patient_table_position(ISMRMRD::AcquisitionHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.patient_table_position[i] = v;
+}
+
+// Read patient_table_position[i]; only indices 0..2 are accepted.
+float acq_get_patient_table_position(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.patient_table_position[i] : 0.0f;
+}
+
+// Store v in user_int[i]; only indices 0..7 are accepted, others are ignored.
+void acq_set_user_int(ISMRMRD::AcquisitionHeader &h, unsigned short i, int32_t v)
+{
+	if (i >= 8) return;
+	h.user_int[i] = v;
+}
+
+// Read user_int[i]; only indices 0..7 are accepted.
+int32_t acq_get_user_int(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 8;
+	return in_range ? h.user_int[i] : 0;
+}
+
+// Store v in user_float[i]; only indices 0..7 are accepted, others are ignored.
+void acq_set_user_float(ISMRMRD::AcquisitionHeader &h, unsigned short i, float v)
+{
+	if (i >= 8) return;
+	h.user_float[i] = v;
+}
+
+// Read user_float[i]; only indices 0..7 are accepted.
+float acq_get_user_float(ISMRMRD::AcquisitionHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 8;
+	return in_range ? h.user_float[i] : 0.0f;
+}
+
+// Store v in the counters' user[i]; only indices 0..7 are accepted, others are ignored.
+void acq_set_encoding_user(ISMRMRD::EncodingCounters&e, unsigned short i, uint16_t v)
+{
+	if (i >= 8) return;
+	e.user[i] = v;
+}
+
+// Read the counters' user[i]; only indices 0..7 are accepted.
+uint16_t acq_get_encoding_user(ISMRMRD::EncodingCounters&e, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 8;
+	return in_range ? e.user[i] : 0;
+}
+
+// Store v in matrix_size[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_matrix_size(ISMRMRD::ImageHeader &h, unsigned short i, uint16_t v)
+{
+	if (i >= 3) return;
+	h.matrix_size[i] = v;
+}
+
+// Read matrix_size[i]; only indices 0..2 are accepted.
+uint16_t img_get_matrix_size(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.matrix_size[i] : 0;
+}
+
+// Store v in physiology_time_stamp[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_physiology_time_stamp(ISMRMRD::ImageHeader &h, unsigned short i, uint32_t v)
+{
+	if (i >= 3) return;
+	h.physiology_time_stamp[i] = v;
+}
+
+// Read physiology_time_stamp[i]; only indices 0..2 are accepted.
+uint32_t img_get_physiology_time_stamp(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.physiology_time_stamp[i] : 0;
+}
+
+// Store v in position[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_position(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.position[i] = v;
+}
+
+// Read position[i]; only indices 0..2 are accepted.
+float img_get_position(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.position[i] : 0.0f;
+}
+
+// Store v in read_dir[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_read_dir(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.read_dir[i] = v;
+}
+
+// Read read_dir[i]; only indices 0..2 are accepted.
+float img_get_read_dir(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.read_dir[i] : 0.0f;
+}
+
+// Store v in phase_dir[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_phase_dir(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.phase_dir[i] = v;
+}
+
+// Read phase_dir[i]; only indices 0..2 are accepted.
+float img_get_phase_dir(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.phase_dir[i] : 0.0f;
+}
+
+// Store v in slice_dir[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_slice_dir(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.slice_dir[i] = v;
+}
+
+// Read slice_dir[i]; only indices 0..2 are accepted.
+float img_get_slice_dir(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.slice_dir[i] : 0.0f;
+}
+
+// Store v in patient_table_position[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_patient_table_position(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.patient_table_position[i] = v;
+}
+
+// Read patient_table_position[i]; only indices 0..2 are accepted.
+float img_get_patient_table_position(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.patient_table_position[i] : 0.0f;
+}
+
+
+// Store v in user_int[i]; only indices 0..7 are accepted, others are ignored.
+void img_set_user_int(ISMRMRD::ImageHeader &h, unsigned short i, int32_t v)
+{
+	if (i >= 8) return;
+	h.user_int[i] = v;
+}
+
+// Read user_int[i]; only indices 0..7 are accepted.
+int32_t img_get_user_int(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0 without touching the array.
+	const bool in_range = i < 8;
+	return in_range ? h.user_int[i] : 0;
+}
+
+// Store v in user_float[i]; only indices 0..7 are accepted, others are ignored.
+void img_set_user_float(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 8) return;
+	h.user_float[i] = v;
+}
+
+// Read user_float[i]; only indices 0..7 are accepted.
+float img_get_user_float(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 8;
+	return in_range ? h.user_float[i] : 0.0f;
+}
+
+// Store v in field_of_view[i]; only indices 0..2 are accepted, others are ignored.
+void img_set_field_of_view(ISMRMRD::ImageHeader &h, unsigned short i, float v)
+{
+	if (i >= 3) return;
+	h.field_of_view[i] = v;
+}
+
+// Read field_of_view[i]; only indices 0..2 are accepted.
+float img_get_field_of_view(ISMRMRD::ImageHeader &h, unsigned short i)
+{
+	// Any other index yields 0.0f without touching the array.
+	const bool in_range = i < 3;
+	return in_range ? h.field_of_view[i] : 0.0f;
+}
+
+BOOST_PYTHON_MODULE(GadgetronPythonMRI)
+{
+	// Python bindings for the ISMRMRD header structs and GadgetReference. Array-valued header
+	// members are reached through the indexed acq_*/img_* helper functions registered below.
+	//import_array();
+	boost::python::numeric::array::set_module_and_type("numpy", "ndarray");
+
+	class_<ISMRMRD::EncodingCounters>("EncodingCounters")
+		.def_readwrite("kspace_encode_step_1",   &ISMRMRD::EncodingCounters::kspace_encode_step_1)
+		.def_readwrite("kspace_encode_step_2",   &ISMRMRD::EncodingCounters::kspace_encode_step_2)
+		.def_readwrite("average",                &ISMRMRD::EncodingCounters::average)
+		.def_readwrite("slice",                  &ISMRMRD::EncodingCounters::slice)
+		.def_readwrite("contrast",               &ISMRMRD::EncodingCounters::contrast)
+		.def_readwrite("phase",                  &ISMRMRD::EncodingCounters::phase)
+		.def_readwrite("repetition",             &ISMRMRD::EncodingCounters::repetition)
+		.def_readwrite("set",                    &ISMRMRD::EncodingCounters::set)
+		.def_readwrite("segment",                &ISMRMRD::EncodingCounters::segment)
+		;
+
+	def("acq_set_physiology_time_stamp", acq_set_physiology_time_stamp);
+	def("acq_get_physiology_time_stamp", acq_get_physiology_time_stamp);
+	def("acq_set_channel_mask", acq_set_channel_mask);
+	def("acq_get_channel_mask", acq_get_channel_mask);
+	def("acq_set_position",acq_set_position);
+	def("acq_get_position",acq_get_position);
+	def("acq_set_read_dir",acq_set_read_dir);
+	def("acq_get_read_dir",acq_get_read_dir);
+	def("acq_set_phase_dir",acq_set_phase_dir);
+	def("acq_get_phase_dir",acq_get_phase_dir);
+	def("acq_set_slice_dir",acq_set_slice_dir);
+	def("acq_get_slice_dir",acq_get_slice_dir);
+	def("acq_set_patient_table_position", acq_set_patient_table_position);
+	def("acq_get_patient_table_position", acq_get_patient_table_position);
+	def("acq_set_user_int", acq_set_user_int);
+	def("acq_get_user_int", acq_get_user_int);
+	def("acq_set_user_float", acq_set_user_float);
+	def("acq_get_user_float", acq_get_user_float);
+	def("acq_set_encoding_user", acq_set_encoding_user);
+	def("acq_get_encoding_user", acq_get_encoding_user);
+
+	def("img_set_physiology_time_stamp", img_set_physiology_time_stamp);
+	def("img_get_physiology_time_stamp", img_get_physiology_time_stamp);
+	def("img_set_position",img_set_position);
+	def("img_get_position",img_get_position);
+	def("img_set_read_dir",img_set_read_dir);
+	def("img_get_read_dir",img_get_read_dir);
+	def("img_set_phase_dir",img_set_phase_dir);
+	def("img_get_phase_dir",img_get_phase_dir);
+	def("img_set_slice_dir",img_set_slice_dir);
+	def("img_get_slice_dir",img_get_slice_dir);
+	def("img_set_patient_table_position", img_set_patient_table_position);
+	def("img_get_patient_table_position", img_get_patient_table_position);
+	def("img_set_user_int", img_set_user_int);
+	def("img_get_user_int", img_get_user_int);
+	def("img_set_user_float", img_set_user_float);
+	def("img_get_user_float", img_get_user_float);
+	def("img_get_field_of_view", img_get_field_of_view);
+	def("img_set_field_of_view", img_set_field_of_view);
+	def("img_get_matrix_size", img_get_matrix_size);
+	def("img_set_matrix_size", img_set_matrix_size);
+
+	class_<ISMRMRD::AcquisitionHeader>("AcquisitionHeader")
+		.def_readwrite("version",                &ISMRMRD::AcquisitionHeader::version)
+		.def_readwrite("flags",                  &ISMRMRD::AcquisitionHeader::flags)
+		.def_readwrite("measurement_uid",        &ISMRMRD::AcquisitionHeader::measurement_uid)
+		.def_readwrite("scan_counter",           &ISMRMRD::AcquisitionHeader::scan_counter)
+		.def_readwrite("acquisition_time_stamp", &ISMRMRD::AcquisitionHeader::acquisition_time_stamp)
+		.def_readwrite("number_of_samples",      &ISMRMRD::AcquisitionHeader::number_of_samples)
+		.def_readwrite("available_channels",     &ISMRMRD::AcquisitionHeader::available_channels)
+		.def_readwrite("active_channels",        &ISMRMRD::AcquisitionHeader::active_channels)
+		.def_readwrite("discard_pre",            &ISMRMRD::AcquisitionHeader::discard_pre)
+		.def_readwrite("discard_post",           &ISMRMRD::AcquisitionHeader::discard_post)
+		// The Python attribute is spelled "centre_sample"; it maps to the C++ member center_sample.
+		.def_readwrite("centre_sample",          &ISMRMRD::AcquisitionHeader::center_sample)
+		.def_readwrite("encoding_space_ref",     &ISMRMRD::AcquisitionHeader::encoding_space_ref)
+		.def_readwrite("trajectory_dimensions",  &ISMRMRD::AcquisitionHeader::trajectory_dimensions)
+		.def_readwrite("sample_time_us",         &ISMRMRD::AcquisitionHeader::sample_time_us)
+		.def_readwrite("idx",                    &ISMRMRD::AcquisitionHeader::idx)
+		;
+
+	class_<ISMRMRD::ImageHeader>("ImageHeader")
+		.def_readwrite("flags", &ISMRMRD::ImageHeader::flags)
+		.def_readwrite("channels", &ISMRMRD::ImageHeader::channels)
+		.def_readwrite("slice", &ISMRMRD::ImageHeader::slice)
+		.def_readwrite("contrast", &ISMRMRD::ImageHeader::contrast)
+		.def_readwrite("set", &ISMRMRD::ImageHeader::set)
+		.def_readwrite("phase", &ISMRMRD::ImageHeader::phase)
+		.def_readwrite("average", &ISMRMRD::ImageHeader::average)
+		.def_readwrite("repetition", &ISMRMRD::ImageHeader::repetition)
+		.def_readwrite("acquisition_time_stamp", &ISMRMRD::ImageHeader::acquisition_time_stamp)
+		.def_readwrite("image_data_type", &ISMRMRD::ImageHeader::image_data_type)
+		.def_readwrite("image_type", &ISMRMRD::ImageHeader::image_type)
+		.def_readwrite("image_index", &ISMRMRD::ImageHeader::image_index)
+		.def_readwrite("image_series_index", &ISMRMRD::ImageHeader::image_series_index)
+		;
+
+	// Both Python methods bind the return_data<> instantiations directly.
+	class_<Gadgetron::GadgetReference>("GadgetReference")
+		.def("return_acquisition", &Gadgetron::GadgetReference::return_data<ISMRMRD::AcquisitionHeader>)
+		.def("return_image", &Gadgetron::GadgetReference::return_data<ISMRMRD::ImageHeader>)
+		;
+
+	enum_<ISMRMRD::ImageDataType>("ImageDataType")
+		.value("DATA_COMPLEX_FLOAT", ISMRMRD::DATA_COMPLEX_FLOAT)
+		.value("DATA_FLOAT", ISMRMRD::DATA_FLOAT)
+		.value("DATA_UNSIGNED_SHORT", ISMRMRD::DATA_UNSIGNED_SHORT)
+		;
+
+	enum_<ISMRMRD::ImageType>("ImageType")
+		.value("TYPE_MAGNITUDE",ISMRMRD::TYPE_MAGNITUDE)
+		.value("TYPE_PHASE", ISMRMRD::TYPE_PHASE)
+		.value("TYPE_REAL",ISMRMRD::TYPE_REAL)
+		.value("TYPE_IMAG",ISMRMRD::TYPE_IMAG)
+		;
+
+	enum_<Gadgetron::GadgetMessageID>("GadgetMessageID")
+		.value("GADGET_MESSAGE_EXT_ID_MIN",Gadgetron::GADGET_MESSAGE_EXT_ID_MIN)
+		.value("GADGET_MESSAGE_ACQUISITION",Gadgetron::GADGET_MESSAGE_ACQUISITION)
+		.value("GADGET_MESSAGE_NEW_MEASUREMENT",Gadgetron::GADGET_MESSAGE_NEW_MEASUREMENT)
+		.value("GADGET_MESSAGE_END_OF_SCAN",Gadgetron::GADGET_MESSAGE_END_OF_SCAN)
+		.value("GADGET_MESSAGE_IMAGE_CPLX_FLOAT",Gadgetron::GADGET_MESSAGE_IMAGE_CPLX_FLOAT)
+		.value("GADGET_MESSAGE_IMAGE_REAL_FLOAT",Gadgetron::GADGET_MESSAGE_IMAGE_REAL_FLOAT)
+		.value("GADGET_MESSAGE_IMAGE_REAL_USHORT",Gadgetron::GADGET_MESSAGE_IMAGE_REAL_USHORT)
+		.value("GADGET_MESSAGE_ISMRMRD_ACQUISITION", Gadgetron::GADGET_MESSAGE_ISMRMRD_ACQUISITION)
+		.value("GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT", Gadgetron::GADGET_MESSAGE_ISMRMRD_IMAGE_CPLX_FLOAT)
+		.value("GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT", Gadgetron::GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_FLOAT)
+		.value("GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT", Gadgetron::GADGET_MESSAGE_ISMRMRD_IMAGE_REAL_USHORT)
+		.value("GADGET_MESSAGE_EMPTY",Gadgetron::GADGET_MESSAGE_EMPTY)
+		.value("GADGET_MESSAGE_EXT_ID_MAX",Gadgetron::GADGET_MESSAGE_EXT_ID_MAX)
+		;
+}
diff --git a/gadgets/python/GadgetronXML.py b/gadgets/python/GadgetronXML.py
new file mode 100644
index 0000000..b0aba02
--- /dev/null
+++ b/gadgets/python/GadgetronXML.py
@@ -0,0 +1,59 @@
+import xml.dom.minidom
+import numpy as np
+
+def getParameter(dom, path):
+    """Return a list with the first-child node values of the elements found at the dot-separated tag `path` in `dom`."""
+    ret_value = [];
+    path_element = path.split(".");
+    node = dom.getElementsByTagName(path_element[0])
+    level = 1;
+    while ((node.__len__()) > 0 and (level < path_element.__len__())):
+        node = node[0].getElementsByTagName(path_element[level])
+        level = level + 1
+    # Only the first match is followed at each intermediate level; all matches at the last level are collected
+    for it in node:
+        ret_value.append(it.childNodes[0].nodeValue)
+    
+    #Make sure there is one empty value to enable [0] to work
+    if (ret_value.__len__() == 0):
+        ret_value.append("")
+
+    return ret_value;
+
+#def getParameterFromSection(XMLstr, section, parameter):
+#    ret_val = "0"
+#    dom = xml.dom.minidom.parseString(XMLstr)
+#    sec = dom.getElementsByTagName(section)[0].getElementsByTagName("parameter")
+#    for i in range(len(sec)):
+#        if (sec[i].getAttribute("name") == parameter):
+#            ret_val = sec[i].getAttribute("value")
+#            break
+#
+#    return ret_val
+
+def getEncodingParameters(XMLstr):
+    dom = xml.dom.minidom.parseString(XMLstr)
+    enc = dict();
+
+    enc["trajectory"]                = int(getParameter(dom,"ismrmrdHeader.encoding.trajectory")[0])
+    enc["matrix_x"]                  = int(getParameter(dom,"ismrmrdHeader.encoding.encoded_space.matrix_size.x")[0])
+    enc["matrix_y"]                  = int(getParameter(dom,"ismrmrdHeader.encoding.encoded_space.matrix_size.x")[0])
+    
+    if (np.size(getParameter(dom,"gadgetron.encoding.kspace.matrix_size.value")) < 3):
+	enc["matrix_z"] = 0
+    else:
+    	enc["matrix_z"]                  = int(getParameter(dom,"gadgetron.encoding.kspace.matrix_size.value")[2])
+
+    if (enc["matrix_z"] == 0):
+        enc["matrix_z"] = 1
+
+    enc["readout_length"]            = int(getParameter(dom,"gadgetron.encoding.kspace.readout_length.value")[0])
+    enc["channels"]                  = int(getParameter(dom,"gadgetron.encoding.channels.value")[0])
+    enc["base_resolution"]           = int(getParameter(dom,"gadgetron.encoding.kspace.base_resolution.value")[0])
+    enc["phase_encoding_lines"]      = int(getParameter(dom,"gadgetron.encoding.kspace.phase_encoding_lines.value")[0])
+    enc["slices"]                    = int(getParameter(dom,"gadgetron.encoding.slices.value")[0])
+    #enc["noise_dwell_time_us"]       = float(getParameter(dom,"gadgetron.encoding.noise_dwell_time_us.value")[0])
+    #enc["acquisition_dwell_time_us"] = float(getParameter(dom,"gadgetron.encoding.acquisition_dwell_time_us.value")[0])
+    #enc["receiver_noise_bandwidth"]  = float(getParameter(dom,"gadgetron.encoding.receiver_noise_bandwidth.value")[0])
+
+    return enc
diff --git a/gadgets/python/PythonCommunicator.cpp b/gadgets/python/PythonCommunicator.cpp
new file mode 100644
index 0000000..c5cb67e
--- /dev/null
+++ b/gadgets/python/PythonCommunicator.cpp
@@ -0,0 +1,212 @@
+#include "PythonCommunicator.h"
+#include "../mri_core/GadgetMRIHeaders.h"
+
+#include <numpy/numpyconfig.h>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/arrayobject.h>
+#include <boost/algorithm/string.hpp>
+#include <ismrmrd.h>
+
+namespace Gadgetron{
+//Starts the embedded Python interpreter, imports the NumPy C API, releases the GIL, and adds $GADGETRON_HOME/lib to sys.path
+PythonCommunicator::PythonCommunicator()
+{
+	Py_Initialize();
+	_import_array();
+
+	PyEval_InitThreads();
+
+	//Swap out and return current thread state and release the GIL
+	//Must be done, otherwise subsequent calls to PyGILState_Ensure() will not be guaranteed to acquire lock
+	PyThreadState* tstate = PyEval_SaveThread();
+	if (!tstate) {
+		GADGET_DEBUG1("Error occurred returning lock to Python\n");
+	}
+
+	//Add <GADGETRON_HOME>/lib to the Python path. The string is only built after the null check:
+	//constructing std::string from a null pointer (variable unset) is undefined behavior.
+	const char* gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+	if (gadgetron_home != 0) {
+		std::string path_name = std::string(gadgetron_home) + std::string("/lib");
+		if (addPath(path_name) == GADGET_FAIL) {
+			GADGET_DEBUG2("PythonCommunicator (constructor) failed to add path %s\n", path_name.c_str());
+		}
+	}
+
+}
+
+PythonCommunicator::~PythonCommunicator()
+{
+
+}
+
+//Appends each ';'-separated entry of path to Python's sys.path unless already present; GADGET_FAIL if Python raises
+int PythonCommunicator::addPath(std::string path)
+{
+	PyGILState_STATE gstate = PyGILState_Ensure();
+	int status = GADGET_OK;
+	try {
+		std::vector<std::string> paths;
+		if (path.size() > 0) boost::split(paths, path, boost::is_any_of(";"));
+		for (unsigned int i = 0; i < paths.size(); i++) {
+			std::string path_cmd = std::string("import sys;\nif (sys.path.count(\"") + paths[i] +
+					std::string("\") == 0):\n\tsys.path.append(\"") + paths[i] + std::string("\")\n");
+			//GADGET_DEBUG2("Executing path command:\n%s\n", path_cmd.c_str());
+			boost::python::exec(path_cmd.c_str(),boost::python::import("__main__").attr("__dict__"));
+		}
+	} catch(boost::python::error_already_set const &) {
+		PyErr_Print(); //exec failed: report it here so the GIL is still released below instead of leaking
+		status = GADGET_FAIL;
+	}
+	PyGILState_Release(gstate);
+	return status;
+}
+
+//Imports Python module mod for gadget g, forces a reload of it, and looks up the functions named ref, conf and
+//process in it. Returns GADGET_FAIL for a null gadget, an empty module name, or any Python error.
+int PythonCommunicator::registerGadget(Gadget* g, std::string mod, 
+		std::string ref, std::string conf,
+		std::string process)
+{
+	PyGILState_STATE gstate;
+
+	if (!g) {
+		GADGET_DEBUG1("PythonCommunicator::registerGadget: Received null gadget\n");
+		return GADGET_FAIL;
+	}
+
+	gstate = PyGILState_Ensure();
+	try {
+		if (mod.size() != 0) {
+			module_[g] = boost::python::import(mod.c_str());
+
+			/* We will try to force a reload of the module */
+			boost::python::import("__main__").attr("__dict__")[mod.c_str()] = module_[g];
+			std::string tmp = std::string("reload(") + std::string(mod.c_str()) + std::string(")\n");
+
+			//GADGET_DEBUG2("Reloading with command: %s\n", tmp.c_str());
+			boost::python::exec(tmp.c_str(),boost::python::import("__main__").attr("__dict__"));
+
+		} else {
+			PyGILState_Release(gstate);
+			GADGET_DEBUG1("PythonCommunicator::registerGadget: Null module received\n");
+			return GADGET_FAIL;
+		}
+		//Hand the Python module a GadgetReference bound to g by calling its reference function
+		if (ref.size() != 0) {
+			gadget_ref_fnc_[g]  = module_[g].attr(ref.c_str());
+			gadget_ref_[g] = boost::shared_ptr<GadgetReference>(new GadgetReference());
+			gadget_ref_[g]->set_gadget(g);
+			gadget_ref_fnc_[g](*gadget_ref_[g].get());
+		}
+		//Remember the config and process callables; processConfig() and process() look them up by gadget
+		if (conf.size() != 0) {
+			config_fnc_[g] =  module_[g].attr(conf.c_str());
+		}
+
+		if (process.size() != 0) {
+			process_fnc_[g] = module_[g].attr(process.c_str());
+		}
+
+	} catch(boost::python::error_already_set const &) {
+		GADGET_DEBUG1("Error loading python modules\n");
+		PyErr_Print();
+		PyGILState_Release(gstate);
+		return GADGET_FAIL;
+	}
+	PyGILState_Release(gstate);
+
+	return GADGET_OK;
+}
+
+//Calls the Python config function registered for g with the contents of mb as a string.
+//Returns GADGET_FAIL for null arguments, when no config function is registered for g, or when Python raises.
+int PythonCommunicator::processConfig(Gadget* g, ACE_Message_Block* mb)
+{
+	if (!g || !mb) {
+		GADGET_DEBUG1("Null Gadget or message block received\n");
+		return GADGET_FAIL;
+	}
+
+	std::map< Gadget*, boost::python::object >::iterator it = config_fnc_.find(g);
+	if (it == config_fnc_.end()) {
+		GADGET_DEBUG2("No registered config function found for Gadget %s\n", g->module()->name());
+		return GADGET_FAIL;
+	}
+
+	PyGILState_STATE gstate = PyGILState_Ensure();
+	int status = GADGET_OK;
+	try {
+		//rd_ptr() is read as a C string, so the block must be NUL-terminated
+		boost::python::object ignored = it->second(boost::python::object(std::string(mb->rd_ptr())));
+	} catch(boost::python::error_already_set const &) {
+		GADGET_DEBUG2("Error calling process config function for Gadget %s\n", g->module()->name());
+		PyErr_Print();
+		status = GADGET_FAIL;
+	}
+
+	PyGILState_Release(gstate);
+	return status;
+}
+
+//Copies the data of m2 into a new NumPy array and calls the Python process function registered for g with
+//(header, array). m1 is released only when Python returns GADGET_OK; every failure path returns GADGET_FAIL.
+template<class T> int PythonCommunicator::process(Gadget* g, 
+		GadgetContainerMessage<T>* m1,
+		GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+{
+	PyGILState_STATE gstate;
+
+	std::map< Gadget*, boost::python::object >::iterator it;
+
+	if (!g) {
+		GADGET_DEBUG1("Null Gadget received");
+		return GADGET_FAIL;
+	}
+
+	it = process_fnc_.find(g);
+	if (it != process_fnc_.end()) {
+		gstate = PyGILState_Ensure();
+		try {
+			std::vector<size_t> dims = (*(m2->getObjectPtr()->get_dimensions().get()));
+			std::vector<int> dims2(dims.size()); //filled below with the dimensions in reversed order
+			for (unsigned int i = 0; i < dims.size(); i++) dims2[dims.size()-i-1] = static_cast<int>(dims[i]);
+			//Allocate a complex64 NumPy array with the reversed dimensions
+			boost::python::object obj(boost::python::handle<>(PyArray_FromDims(dims2.size(), &dims2[0], NPY_COMPLEX64)));
+			//boost::python::object data = boost::python::extract<boost::python::numeric::array>(obj);
+
+			//Copy the hoNDArray elements into the NumPy buffer
+			memcpy(PyArray_DATA((PyArrayObject*)obj.ptr()), m2->getObjectPtr()->get_data_ptr(), m2->getObjectPtr()->get_number_of_elements()*sizeof(std::complex<float>));
+
+			//Copy the header by value; it is passed to Python together with the array
+			T acq = *m1->getObjectPtr();
+
+			if ( boost::python::extract<int>(it->second(acq, obj)) != GADGET_OK) {
+				GADGET_DEBUG2("Gadget (%s) Returned from python call with error\n", g->module()->name());
+				PyGILState_Release(gstate);
+				return GADGET_FAIL;
+			}
+			//Else we are done with this now.
+			m1->release();
+		} catch(boost::python::error_already_set const &) {
+			GADGET_DEBUG1("Passing data on to python module failed\n");
+			PyErr_Print();
+			PyGILState_Release(gstate);
+			return GADGET_FAIL;
+		}
+		PyGILState_Release(gstate);
+	} else {
+		GADGET_DEBUG2("No registered process function found for Gadget %s\n", g->module()->name());
+		return GADGET_FAIL;
+	}
+	return GADGET_OK;
+}  
+
+
+//Template Instanciations
+template int PythonCommunicator::process(Gadget*, GadgetContainerMessage<ISMRMRD::AcquisitionHeader>*,
+		GadgetContainerMessage< hoNDArray< std::complex<float> > >*);
+
+template int PythonCommunicator::process(Gadget*, GadgetContainerMessage<ISMRMRD::ImageHeader>*,
+		GadgetContainerMessage< hoNDArray< std::complex<float> > >*);
+}
diff --git a/gadgets/python/PythonCommunicator.h b/gadgets/python/PythonCommunicator.h
new file mode 100644
index 0000000..c256b34
--- /dev/null
+++ b/gadgets/python/PythonCommunicator.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "Gadgetron.h"
+#include "Gadget.h"
+#include "hoNDArray.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetReference.h"
+#include "gadgetronpython_export.h"
+
+#include <ace/Singleton.h>
+#include <ace/Synch.h>
+
+#include <boost/python.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include <string>
+#include <complex>
+
+namespace Gadgetron{
+
+//Bridge between Gadgets and the embedded Python interpreter; all state is kept per registering Gadget
+class EXPORTGADGETSPYTHON PythonCommunicator
+{
+ public:
+  PythonCommunicator();
+  ~PythonCommunicator();
+  //Appends the ';'-separated entries of path to Python's sys.path
+  int addPath(std::string path);
+  //Imports module mod for g and looks up its functions named ref, conf and process
+  int registerGadget(Gadget* g, std::string mod, 
+		     std::string ref, std::string conf,
+		     std::string process);
+  //Calls the config function registered for g with the contents of mb
+  int processConfig(Gadget* g, ACE_Message_Block* mb);
+  //Calls the process function registered for g with the header in m1 and the data in m2
+  template<class T> int process(Gadget* g, 
+				GadgetContainerMessage<T>* m1,
+				GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2);
+
+ private:
+  //Per-Gadget state filled in by registerGadget()
+  std::map<Gadget*, boost::python::object> module_;
+  std::map<Gadget*, boost::python::object> gadget_ref_fnc_;
+  std::map<Gadget*, boost::python::object> config_fnc_;
+  std::map<Gadget*, boost::python::object> process_fnc_;
+  std::map<Gadget*, boost::shared_ptr<GadgetReference> > gadget_ref_;
+};
+
+typedef ACE_Singleton<PythonCommunicator, ACE_Thread_Mutex> PythonCommunicatorSingleton;
+}
diff --git a/gadgets/python/PythonGadget.cpp b/gadgets/python/PythonGadget.cpp
new file mode 100644
index 0000000..cf3f5b1
--- /dev/null
+++ b/gadgets/python/PythonGadget.cpp
@@ -0,0 +1,6 @@
+#include "PythonGadget.h"
+
+namespace Gadgetron{
+  GADGET_FACTORY_DECLARE(AcquisitionPythonGadget)
+  GADGET_FACTORY_DECLARE(ImagePythonGadget)
+}
diff --git a/gadgets/python/PythonGadget.h b/gadgets/python/PythonGadget.h
new file mode 100644
index 0000000..0731eff
--- /dev/null
+++ b/gadgets/python/PythonGadget.h
@@ -0,0 +1,99 @@
+#pragma once 
+
+#include "Gadget.h"
+#include "Gadgetron.h"
+#include "hoNDArray.h"
+#include "GadgetMRIHeaders.h"
+#include "PythonCommunicator.h"
+#include "gadgetronpython_export.h"
+
+#include <ismrmrd.h>
+#include <boost/python.hpp>
+#include <boost/algorithm/string.hpp>
+#include <stdio.h>
+#include <stdlib.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  //Gadget that forwards its configuration and its (header, data) messages to a Python module via PythonCommunicator
+  template <class T> class PythonGadget : 
+  public Gadget2<T, hoNDArray< std::complex<float> > >
+    {
+    protected:
+      int process_config(ACE_Message_Block* mb)
+      {
+	communicator_ = PythonCommunicatorSingleton::instance();
+	//Read the Python-related string properties of this gadget
+	boost::shared_ptr<std::string> pypath        = this->get_string_value("python_path");
+	boost::shared_ptr<std::string> pymod         = this->get_string_value("python_module");
+	boost::shared_ptr<std::string> pyreffunc     = this->get_string_value("gadget_reference_function");
+	boost::shared_ptr<std::string> pydatafunc    = this->get_string_value("input_function");
+	boost::shared_ptr<std::string> pyconfigfunc  = this->get_string_value("config_function");
+
+	GADGET_DEBUG2("Python Module          : %s\n", pymod.get()->c_str());
+	GADGET_DEBUG2("Python Ref Function    : %s\n", pyreffunc.get()->c_str());
+	GADGET_DEBUG2("Python Data Function   : %s\n", pydatafunc.get()->c_str());
+	GADGET_DEBUG2("Python Config Function : %s\n", pyconfigfunc.get()->c_str());
+	//Extend the Python path, register the module's functions, then forward the config message; any failure aborts
+	if (communicator_->addPath(*pypath.get()) != GADGET_OK) {
+	  GADGET_DEBUG2("Failed to add paths in Gadget %s\n", this->module()->name());
+	  return GADGET_FAIL;
+	}
+
+	if (communicator_->registerGadget(this, *pymod.get(),
+					  *pyreffunc.get(), *pyconfigfunc.get(),
+					  *pydatafunc.get()) != GADGET_OK) {
+	  GADGET_DEBUG2("Failed to register Gadget (%s) with PythonCommunicator\n", this->module()->name());
+	  return GADGET_FAIL;
+	}
+
+	if (communicator_->processConfig(this, mb) != GADGET_OK) {
+	  GADGET_DEBUG2("Failed to process config in Python module of Gadget (%s)\n", this->module()->name());
+	  return GADGET_FAIL;
+	}
+
+	return GADGET_OK;
+      }
+
+      int process(GadgetContainerMessage<T>* m1,
+		  GadgetContainerMessage< hoNDArray< std::complex<float> > >* m2)
+      {
+    
+	//We want to avoid a deadlock for the Python GIL if this python call results in an output that the GadgetReference will not be able to get rid of.
+	//This is kind of a nasty busy wait, maybe we should add an event handler to the NotificationStrategy of the Q or something, but for now, this will do it.
+	while (this->next()->msg_queue()->is_full()) {
+	  //GADGET_DEBUG2("Gadget (%s) sleeping while downstream Gadget (%s) does some work\n", this->module()->name(), this->next()->module()->name());
+	  ACE_Time_Value tv(0,10000); //Sleep for 10ms while the downstream Gadget does some work
+	  ACE_OS::sleep(tv);
+	}
+
+	//GADGET_DEBUG2("Process called in Gadget (%s)\n", this->module()->name());
+	if (communicator_->process(this,m1,m2) != GADGET_OK) {
+	  GADGET_DEBUG2("Failed to process data for Gadget (%s)\n", this->module()->name());
+	  return GADGET_FAIL;
+	}
+
+	//GADGET_DEBUG2("Process done in Gadget (%s)\n", this->module()->name());
+	return GADGET_OK;
+      }
+  
+    private:
+      PythonCommunicator* communicator_; //set in process_config() from the PythonCommunicator singleton
+    };
+  
+  class EXPORTGADGETSPYTHON AcquisitionPythonGadget :
+  public PythonGadget<ISMRMRD::AcquisitionHeader>
+  {
+  public:
+    GADGET_DECLARE(AcquisitionPythonGadget);
+  
+  };
+
+  class EXPORTGADGETSPYTHON ImagePythonGadget :
+  public PythonGadget<ISMRMRD::ImageHeader>
+  {
+  public:
+    GADGET_DECLARE(ImagePythonGadget);    
+  };
+}
diff --git a/gadgets/python/accumulate_and_recon.py b/gadgets/python/accumulate_and_recon.py
new file mode 100644
index 0000000..6c89e27
--- /dev/null
+++ b/gadgets/python/accumulate_and_recon.py
@@ -0,0 +1,83 @@
+import numpy as np
+import GadgetronPythonMRI as g
+import kspaceandimage as ki
+import libxml2
+
+myLocalGadgetReference = g.GadgetReference()
+myBuffer = 0
+myParameters = 0
+myCounter = 1;
+mySeries = 1;
+
+def set_gadget_reference(gadref):
+    global myLocalGadgetReference
+    myLocalGadgetReference = gadref
+
+def config_function(conf):
+    """Read the acquisition geometry from the XML header `conf` and allocate the zeroed k-space buffer."""
+    global myBuffer
+    global myParameters
+    myParameters = dict()
+    # The values are read with XPath queries in the ISMRMRD namespace
+    doc = libxml2.parseDoc(str(conf))
+    context = doc.xpathNewContext()
+    context.xpathRegisterNs("ismrm", "http://www.ismrm.org/ISMRMRD")
+    myParameters["matrix_x"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodedSpace/ismrm:matrixSize/ismrm:x")[0]).content)
+    myParameters["matrix_y"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodedSpace/ismrm:matrixSize/ismrm:y")[0]).content)
+    myParameters["matrix_z"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodedSpace/ismrm:matrixSize/ismrm:z")[0]).content)
+    myParameters["channels"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:acquisitionSystemInformation/ismrm:receiverChannels")[0]).content)
+    myParameters["slices"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodingLimits/ismrm:slice/ismrm:maximum")[0]).content)+1
+    myParameters["center_line"] = int((context.xpathEval("/ismrm:ismrmrdHeader/ismrm:encoding/ismrm:encodingLimits/ismrm:kspace_encoding_step_1/ismrm:center")[0]).content)
+    # Buffer axes: (channels, slices, matrix_z, matrix_y, matrix_x/2); the >>1 halves the x size
+    myBuffer = (np.zeros((myParameters["channels"],myParameters["slices"],myParameters["matrix_z"],myParameters["matrix_y"],(myParameters["matrix_x"]>>1)))).astype('complex64')
+
+def recon_function(acq, data):
+    global myLocalGadgetReference
+    global myBuffer
+    global myParameters
+    global myCounter
+    global mySeries
+
+    line_offset = (myParameters["matrix_y"]>>1)-myParameters["center_line"];
+    myBuffer[:,acq.idx.slice,acq.idx.kspace_encode_step_2,acq.idx.kspace_encode_step_1+line_offset,:] = data
+    
+    if (acq.flags & (1<<7)): #Is this the last scan in slice
+        image = ki.ktoi(myBuffer,(2,3,4))
+        image = image * np.product(image.shape)*100 #Scaling for the scanner
+        #Create a new image header and transfer value
+        img_head = g.ImageHeader()
+        img_head.channels = acq.active_channels
+        img_head.slice = acq.idx.slice
+        g.img_set_matrix_size(img_head, 0, myBuffer.shape[4])
+        g.img_set_matrix_size(img_head, 1, myBuffer.shape[3])
+        g.img_set_matrix_size(img_head, 2, myBuffer.shape[2])
+        g.img_set_position(img_head, 0,g.acq_get_position(acq,0))
+        g.img_set_position(img_head, 1,g.acq_get_position(acq,1))
+        g.img_set_position(img_head, 2,g.acq_get_position(acq,2))
+        g.img_set_read_dir(img_head, 0, g.acq_get_read_dir(acq, 0))
+        g.img_set_read_dir(img_head, 1, g.acq_get_read_dir(acq, 1))
+        g.img_set_read_dir(img_head, 2, g.acq_get_read_dir(acq, 2))
+        g.img_set_phase_dir(img_head, 0, g.acq_get_phase_dir(acq, 0))
+        g.img_set_phase_dir(img_head, 1, g.acq_get_phase_dir(acq, 1))
+        g.img_set_phase_dir(img_head, 2, g.acq_get_phase_dir(acq, 2))
+        g.img_set_slice_dir(img_head, 0, g.acq_get_slice_dir(acq, 0))
+        g.img_set_slice_dir(img_head, 1, g.acq_get_slice_dir(acq, 1))
+        g.img_set_slice_dir(img_head, 2, g.acq_get_slice_dir(acq, 2))
+	g.img_set_patient_table_position(img_head, 0, g.acq_get_patient_table_position(acq,0))
+	g.img_set_patient_table_position(img_head, 1, g.acq_get_patient_table_position(acq,1))
+	g.img_set_patient_table_position(img_head, 2, g.acq_get_patient_table_position(acq,2))
+        img_head.acquisition_time_stamp = acq.acquisition_time_stamp
+	img_head.image_index = myCounter;
+	img_head.image_series_index = mySeries;
+
+	myCounter = myCounter + 1
+	if (myCounter > 5):
+		mySeries = mySeries + 1
+		myCounter = 1
+
+        #Return image to Gadgetron
+	return myLocalGadgetReference.return_image(img_head,image.astype('complex64'))
+	
+    #print "Returning to Gadgetron"
+    return 0 #Everything OK
+
diff --git a/gadgets/python/gadgetronpython_export.h b/gadgets/python/gadgetronpython_export.h
new file mode 100644
index 0000000..b84050a
--- /dev/null
+++ b/gadgets/python/gadgetronpython_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRONPYTHON_EXPORT_H_
+#define GADGETRONPYTHON_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_PYTHON__) || defined (gadgetronpython_EXPORTS)
+#define EXPORTGADGETSPYTHON __declspec(dllexport)
+#else
+#define EXPORTGADGETSPYTHON __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETSPYTHON
+#endif
+
+#endif /* GADGETRONPYTHON_EXPORT_H_ */
diff --git a/gadgets/python/image_viewer.py b/gadgets/python/image_viewer.py
new file mode 100644
index 0000000..5d40ea1
--- /dev/null
+++ b/gadgets/python/image_viewer.py
@@ -0,0 +1,79 @@
+import GadgetronPythonMRI as g
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure   
+
+#from matplotlib.axes import Subplot   
+# uncomment to select /GTK/GTKAgg/GTKCairo
+from matplotlib.backends.backend_gtk import FigureCanvasGTK as FigureCanvas
+#from matplotlib.backends.backend_gtkagg import FigureCanvasGTKAgg as FigureCanvas
+#from matplotlib.backends.backend_gtkcairo import FigureCanvasGTKCairo as FigureCanvas
+
+# or NavigationToolbar for classic
+#from matplotlib.backends.backend_gtk import NavigationToolbar2GTK as NavigationToolbar
+from matplotlib.backends.backend_gtkagg import NavigationToolbar2GTKAgg as NavigationToolbar
+
+import pygtk
+pygtk.require('2.0')
+import gtk
+
+gadget_ref = g.GadgetReference()
+
+class ImageViewer:
+    """GTK window showing the magnitude of one image in a matplotlib canvas with a navigation toolbar."""
+    def __init__(self, img_data):
+        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
+        self.window.connect("delete_event", self.delete_event)
+        self.window.connect('key_press_event', self.on_key_press_event)
+        self.window.set_default_size(400,300)
+        self.window.set_title("Gadgetron Image Viewer")
+
+        self.vbox = gtk.VBox()
+        self.window.add(self.vbox)
+
+        self.fig = Figure(figsize=(5,4), dpi=100)
+        plt.gray()
+        # Show the magnitude of img_data with singleton dimensions squeezed out
+        self.ax = self.fig.add_subplot(111)
+        self.img_ax = self.ax.imshow(np.squeeze(np.abs(img_data)))
+
+        self.canvas = FigureCanvas(self.fig)  # a gtk.DrawingArea
+        self.vbox.pack_start(self.canvas)
+        self.toolbar = NavigationToolbar(self.canvas, self.window)
+        self.vbox.pack_start(self.toolbar, False, False)
+        self.window.show_all()
+
+    def delete_event(self, widget, event, data=None):
+        gtk.main_quit()
+        return False
+   
+    def on_key_press_event(self, widget, event, data=None):
+        keyname = gtk.gdk.keyval_name(event.keyval)
+        if (keyname == "Escape"):  # Escape destroys the window and leaves the GTK main loop
+            self.window.destroy()
+            gtk.main_quit()
+            return False
+
+    def main(self):
+        gtk.main()
+
+
+def set_gadget_reference(ref):
+    global gadget_ref
+    gadget_ref = ref
+
+def config_function(cfg):
+    global myWindow
+    print "Attempting to open window"
+    print "Window running"
+    #Configuration Ignored
+
+def recon_function(h,im):
+    """Show `im` in an ImageViewer window, run its GTK main loop, then pass the image on as complex64."""
+    global gadget_ref
+    myWindow = ImageViewer(im) 
+    myWindow.main()
+
+    return gadget_ref.return_image(h,im.astype('complex64'))
+
+
diff --git a/gadgets/python/kspaceandimage.py b/gadgets/python/kspaceandimage.py
new file mode 100644
index 0000000..6c42e90
--- /dev/null
+++ b/gadgets/python/kspaceandimage.py
@@ -0,0 +1,20 @@
+import numpy as np
+import numpy.fft as ft
+import numpy.fft.helper as fth
+
+def ktoi(data,axis=-1):
+    if (axis == -1):
+        ax = fth.arange(0,data.ndim)
+    else:
+        ax = axis
+
+    return fth.fftshift(ft.ifftn(fth.ifftshift(data,axes=ax),axes=ax),axes=ax)
+
+def itok(data,axis=-1):
+    if (axis == -1):
+        ax = fth.arange(0,data.ndim)
+    else:
+        ax = axis
+
+
+    return fth.fftshift(ft.fftn(fth.ifftshift(data,axes=ax),axes=ax),axes=ax)
diff --git a/gadgets/python/python.xml b/gadgets/python/python.xml
new file mode 100644
index 0000000..0396494
--- /dev/null
+++ b/gadgets/python/python.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveOversamplingPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>AcquisitionPythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>remove_2x_oversampling</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <gadget>
+      <name>AccReconPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>AcquisitionPythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>accumulate_and_recon</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <gadget>
+      <name>CoilCombinePython</name>
+      <dll>gadgetron_python</dll>
+      <classname>ImagePythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>rms_coil_combine</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <gadget>
+      <name>ImageViewPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>ImagePythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>image_viewer</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+     </gadget>
+  
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/python/python_short.xml b/gadgets/python/python_short.xml
new file mode 100644
index 0000000..3e83001
--- /dev/null
+++ b/gadgets/python/python_short.xml
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>RemoveOversamplingPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>AcquisitionPythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>remove_2x_oversampling</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <gadget>
+      <name>AccReconPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>AcquisitionPythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>accumulate_and_recon</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <gadget>
+      <name>CoilCombinePython</name>
+      <dll>gadgetron_python</dll>
+      <classname>ImagePythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>rms_coil_combine</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageViewPython</name>
+      <dll>gadgetron_python</dll>
+      <classname>ImagePythonGadget</classname>
+      <property><name>python_path</name>                  <value>/home/myuser/scripts/python</value></property>
+      <property><name>python_module</name>                <value>image_viewer</value></property>
+      <property><name>gadget_reference_function</name>    <value>set_gadget_reference</value></property>
+      <property><name>input_function</name>               <value>recon_function</value></property>
+      <property><name>config_function</name>              <value>config_function</value></property>
+    </gadget>
+    -->
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+     </gadget>
+
+     <gadget>
+      <name>Autoscale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+     </gadget>
+    
+     <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+  
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/python/remove_2x_oversampling.py b/gadgets/python/remove_2x_oversampling.py
new file mode 100644
index 0000000..91eb001
--- /dev/null
+++ b/gadgets/python/remove_2x_oversampling.py
@@ -0,0 +1,31 @@
+import numpy as np
+import GadgetronPythonMRI as g
+import kspaceandimage as ki
+import libxml2
+myGadgetReference = g.GadgetReference()
+
+def set_gadget_reference(gadref):
+    global myGadgetReference
+    myGadgetReference = gadref
+
+def config_function(conf):
+    #print "remove 2x oversampling: Configuration received"
+    #print str(conf)
+    return 
+
+def recon_function(acq, data):
+    global myVariable
+    global myGadgetReference
+
+    orig_size = list(data.shape);
+    data2 = data.reshape([(data.size/data.shape[data.ndim-1]), data.shape[data.ndim-1]])
+    new_length = data2.shape[1]>>1
+    data2 = ki.itok(ki.ktoi(data2,[1])[:,(0+(new_length>>1)):(new_length+(new_length>>1))],[1])
+    orig_size[data.ndim-1] = new_length
+    data2.reshape(tuple(orig_size))
+    acq.samples = new_length
+
+    return myGadgetReference.return_acquisition(acq,data2.astype('complex64'))
+
+    
+
diff --git a/gadgets/python/rms_coil_combine.py b/gadgets/python/rms_coil_combine.py
new file mode 100644
index 0000000..0d9a787
--- /dev/null
+++ b/gadgets/python/rms_coil_combine.py
@@ -0,0 +1,19 @@
+import GadgetronPythonMRI as g
+import numpy as np
+
+gadget_ref = g.GadgetReference()
+
+def set_gadget_reference(ref):
+    global gadget_ref
+    gadget_ref = ref
+
+def config_function(cfg):
+    print "RMS Coil Combine, Config ignored"
+
+def recon_function(h,im):
+    global gadget_ref
+    combined_image = np.sqrt(np.sum(np.square(np.abs(im)),axis=0))  # root-sum-of-squares of magnitudes over axis 0 (presumably the coil axis - confirm)
+    h.channels = 1  # header updated to match the collapsed axis
+    return gadget_ref.return_image(h,combined_image.astype('complex64'))  # result is cast to complex64 before being handed on
+
+
diff --git a/gadgets/radial/CMakeLists.txt b/gadgets/radial/CMakeLists.txt
new file mode 100644
index 0000000..7c4b6ce
--- /dev/null
+++ b/gadgets/radial/CMakeLists.txt
@@ -0,0 +1,39 @@
+IF (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_RADIAL__)
+ENDIF (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/sense
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+  )
+
+if(CUDA_FOUND)
+  include_directories(${CUDA_INCLUDE_DIRS})
+  
+  add_library(gadgetron_gpuradial SHARED 
+    gpuRadialSensePrepGadget.cpp 
+    ${ISMRMRD_XSD_SOURCE})
+  
+  target_link_libraries(gadgetron_gpuradial
+    gpunfft gpusolvers gpuoperators gpuparallelmri cpucore gpucore
+    ${ISMRMRD_LIBRARIES} ${XERCESC_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES}
+    optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+    )
+  
+  install (TARGETS gadgetron_gpuradial DESTINATION lib)
+endif(CUDA_FOUND)
+
+add_subdirectory(config)
diff --git a/gadgets/radial/config/CMakeLists.txt b/gadgets/radial/config/CMakeLists.txt
new file mode 100644
index 0000000..3b4c966
--- /dev/null
+++ b/gadgets/radial/config/CMakeLists.txt
@@ -0,0 +1,28 @@
+if (ARMADILLO_FOUND)  # these configs are only installed when Armadillo is available
+  install (FILES 
+    fixed_radial_mode0_realtime.xml 
+    fixed_radial_mode1_realtime.xml 
+    golden_radial_mode2_realtime.xml 
+    fixed_radial_mode0_gpusense_cg.xml 
+    fixed_radial_mode1_gpusense_cg.xml 
+    golden_radial_mode2_gpusense_cg.xml 
+    fixed_radial_mode0_gpusense_sb.xml 
+    fixed_radial_mode1_gpusense_sb.xml 
+    golden_radial_mode2_gpusense_sb.xml 
+    golden_radial_mode3_gpusense_sb.xml 
+    fixed_radial_mode0_gpu_ktsense.xml 
+    fixed_radial_mode1_gpu_ktsense.xml 
+    golden_radial_mode2_gpu_ktsense.xml 
+    DESTINATION config)
+else (ARMADILLO_FOUND)  # was elseif with the same condition, so the message below could never print
+  MESSAGE("Armadillo not found, only unoptimized radial config files will be available")
+endif (ARMADILLO_FOUND)
+
+install (FILES 
+  fixed_radial_mode0_gpusense_cg_unoptimized.xml 
+  fixed_radial_mode1_gpusense_cg_unoptimized.xml 
+  golden_radial_mode2_gpusense_cg_unoptimized.xml 
+  fixed_radial_mode0_gpusense_sb_unoptimized.xml 
+  fixed_radial_mode1_gpusense_sb_unoptimized.xml 
+  golden_radial_mode2_gpusense_sb_unoptimized.xml 
+  DESTINATION config)
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpu_ktsense.xml b/gadgets/radial/config/fixed_radial_mode0_gpu_ktsense.xml
new file mode 100644
index 0000000..971d86a
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpu_ktsense.xml
@@ -0,0 +1,157 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>8</value></property>
+      <property><name>buffer_length_in_rotations</name><value>8</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>sliding_window_rotations</name><value>4</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>4</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_cg.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg.xml
new file mode 100644
index 0000000..202506c
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg.xml
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_cg_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..9dc0987
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_cg_unoptimized.xml
@@ -0,0 +1,140 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_sb.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb.xml
new file mode 100644
index 0000000..1df2f3e
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb.xml
@@ -0,0 +1,163 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_gpusense_sb_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..68bc150
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_gpusense_sb_unoptimized.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode0_realtime.xml b/gadgets/radial/config/fixed_radial_mode0_realtime.xml
new file mode 100644
index 0000000..9299321
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode0_realtime.xml
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>0</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpu_ktsense.xml b/gadgets/radial/config/fixed_radial_mode1_gpu_ktsense.xml
new file mode 100644
index 0000000..17cbdd4
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpu_ktsense.xml
@@ -0,0 +1,157 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>sliding_window_rotations</name><value>2</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>2</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>2</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>2</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_cg.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg.xml
new file mode 100644
index 0000000..a4aae87
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg.xml
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_cg_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..12bc114
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_cg_unoptimized.xml
@@ -0,0 +1,140 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_sb.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb.xml
new file mode 100644
index 0000000..ba6e2e1
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb.xml
@@ -0,0 +1,163 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_gpusense_sb_unoptimized.xml b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..4c1a251
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_gpusense_sb_unoptimized.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/fixed_radial_mode1_realtime.xml b/gadgets/radial/config/fixed_radial_mode1_realtime.xml
new file mode 100644
index 0000000..3a4a0ff
--- /dev/null
+++ b/gadgets/radial/config/fixed_radial_mode1_realtime.xml
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>1</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpu_ktsense.xml b/gadgets/radial/config/golden_radial_mode2_gpu_ktsense.xml
new file mode 100644
index 0000000..8d49c1b
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpu_ktsense.xml
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+      <property><name>sliding_window_rotations</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>25</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>16</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>25</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>16</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgKtSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgKtSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>25</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>rotations_to_discard</name>    <value>16</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_cg.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_cg.xml
new file mode 100644
index 0000000..683203c
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_cg.xml
@@ -0,0 +1,155 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>16</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>16</value></property>
+      <property><name>buffer_length_in_rotations</name><value>2</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+     <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_cg_unoptimized.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..29bb68c
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_cg_unoptimized.xml
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>16</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>16</value></property>
+      <property><name>buffer_length_in_rotations</name><value>2</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_sb.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_sb.xml
new file mode 100644
index 0000000..dec345b
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_sb.xml
@@ -0,0 +1,165 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_gpusense_sb_unoptimized.xml b/gadgets/radial/config/golden_radial_mode2_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..34289ca
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_gpusense_sb_unoptimized.xml
@@ -0,0 +1,152 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget_unoptimized</classname>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode2_realtime.xml b/gadgets/radial/config/golden_radial_mode2_realtime.xml
new file mode 100644
index 0000000..b2fc23e
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode2_realtime.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>8</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>2</value></property>
+      <property><name>buffer_length_in_rotations</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>profiles_per_frame</name><value>32</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuCgSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuCgSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuCgSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name><value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_iterations</name>    <value>5</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>kappa</name>                   <value>0.3</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/config/golden_radial_mode3_gpusense_sb.xml b/gadgets/radial/config/golden_radial_mode3_gpusense_sb.xml
new file mode 100644
index 0000000..bfbdeb0
--- /dev/null
+++ b/gadgets/radial/config/golden_radial_mode3_gpusense_sb.xml
@@ -0,0 +1,165 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+         
+    <reader>
+      <slot>1008</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+    <writer>
+      <slot>1004</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+      <slot>1005</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+      <slot>1006</slot>
+      <dll>gadgetron_mricore</dll>
+      <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <gadget>
+      <name>NoiseAdjust</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>PCA</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>PCACoilGadget</classname>
+    </gadget>
+
+    <gadget>
+      <name>CoilReduction</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>CoilReductionGadget</classname>
+      <property><name>coils_out</name><value>16</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuRadialSensePrepGadget</name>
+      <dll>gadgetron_gpuradial</dll>
+      <classname>gpuRadialSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>mode</name><value>3</value></property>
+      <property><name>profiles_per_frame</name><value>16</value></property>
+      <property><name>rotations_per_reconstruction</name><value>32</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_frames_per_rotation</name><value>32</value></property>
+      <property><name>buffer_length_in_rotations</name><value>1</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+    </gadget>
+    
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice1</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>1</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>gpuSbSenseGadget_slice2</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>sliceno</name>                 <value>2</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>1.0</value></property>
+      <property><name>lambda</name>                  <value>2.0</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+
+    <gadget>
+      <name>Extract</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ExtractGadget</classname>
+    </gadget>
+
+    <!--
+	<gadget>
+	<name>ImageWrite</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageWriterGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>AutoScale</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>AutoScaleGadget</classname>
+    </gadget>
+        
+    <gadget>
+      <name>FloatToShort</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>FloatToUShortGadget</classname>
+    </gadget>
+    
+    <!--
+	<gadget>
+	<name>ImageFinishCPLX</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetCPLX</classname>
+	</gadget>
+    -->
+    
+    <!--
+	<gadget>
+	<name>ImageFinishFLOAT</name>
+	<dll>gadgetron_mricore</dll>
+	<classname>ImageFinishGadgetFLOAT</classname>
+	</gadget>
+    -->
+    
+    <gadget>
+      <name>ImageFinishUSHORT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+    
+</gadgetronStreamConfiguration>
diff --git a/gadgets/radial/gadgetron_radial_export.h b/gadgets/radial/gadgetron_radial_export.h
new file mode 100644
index 0000000..ee08991
--- /dev/null
+++ b/gadgets/radial/gadgetron_radial_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_RADIAL_EXPORT_H_
+#define GADGETRON_RADIAL_EXPORT_H_
+// EXPORTGADGETS_RADIAL: dllexport on WIN32 when __BUILD_GADGETRON_RADIAL__ is defined, dllimport on other WIN32 builds, empty elsewhere.
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_RADIAL__)
+#define EXPORTGADGETS_RADIAL __declspec(dllexport)
+#else
+#define EXPORTGADGETS_RADIAL __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETS_RADIAL
+#endif
+
+#endif /* GADGETRON_RADIAL_EXPORT_H_ */
diff --git a/gadgets/radial/gpuRadialSensePrepGadget.cpp b/gadgets/radial/gpuRadialSensePrepGadget.cpp
new file mode 100644
index 0000000..0b1a911
--- /dev/null
+++ b/gadgets/radial/gpuRadialSensePrepGadget.cpp
@@ -0,0 +1,998 @@
+#include "gpuRadialSensePrepGadget.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "SenseJob.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "check_CUDA.h"
+#include "radial_utilities.h"
+#include "hoNDArray_fileio.h"
+
+#include <algorithm>
+#include <vector>
+#include <cmath>
+
+namespace Gadgetron{
+
+  gpuRadialSensePrepGadget::gpuRadialSensePrepGadget()
+    : slices_(-1)
+    , sets_(-1)
+    , device_number_(-1)
+    , mode_(-1)
+    , samples_per_profile_(-1)
+  {
+    // Register fallback values for the properties below, so that a stream
+    // configuration which omits them still yields a usable setting.
+
+    set_parameter("mode", "0");
+    set_parameter("deviceno", "0");
+    set_parameter("buffer_length_in_rotations", "1");
+    set_parameter("buffer_using_solver", "false");
+    set_parameter("buffer_convolution_kernel_width", "5.5");
+    set_parameter("buffer_convolution_oversampling_factor", "1.25");
+    set_parameter("rotations_per_reconstruction", "0");
+    set_parameter("reconstruction_os_factor_x", "1.0");
+    set_parameter("reconstruction_os_factor_y", "1.0");
+  }
+  
+  gpuRadialSensePrepGadget::~gpuRadialSensePrepGadget() {} // Empty body: the destructor performs no explicit cleanup.
+  
+  int gpuRadialSensePrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Reads the gadget properties, selects the CUDA device, parses the Ismrmrd header
+    // and allocates the per-(set,slice) queues, counters and buffers.
+    // Returns GADGET_OK on success and GADGET_FAIL on any validation or setup error.
+    //GADGET_DEBUG1("gpuRadialSensePrepGadget::process_config\n");
+    // Get configuration values from config file
+    mode_ = get_int_value("mode");
+    device_number_ = get_int_value("deviceno");
+    rotations_per_reconstruction_ = get_int_value("rotations_per_reconstruction");
+    buffer_length_in_rotations_ = get_int_value("buffer_length_in_rotations");
+    buffer_using_solver_ = get_bool_value("buffer_using_solver");
+    output_timing_ = get_bool_value("output_timing");
+
+    // Currently there are some restrictions on the allowed sliding window configurations
+    //
+
+    sliding_window_profiles_ = get_int_value("sliding_window_profiles");
+    sliding_window_rotations_ = get_int_value("sliding_window_rotations");
+
+    if( sliding_window_profiles_>0 && sliding_window_rotations_>0 ){
+      GADGET_DEBUG1( "Error: Sliding window reconstruction is not yet supported for both profiles and frames simultaneously.\n" );
+      return GADGET_FAIL;
+    }
+
+    if( sliding_window_profiles_>0 && rotations_per_reconstruction_>0 ){
+      GADGET_DEBUG1( "Error: Sliding window reconstruction over profiles is not yet supported for multiframe reconstructions.\n" );
+      return GADGET_FAIL;
+    }
+
+    if( sliding_window_rotations_ > 0 && sliding_window_rotations_ >= rotations_per_reconstruction_ ){
+      GADGET_DEBUG1( "Error: Illegal sliding window configuration.\n" );
+      return GADGET_FAIL;
+    }
+
+    // Setup and validate device configuration
+    //
+
+    int number_of_devices;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GADGET_DEBUG1( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GADGET_DEBUG2("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+
+    unsigned int warp_size = deviceProp.warpSize;
+
+    // Convolution kernel width and oversampling ratio (for the buffer)
+    //
+
+    kernel_width_ = get_double_value("buffer_convolution_kernel_width");
+    oversampling_factor_ = get_double_value("buffer_convolution_oversampling_factor");
+
+    // Get the Ismrmrd header
+    //
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    if( cfg.get() == 0x0 ){
+      GADGET_DEBUG1("Unable to parse Ismrmrd header\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size())); // %d expects an int
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    // Matrix sizes (as a multiple of the GPU's warp size)
+    //
+
+    image_dimensions_.push_back(((e_space.matrixSize().x()+warp_size-1)/warp_size)*warp_size);
+    image_dimensions_.push_back(((e_space.matrixSize().y()+warp_size-1)/warp_size)*warp_size);
+
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize().x()*get_double_value("reconstruction_os_factor_x")))+warp_size-1)/warp_size)*warp_size);
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize().y()*get_double_value("reconstruction_os_factor_y")))+warp_size-1)/warp_size)*warp_size);
+
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]);
+
+    GADGET_DEBUG2("matrix_size_x : %d, recon: %d, recon_os: %d\n",
+                  static_cast<int>(image_dimensions_[0]), static_cast<int>(image_dimensions_recon_[0]), static_cast<int>(image_dimensions_recon_os_[0]));
+
+    GADGET_DEBUG2("matrix_size_y : %d, recon: %d, recon_os: %d\n",
+                  static_cast<int>(image_dimensions_[1]), static_cast<int>(image_dimensions_recon_[1]), static_cast<int>(image_dimensions_recon_os_[1]));
+
+    fov_.push_back(r_space.fieldOfView_mm().x());
+    fov_.push_back(r_space.fieldOfView_mm().y());
+    fov_.push_back(r_space.fieldOfView_mm().z());
+
+    slices_ = e_limits.slice().present() ? e_limits.slice().get().maximum() + 1 : 1;
+    sets_ = e_limits.set().present() ? e_limits.set().get().maximum() + 1 : 1;
+
+    // Allocate profile queues
+    // - one queue for the currently incoming frame
+    // - one queue for the next reconstruction
+
+    frame_profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_profiles_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    image_headers_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*image_dimensions_[0]*10;
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      frame_profiles_queue_[i].high_water_mark(bsize);
+      frame_profiles_queue_[i].low_water_mark(bsize);
+    }
+
+    bsize *= (rotations_per_reconstruction_+1);
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      recon_profiles_queue_[i].high_water_mark(bsize);
+      recon_profiles_queue_[i].low_water_mark(bsize);
+    }
+
+    // Define some profile counters for book-keeping
+    //
+
+    previous_profile_ = boost::shared_array<long>(new long[slices_*sets_]);
+    image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+    profiles_counter_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    profiles_counter_global_= boost::shared_array<long>(new long[slices_*sets_]);
+    profiles_per_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_update_needed_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    reconfigure_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    num_coils_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+
+    if( !previous_profile_.get() ||
+        !image_counter_.get() ||
+        !profiles_counter_frame_.get() ||
+        !profiles_counter_global_.get() ||
+        !profiles_per_frame_.get() ||
+        !frames_per_rotation_.get() ||
+        !buffer_frames_per_rotation_.get() ||
+        !buffer_update_needed_.get() ||
+        !num_coils_.get() ||
+        !reconfigure_.get() ){
+      GADGET_DEBUG1("Failed to allocate host memory (1)\n");
+      return GADGET_FAIL;
+    }
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+
+      previous_profile_[i] = -1;
+      image_counter_[i] = 0;
+      profiles_counter_frame_[i] = 0;
+      profiles_counter_global_[i] = 0;
+      profiles_per_frame_[i] = get_int_value("profiles_per_frame");
+      frames_per_rotation_[i] = get_int_value("frames_per_rotation");
+      buffer_frames_per_rotation_[i] = get_int_value("buffer_frames_per_rotation");
+      num_coils_[i] = 0;
+      buffer_update_needed_[i] = true;
+      reconfigure_[i] = true;
+
+      // Assign some default values ("upper bound estimates") of the (possibly) unknown entities
+      //
+
+      if( profiles_per_frame_[i] == 0 ){
+        profiles_per_frame_[i] = image_dimensions_[0];
+      }
+
+      if( frames_per_rotation_[i] == 0 ){
+        if( mode_ == 2 || mode_ == 3 ) // golden ratio
+          frames_per_rotation_[i] = 1;
+        else
+          frames_per_rotation_[i] = image_dimensions_[0]/profiles_per_frame_[i];
+      }
+
+      bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*
+        std::max(1L, frames_per_rotation_[i]*rotations_per_reconstruction_);
+
+      image_headers_queue_[i].high_water_mark(bsize);
+      image_headers_queue_[i].low_water_mark(bsize);
+    }
+
+    position_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    read_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    phase_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    slice_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+
+    if( !position_.get() || !read_dir_.get() || !phase_dir_.get() || !slice_dir_.get() ){
+      GADGET_DEBUG1("Failed to allocate host memory (2)\n");
+      return GADGET_FAIL;
+    }
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      (position_[i])[0] = (position_[i])[1] = (position_[i])[2] = 0.0f;
+      (read_dir_[i])[0] = (read_dir_[i])[1] = (read_dir_[i])[2] = 0.0f;
+      (phase_dir_[i])[0] = (phase_dir_[i])[1] = (phase_dir_[i])[2] = 0.0f;
+      (slice_dir_[i])[0] = (slice_dir_[i])[1] = (slice_dir_[i])[2] = 0.0f;
+    }
+
+    // Allocate accumulation buffer
+    //
+
+    if( buffer_using_solver_ )
+      acc_buffer_cg_ = boost::shared_array< cuSenseBufferCg<float,2> >(new cuSenseBufferCg<float,2>[slices_*sets_]);
+    else
+      acc_buffer_ = boost::shared_array< cuSenseBuffer<float,2> >(new cuSenseBuffer<float,2>[slices_*sets_]);
+
+    // Allocate remaining shared_arrays
+    //
+
+    csm_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+    reg_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+
+    host_traj_recon_ = boost::shared_array< hoNDArray<floatd2> >(new hoNDArray<floatd2>[slices_*sets_]);
+    host_weights_recon_ = boost::shared_array< hoNDArray<float> >(new hoNDArray<float>[slices_*sets_]);
+
+    if( !csm_host_.get() || !reg_host_.get() || !host_traj_recon_.get() || !host_weights_recon_.get() ){
+      GADGET_DEBUG1("Failed to allocate host memory (3)\n");
+      return GADGET_FAIL;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuRadialSensePrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+          GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Queues a copy of the incoming profile for its (set,slice), updates the accumulation buffer when a frame
+    // completes and passes a SenseJob downstream once enough profiles are queued for a reconstruction.
+    // Noise should have been consumed by the noise adjust (if in the gadget chain)
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    unsigned int profile = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+
+    // The per-(set,slice) state allocated in process_config() holds slices_*sets_ entries; reject indices beyond it
+    if( slices_ <= 0 || sets_ <= 0 || slice >= static_cast<unsigned int>(slices_) || set >= static_cast<unsigned int>(sets_) ){
+      GADGET_DEBUG1("Error: slice or set index is outside the encoding limits given in the header\n");
+      return GADGET_FAIL;
+    }
+
+    // Get a pointer to the accumulation buffer.
+
+    cuSenseBuffer<float,2> *acc_buffer = (buffer_using_solver_) ? &acc_buffer_cg_[set*slices_+slice] : &acc_buffer_[set*slices_+slice];
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuRadialSensePrepGadget::process()") );
+
+    // Has the imaging plane changed?
+
+    if( !vec_equal(position_[set*slices_+slice], m1->getObjectPtr()->position) ||
+        !vec_equal(read_dir_[set*slices_+slice], m1->getObjectPtr()->read_dir) ||
+        !vec_equal(phase_dir_[set*slices_+slice], m1->getObjectPtr()->phase_dir) ||
+        !vec_equal(slice_dir_[set*slices_+slice], m1->getObjectPtr()->slice_dir) ){
+
+      // Yes indeed, clear the accumulation buffer
+      acc_buffer->clear();
+      buffer_update_needed_[set*slices_+slice] = true;
+
+      memcpy(position_[set*slices_+slice],m1->getObjectPtr()->position,3*sizeof(float));
+      memcpy(read_dir_[set*slices_+slice],m1->getObjectPtr()->read_dir,3*sizeof(float));
+      memcpy(phase_dir_[set*slices_+slice],m1->getObjectPtr()->phase_dir,3*sizeof(float));
+      memcpy(slice_dir_[set*slices_+slice],m1->getObjectPtr()->slice_dir,3*sizeof(float));
+    }
+
+    // Only when the first profile arrives, do we know the #samples/profile
+
+    if( samples_per_profile_ == -1 )
+      samples_per_profile_ = m1->getObjectPtr()->number_of_samples;
+
+    if( samples_per_profile_ != m1->getObjectPtr()->number_of_samples ){
+      GADGET_DEBUG1("Unexpected change in the incoming profiles' lengths\n");
+      return GADGET_FAIL;
+    }
+
+    bool new_frame_detected = false;
+
+    // Reconfigure at first pass
+    // - or if the number of coils changes
+    // - or if the reconfigure_ flag is set
+
+    if( num_coils_[set*slices_+slice] != m1->getObjectPtr()->active_channels ){
+      GADGET_DEBUG1("Reconfiguring due to change in the number of coils\n");
+      num_coils_[set*slices_+slice] = m1->getObjectPtr()->active_channels;
+      reconfigure(set, slice);
+    }
+
+    if( reconfigure_[set*slices_+slice] ){
+      GADGET_DEBUG1("Reconfiguring due to boolean indicator\n");
+      reconfigure(set, slice);
+    }
+
+    // Keep track of the incoming profile ids (mode dependent)
+    // - to determine the number of profiles per frame
+    // - to determine the number of frames per rotation
+
+    if (previous_profile_[set*slices_+slice] >= 0) {
+
+      if ( profile > previous_profile_[set*slices_+slice]) { // this is not the last profile in the frame
+        if( mode_ == 0 && get_int_value("frames_per_rotation") == 0 ){
+          unsigned int acceleration_factor = profile - previous_profile_[set*slices_+slice];
+          if( acceleration_factor != frames_per_rotation_[set*slices_+slice] ){
+            GADGET_DEBUG1("Reconfiguring due to change in acceleration factor\n");
+            frames_per_rotation_[set*slices_+slice] = acceleration_factor;
+            reconfigure(set, slice);
+          }
+        }
+      }
+      else{ // This is the first profile in a new frame
+        if( get_int_value("profiles_per_frame") == 0 && // make sure the user did not specify a desired value for this variable
+            profiles_counter_frame_[set*slices_+slice] > 0 &&
+            profiles_counter_frame_[set*slices_+slice] != profiles_per_frame_[set*slices_+slice] ){ // a new acceleration factor is detected
+          GADGET_DEBUG1("Reconfiguring due to new slice detection\n");
+          new_frame_detected = true;
+          profiles_per_frame_[set*slices_+slice] = profiles_counter_frame_[set*slices_+slice];
+          if( mode_ == 1 && get_int_value("frames_per_rotation") == 0 )
+            frames_per_rotation_[set*slices_+slice] = image_dimensions_[0]/profiles_per_frame_[set*slices_+slice];
+          reconfigure(set, slice);
+        }
+      }
+    }
+    previous_profile_[set*slices_+slice] = profile;
+
+    // Enqueue profile
+    // - if 'new_frame_detected' the current profile does not belong to the current frame and we delay enqueuing
+
+    if( !new_frame_detected ) {
+
+      // Memory handling is easier if we make copies for our internal queues
+      frame_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+      recon_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+    }
+
+    // If the profile is the last of a "true frame" (ignoring any sliding window profiles)
+    // - then update the accumulation buffer
+
+    bool is_last_profile_in_frame = (profiles_counter_frame_[set*slices_+slice] == profiles_per_frame_[set*slices_+slice]-1);
+    is_last_profile_in_frame |= new_frame_detected;
+
+    if( is_last_profile_in_frame ){
+
+      // Extract this frame's samples to update the csm/regularization buffer
+      //
+
+      boost::shared_ptr< hoNDArray<float_complext> > host_samples =
+        extract_samples_from_queue( &frame_profiles_queue_[set*slices_+slice], false, set, slice );
+
+      if( host_samples.get() == 0x0 ){
+        GADGET_DEBUG1("Failed to extract frame data from queue\n");
+        return GADGET_FAIL;
+      }
+
+      cuNDArray<float_complext> samples( host_samples.get() );
+
+      long profile_offset = profiles_counter_global_[set*slices_+slice] - ((new_frame_detected) ? 1 : 0);
+      boost::shared_ptr< cuNDArray<floatd2> > traj = calculate_trajectory_for_frame(profile_offset, set, slice);
+
+      buffer_update_needed_[set*slices_+slice] |= acc_buffer->add_frame_data( &samples, traj.get() );
+    }
+
+    // Are we ready to reconstruct (downstream)?
+    //
+
+    long profiles_per_reconstruction = profiles_per_frame_[set*slices_+slice];
+
+    if( rotations_per_reconstruction_ > 0 )
+      profiles_per_reconstruction *= (frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_);
+
+    bool is_last_profile_in_reconstruction = ( recon_profiles_queue_[set*slices_+slice].message_count() == profiles_per_reconstruction );
+
+    // Prepare the image header for this frame
+    // - if this is indeed the last profile of a new frame
+    // - or if we are about to reconstruct due to 'sliding_window_profiles_' > 0
+
+    if( is_last_profile_in_frame ||
+        (is_last_profile_in_reconstruction && image_headers_queue_[set*slices_+slice].message_count() == 0) ){
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *header = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+      ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+
+      {
+        // Initialize header to all zeroes (there are a few fields we do not set yet)
+        ISMRMRD::ImageHeader tmp = {0};
+        *(header->getObjectPtr()) = tmp;
+      }
+
+      header->getObjectPtr()->version = base_head->version;
+
+      header->getObjectPtr()->matrix_size[0] = image_dimensions_recon_[0];
+      header->getObjectPtr()->matrix_size[1] = image_dimensions_recon_[1];
+      header->getObjectPtr()->matrix_size[2] = std::max(1L,frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_);
+
+      header->getObjectPtr()->field_of_view[0] = fov_[0];
+      header->getObjectPtr()->field_of_view[1] = fov_[1];
+      header->getObjectPtr()->field_of_view[2] = fov_[2];
+
+      header->getObjectPtr()->channels = num_coils_[set*slices_+slice];
+      header->getObjectPtr()->slice = base_head->idx.slice;
+      header->getObjectPtr()->set = base_head->idx.set;
+
+      header->getObjectPtr()->acquisition_time_stamp = base_head->acquisition_time_stamp;
+      memcpy(header->getObjectPtr()->physiology_time_stamp, base_head->physiology_time_stamp, sizeof(uint32_t)*ISMRMRD_PHYS_STAMPS);
+
+      memcpy(header->getObjectPtr()->position, base_head->position, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->read_dir, base_head->read_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->phase_dir, base_head->phase_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->slice_dir, base_head->slice_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+
+      header->getObjectPtr()->image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+      header->getObjectPtr()->image_index = image_counter_[set*slices_+slice]++;
+      header->getObjectPtr()->image_series_index = set*slices_+slice;
+
+      image_headers_queue_[set*slices_+slice].enqueue_tail(header);
+    }
+
+    // If it is time to reconstruct (downstream) then prepare the Sense job
+    //
+
+    if( is_last_profile_in_reconstruction ){
+
+      // Update csm and regularization images if the buffer has changed (completed a cycle)
+      // - and at the first pass
+
+      if( buffer_update_needed_[set*slices_+slice] ||
+          csm_host_[set*slices_+slice].get_number_of_elements() == 0 ||
+          reg_host_[set*slices_+slice].get_number_of_elements() == 0 ){
+
+        // Get the accumulated coil images
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > csm_data = acc_buffer->get_accumulated_coil_images();
+
+        if( !csm_data.get() ){
+          GADGET_DEBUG1("Error during accumulation buffer computation\n");
+          return GADGET_FAIL;
+        }
+
+        // Estimate CSM
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > csm = estimate_b1_map<float,2>( csm_data.get() );
+
+        if( !csm.get() ){
+          GADGET_DEBUG1("Error during coil estimation\n");
+          return GADGET_FAIL;
+        }
+
+        acc_buffer->set_csm(csm);
+        csm_host_[set*slices_+slice] = *(csm->to_host());
+
+        // Compute regularization image
+        //
+
+        boost::shared_ptr< cuNDArray<float_complext> > reg_image;
+
+        if( buffer_using_solver_ && ( mode_ == 2 || mode_ == 3 ) ){
+          ((cuSenseBufferCg<float,2>*)acc_buffer)->preprocess
+            ( calculate_trajectory_for_rhs( profiles_counter_global_[set*slices_+slice] - ((new_frame_detected) ? 1 : 0), set, slice).get());
+        }
+
+        reg_image = acc_buffer->get_combined_coil_image();
+
+        if( !reg_image.get() ){
+          GADGET_DEBUG1("Error computing regularization image\n");
+          return GADGET_FAIL;
+        }
+
+        reg_host_[set*slices_+slice] = *(reg_image->to_host());
+
+        /*
+          static int counter = 0;
+          char filename[256];
+          sprintf((char*)filename, "reg_%d.cplx", counter);
+          write_nd_array<float_complext>( reg_host_[set*slices_+slice].get(), filename );
+          counter++; */
+
+        buffer_update_needed_[set*slices_+slice] = false;
+      }
+
+      // Prepare data array of the profiles for the downstream reconstruction
+      //
+
+      boost::shared_ptr< hoNDArray<float_complext> > samples_host =
+        extract_samples_from_queue( &recon_profiles_queue_[set*slices_+slice], true, set, slice );
+
+      if( samples_host.get() == 0x0 ){
+        GADGET_DEBUG1("Failed to extract frame data from queue\n");
+        return GADGET_FAIL;
+      }
+
+      // The trajectory needs to be updated on the fly:
+      // - for golden ratio based acquisitions
+      // - when we are reconstructing frame-by-frame
+
+      if( mode_ == 2 || mode_ == 3 || rotations_per_reconstruction_ == 0 ){
+        calculate_trajectory_for_reconstruction
+          ( profiles_counter_global_[set*slices_+slice] - ((new_frame_detected) ? 1 : 0), set, slice );
+      }
+
+      // Set up Sense job
+      //
+
+      GadgetContainerMessage< SenseJob >* m4 = new GadgetContainerMessage< SenseJob >();
+
+      m4->getObjectPtr()->dat_host_ = samples_host;
+      m4->getObjectPtr()->tra_host_ = boost::shared_ptr< hoNDArray<floatd2> >(new hoNDArray<floatd2>(host_traj_recon_[set*slices_+slice]));
+      m4->getObjectPtr()->dcw_host_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>(host_weights_recon_[set*slices_+slice]));
+      m4->getObjectPtr()->csm_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(csm_host_[set*slices_+slice]));
+      m4->getObjectPtr()->reg_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(reg_host_[set*slices_+slice]));
+
+      // Pull the image headers out of the queue
+      //
+
+      long frames_per_reconstruction =
+        std::max( 1L, frames_per_rotation_[set*slices_+slice]*rotations_per_reconstruction_ );
+
+      if( image_headers_queue_[set*slices_+slice].message_count() != frames_per_reconstruction ){
+        m4->release();
+        GADGET_DEBUG2("Unexpected size of image header queue: %d, %d\n",
+                      static_cast<int>(image_headers_queue_[set*slices_+slice].message_count()), static_cast<int>(frames_per_reconstruction));
+        return GADGET_FAIL;
+      }
+
+      m4->getObjectPtr()->image_headers_ =
+        boost::shared_array<ISMRMRD::ImageHeader>( new ISMRMRD::ImageHeader[frames_per_reconstruction] );
+
+      for( unsigned int i=0; i<frames_per_reconstruction; i++ ){
+
+        ACE_Message_Block *mbq;
+
+        if( image_headers_queue_[set*slices_+slice].dequeue_head(mbq) < 0 ) {
+          m4->release();
+          GADGET_DEBUG1("Image header dequeue failed\n");
+          return GADGET_FAIL;
+        }
+
+        GadgetContainerMessage<ISMRMRD::ImageHeader> *m = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+        m4->getObjectPtr()->image_headers_[i] = *m->getObjectPtr();
+
+        // In sliding window mode the header might need to go back at the end of the queue for reuse
+        //
+
+        if( i >= frames_per_reconstruction-sliding_window_rotations_*frames_per_rotation_[set*slices_+slice] ){
+          image_headers_queue_[set*slices_+slice].enqueue_tail(m);
+        }
+        else {
+          m->release();
+        }
+      }
+
+      // The Sense Job needs an image header as well.
+      // Let us just copy the initial one...
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m3 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+      *m3->getObjectPtr() = m4->getObjectPtr()->image_headers_[0];
+      m3->cont(m4);
+
+      //GADGET_DEBUG1("Putting job on queue\n");
+
+      if (this->next()->putq(m3) < 0) {
+        GADGET_DEBUG1("Failed to put job on queue.\n");
+        m3->release();
+        return GADGET_FAIL;
+      }
+    }
+
+    if( is_last_profile_in_frame )
+      profiles_counter_frame_[set*slices_+slice] = 0;
+    else{
+      profiles_counter_frame_[set*slices_+slice]++;
+    }
+
+    if( new_frame_detected ){
+
+      // This is the first profile of the next frame, enqueue.
+      // We have encountered deadlocks if the same profile is enqueued twice in different queues. Hence the copy.
+
+      frame_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+      recon_profiles_queue_[set*slices_+slice].enqueue_tail(duplicate_profile(m2));
+
+      profiles_counter_frame_[set*slices_+slice]++;
+    }
+
+    profiles_counter_global_[set*slices_+slice]++;
+
+    if( output_timing_ )
+      process_timer.reset();
+
+    m1->release(); // the internal queues hold copies
+    return GADGET_OK;
+  }
+  
+  int 
+  gpuRadialSensePrepGadget::calculate_trajectory_for_reconstruction(long profile_offset, unsigned int set, unsigned int slice)
+  {
+    // Recomputes host_traj_recon_ for the given (set,slice) according to mode_:
+    // - modes 0 and 1 use the fixed angle trajectory generator
+    // - modes 2 and 3 use the golden ratio trajectory generator
+    // Any other mode is reported and rejected with GADGET_FAIL.
+    // Returns GADGET_OK once the trajectory has been stored.
+
+    const bool fixed_angle = ( mode_ == 0 || mode_ == 1 );
+    const bool golden_ratio = ( mode_ == 2 || mode_ == 3 );
+
+    if( !fixed_angle && !golden_ratio ){
+      GADGET_DEBUG1("Illegal trajectory mode\n");
+      return GADGET_FAIL;
+    }
+
+    // Current acquisition layout for this (set,slice)
+    const long profiles = profiles_per_frame_[set*slices_+slice];
+    const long frames = frames_per_rotation_[set*slices_+slice];
+    const bool frame_by_frame = ( rotations_per_reconstruction_ == 0 );
+
+    if( fixed_angle ){
+
+      if( frame_by_frame ){
+
+        // Offset the frame's angles by its position within the current rotation
+        const long local_frame = (profile_offset/profiles)%frames;
+        const float angular_offset = M_PI/float(profiles)*float(local_frame)/float(frames);
+
+        host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_fixed_angle_2d<float>
+          ( samples_per_profile_, profiles, 1, angular_offset )->to_host();
+      }
+      else{
+        host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_fixed_angle_2d<float>
+          ( samples_per_profile_, profiles, frames )->to_host();
+      }
+      return GADGET_OK;
+    }
+
+    // Golden ratio modes: the generator is given the index of the first profile in the reconstruction
+
+    const long frames_in_recon = frame_by_frame ? 1L : frames*rotations_per_reconstruction_;
+    const long profiles_in_recon = frame_by_frame ? profiles : profiles*frames*rotations_per_reconstruction_;
+    const unsigned int first_profile_in_reconstruction = std::max(0L, profile_offset-profiles_in_recon+1);
+
+    host_traj_recon_[set*slices_+slice] = *compute_radial_trajectory_golden_ratio_2d<float>
+      ( samples_per_profile_, profiles, frames_in_recon, first_profile_in_reconstruction,
+        (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();
+
+    return GADGET_OK;
+  }
+
+  int
+  gpuRadialSensePrepGadget::calculate_density_compensation_for_reconstruction( unsigned int set, unsigned int slice)
+  {
+    // Recomputes host_weights_recon_ for the given (set,slice) using the
+    // generator matching mode_ (0/1: fixed angle, 2/3: golden ratio).
+    // Returns GADGET_FAIL for any other mode and GADGET_OK otherwise.
+
+    const bool fixed_angle = ( mode_ == 0 || mode_ == 1 );
+    const bool golden_ratio = ( mode_ == 2 || mode_ == 3 );
+    if( !fixed_angle && !golden_ratio ){
+      GADGET_DEBUG1("Illegal dcw mode\n");
+      return GADGET_FAIL;
+    }
+
+    // Reconstruction matrix size relative to the number of samples per profile
+    const float scale = 1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0]));
+    const long profiles = profiles_per_frame_[set*slices_+slice];
+
+    if( fixed_angle ){
+      host_weights_recon_[set*slices_+slice] = *compute_radial_dcw_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles, oversampling_factor_, scale )->to_host();
+    }
+    else{
+      host_weights_recon_[set*slices_+slice] = *compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles, oversampling_factor_, scale, 0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST )->to_host();
+    }
+    return GADGET_OK;
+  }
+  
+  // Compute the device trajectory of a single (usually undersampled) frame to be added
+  // to the accumulation buffer for the given set/slice. 'profile_offset' selects the
+  // frame's position in the rotation cycle (modes 0/1) or the index of its first
+  // golden ratio profile (modes 2/3).
+  // Returns an empty pointer when mode_ is not one of 0-3.
+  boost::shared_ptr< cuNDArray<floatd2> >
+  gpuRadialSensePrepGadget::calculate_trajectory_for_frame(long profile_offset, unsigned int set, unsigned int slice)
+  {
+    const unsigned int idx = set*slices_+slice; // flat index of this set/slice pair
+    boost::shared_ptr< cuNDArray<floatd2> > trajectory; // stays empty for an illegal mode
+
+    if( mode_ == 0 || mode_ == 1 ){
+
+      // The frame's position within the rotation cycle determines its angular offset
+      const long frame_in_rotation = (profile_offset/profiles_per_frame_[idx])%frames_per_rotation_[idx];
+      const float rotation_angle =
+        M_PI/float(profiles_per_frame_[idx])*float(frame_in_rotation)/float(frames_per_rotation_[idx]);
+
+      trajectory = compute_radial_trajectory_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[idx], 1, rotation_angle );
+    }
+    else if( mode_ == 2 || mode_ == 3 ){
+
+      // Index of the frame's first profile, clamped at zero
+      const unsigned int first_profile = std::max(0L, profile_offset-profiles_per_frame_[idx]+1);
+
+      // Golden ratio radial trajectory (mode 2: GR_ORIGINAL, mode 3: GR_SMALLEST)
+      trajectory = compute_radial_trajectory_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[idx], 1, first_profile,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+    }
+    else{
+      GADGET_DEBUG1("Illegal trajectory mode\n");
+    }
+
+    return trajectory;
+  }
+
+  // Compute the density compensation weights (dcw) matching a single buffer frame
+  // of the given set/slice. Returns an empty pointer when mode_ is not one of 0-3.
+  boost::shared_ptr< cuNDArray<float> >
+  gpuRadialSensePrepGadget::calculate_density_compensation_for_frame(unsigned int set, unsigned int slice)
+  {
+    const unsigned int idx = set*slices_+slice; // flat index of this set/slice pair
+
+    if( mode_ == 0 || mode_ == 1 ){
+
+      // Fixed angle radial trajectory
+      return compute_radial_dcw_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[idx], oversampling_factor_,
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])) );
+    }
+
+    if( mode_ == 2 || mode_ == 3 ){
+
+      // Golden ratio radial trajectory (mode 2: GR_ORIGINAL, mode 3: GR_SMALLEST)
+      return compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[idx], oversampling_factor_,
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])),0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+    }
+
+    GADGET_DEBUG1("Illegal dcw mode\n");
+    return boost::shared_ptr< cuNDArray<float> >();
+  }
+
+
+  // Compute the device trajectory covering the entire accumulation buffer of the
+  // given set/slice. In modes 0/1 it holds the profiles of one buffer rotation and
+  // 'profile_offset' is not used; in modes 2/3 it holds the profiles of
+  // buffer_length_in_rotations_ rotations, the first one derived from 'profile_offset'.
+  // Returns an empty pointer when mode_ is not one of 0-3.
+  boost::shared_ptr< cuNDArray<floatd2> >
+  gpuRadialSensePrepGadget::calculate_trajectory_for_rhs(long profile_offset, unsigned int set, unsigned int slice)
+  {
+    const unsigned int idx = set*slices_+slice; // flat index of this set/slice pair
+
+    if( mode_ == 0 || mode_ == 1 ){
+
+      // Fixed angle radial trajectory
+      return compute_radial_trajectory_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_per_frame_[idx]*buffer_frames_per_rotation_[idx], 1 );
+    }
+
+    if( mode_ == 2 || mode_ == 3 ){
+
+      // Number of profiles held by the buffer
+      const long profiles_in_buffer =
+        profiles_per_frame_[idx]*buffer_frames_per_rotation_[idx]*buffer_length_in_rotations_;
+
+      // Index of the first profile in the buffer, clamped at zero
+      const unsigned int first_profile = std::max(0L, profile_offset-profiles_in_buffer+1);
+
+      // Golden ratio radial trajectory (mode 2: GR_ORIGINAL, mode 3: GR_SMALLEST)
+      return compute_radial_trajectory_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_in_buffer, 1, first_profile,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+    }
+
+    // Unknown mode: report it and return an empty pointer
+    GADGET_DEBUG1("Illegal trajectory mode\n");
+    return boost::shared_ptr< cuNDArray<floatd2> >();
+  }
+  
+  // Compute the density compensation weights (dcw) matching the trajectory of the
+  // entire accumulation buffer (see calculate_trajectory_for_rhs): the profiles of
+  // one buffer rotation in modes 0/1, and of buffer_length_in_rotations_ rotations
+  // in modes 2/3. Returns an empty pointer when mode_ is not one of 0-3.
+  boost::shared_ptr< cuNDArray<float> >
+  gpuRadialSensePrepGadget::calculate_density_compensation_for_rhs(unsigned int set, unsigned int slice)
+  {
+    const unsigned int idx = set*slices_+slice; // flat index of this set/slice pair
+
+    if( mode_ == 0 || mode_ == 1 ){
+
+      // Fixed angle radial trajectory:
+      // weights are computed for the profiles of one buffer rotation
+      const unsigned int profiles_in_buffer =
+        profiles_per_frame_[idx]*buffer_frames_per_rotation_[idx];
+
+      return compute_radial_dcw_fixed_angle_2d<float>
+        ( samples_per_profile_, profiles_in_buffer, oversampling_factor_,
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])) );
+    }
+
+    if( mode_ == 2 || mode_ == 3 ){
+
+      // Golden ratio radial trajectory (mode 2: GR_ORIGINAL, mode 3: GR_SMALLEST):
+      // weights are computed for the profiles of all buffer rotations
+      const unsigned int profiles_in_buffer =
+        profiles_per_frame_[idx]*buffer_frames_per_rotation_[idx]*buffer_length_in_rotations_;
+
+      return compute_radial_dcw_golden_ratio_2d<float>
+        ( samples_per_profile_, profiles_in_buffer, oversampling_factor_,
+          1.0f/(float(samples_per_profile_)/float(image_dimensions_recon_[0])),0,
+          (mode_==2) ? GR_ORIGINAL : GR_SMALLEST );
+    }
+
+    // Unknown mode: report it and return an empty pointer
+    GADGET_DEBUG1("Illegal dcw mode\n");
+    return boost::shared_ptr< cuNDArray<float> >();
+  }
+
+  // Copy every profile currently buffered in 'queue' into one host array of dimensions
+  // [samples_per_profile_*#profiles, #coils], emptying the queue on the way. When
+  // 'sliding_window' is set, the trailing profiles that make up the sliding window are
+  // put back on the queue instead of being released. Returns an empty pointer if a
+  // message cannot be dequeued or interpreted.
+  boost::shared_ptr< hoNDArray<float_complext> > gpuRadialSensePrepGadget::
+  extract_samples_from_queue( ACE_Message_Queue<ACE_MT_SYNCH> *queue, bool sliding_window,
+                              unsigned int set, unsigned int slice )
+  {
+    const unsigned int idx = set*slices_+slice; // flat index of this set/slice pair
+    const unsigned int profiles_buffered = queue->message_count();
+
+    // Number of trailing profiles to keep on the queue in sliding window mode
+    const long profiles_in_sliding_window = sliding_window_profiles_ +
+      profiles_per_frame_[idx]*frames_per_rotation_[idx]*sliding_window_rotations_;
+
+    std::vector<size_t> dims;
+    dims.push_back(samples_per_profile_*profiles_buffered);
+    dims.push_back(num_coils_[idx]);
+
+    boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(&dims));
+
+    for (unsigned int p=0; p<profiles_buffered; p++) {
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+        GADGET_DEBUG1("Message dequeue failed\n");
+        return boost::shared_ptr< hoNDArray<float_complext> >();
+      }
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+
+      if (!daq) {
+        GADGET_DEBUG1("Unable to interpret data on message queue\n");
+        mbq->release(); // already dequeued, so it must be released here to avoid a leak
+        return boost::shared_ptr< hoNDArray<float_complext> >();
+      }
+
+      for (unsigned int c = 0; c < num_coils_[idx]; c++) {
+        float_complext *data_ptr = host_samples->get_data_ptr() +
+          (c*samples_per_profile_*profiles_buffered+p*samples_per_profile_);
+        std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr() +
+          c*daq->getObjectPtr()->get_size(0);
+        memcpy(data_ptr,r_ptr,samples_per_profile_*sizeof(float_complext));
+      }
+
+      // Signed arithmetic: a window larger than the buffer requeues every profile
+      if( sliding_window && long(p)+profiles_in_sliding_window >= long(profiles_buffered) )
+        queue->enqueue_tail(mbq);
+      else
+        mbq->release();
+    }
+
+    return host_samples;
+  }
+  
+  // Allocate a new container message and copy-assign the array held by 'profile' into it.
+  // The returned message is created with 'new' inside this function.
+  GadgetContainerMessage< hoNDArray< std::complex<float> > >*
+  gpuRadialSensePrepGadget::duplicate_profile( GadgetContainerMessage< hoNDArray< std::complex<float> > > *profile )
+  {
+    typedef GadgetContainerMessage< hoNDArray< std::complex<float> > > ProfileMessage;
+    ProfileMessage *duplicate = new ProfileMessage();
+    *duplicate->getObjectPtr() = *profile->getObjectPtr();
+    return duplicate;
+  }
+
+  // Recompute everything that depends on the trajectory settings of one set/slice: the
+  // reconstruction trajectory/dcw and the accumulation buffer. Clears reconfigure_ when done.
+  void gpuRadialSensePrepGadget::reconfigure(unsigned int set, unsigned int slice)
+  {
+    const unsigned int idx = set*slices_+slice; // flat index of this set/slice pair
+    GADGET_DEBUG2("\nReconfiguring:\n#profiles/frame:%d\n#frames/rotation: %d\n#rotations/reconstruction:%d\n",
+                  profiles_per_frame_[idx], frames_per_rotation_[idx], rotations_per_reconstruction_);
+
+    calculate_trajectory_for_reconstruction(0, set, slice);
+    calculate_density_compensation_for_reconstruction(set, slice);
+
+    // A configured value of 0 requests a default that depends on the trajectory mode
+    buffer_frames_per_rotation_[idx] = get_int_value("buffer_frames_per_rotation");
+    if( buffer_frames_per_rotation_[idx] == 0 ){
+      if( mode_ == 2 || mode_ == 3 )
+        buffer_frames_per_rotation_[idx] = image_dimensions_recon_os_[0]/profiles_per_frame_[idx];
+      else
+        buffer_frames_per_rotation_[idx] = frames_per_rotation_[idx];
+    }
+
+    cuSenseBuffer<float,2> *acc_buffer = (buffer_using_solver_) ? &acc_buffer_cg_[idx] : &acc_buffer_[idx];
+    acc_buffer->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_,
+                       kernel_width_, num_coils_[idx],
+                       buffer_length_in_rotations_, buffer_frames_per_rotation_[idx] );
+    boost::shared_ptr< cuNDArray<float> > frame_dcw = calculate_density_compensation_for_frame(set, slice);
+    acc_buffer->set_dcw(frame_dcw);
+
+    if( buffer_using_solver_ ){
+      cuSenseBufferCg<float,2> *cg_buffer = (cuSenseBufferCg<float,2>*) acc_buffer;
+      cg_buffer->set_dcw_for_rhs(calculate_density_compensation_for_rhs(set, slice));
+      cg_buffer->preprocess(calculate_trajectory_for_rhs(0, set, slice).get());
+    }
+    reconfigure_[idx] = false;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuRadialSensePrepGadget)
+}
diff --git a/gadgets/radial/gpuRadialSensePrepGadget.h b/gadgets/radial/gpuRadialSensePrepGadget.h
new file mode 100644
index 0000000..7f67b69
--- /dev/null
+++ b/gadgets/radial/gpuRadialSensePrepGadget.h
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "gadgetron_radial_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+#include "cuCgPreconditioner.h"
+#include "cuSenseBufferCg.h"
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+/*
+  ------------------------------------------
+  Trajectory modes for radial reconstruction
+  ------------------------------------------
+  
+  Mode 0 and Mode 1 are variants of 'fixed' radial trajectories with interframe rotation.
+  Mode 2 and Mode 3 denote radial trajectories with golden ratio based angular profile spacings.
+  
+  Let 
+  'i' denote the number of profiles per (undersampled) frame
+  'j' denote the number of frames per trajectory rotation (to obtain a fully sampled acquisition)
+  'h' denote a variable of type ISMRMRD::AcquisitionHeader
+
+  It is possible to explicitly set 'i' and 'j' in the Gadgetron configuration file.
+  For some modes this is (partly) required, 
+  for others they will be automatically determined from the incoming profile headers.
+  
+  Mode 0:
+  -------
+  For each rotation cycle profiles are numbered using the scheme
+
+    0+0*j,0+1*j,0+2*j,...,0+(i-1)*j, (1st frame)
+    1+0*j,1+1*j,1+2*j,...,1+(i-1)*j, (2nd frame)
+    2+0*j,2+1*j,2+2*j,...,2+(i-1)*j, (3rd frame)
+    ...,
+    (j-1)+0*j,(j-1)+1*j,(j-1)+2*j,...,(j-1)+(i-1)*j
+
+  as given in h.idx.kspace_encode_step_1.
+  Both 'i' and 'j' are automatically derived and thus need not be explicitly specified in a configuration file.
+  For mode 0 both 'i' and 'j' can be changed dynamically as desired e.g. for real-time imaging.
+
+  Mode 1:
+  -------
+  Profiles are numbered 0,1,2,...,i-1, 0,1,2,...,i-1, ... as given in h.idx.kspace_encode_step_1.
+  'j' is estimated as 'matrix_size'/'i' and should be explicitly set in the configuration file if this is not the case, e.g.:
+  <property><name>frames_per_rotation</name><value>8</value></property>
+      
+
+  Mode 2 and Mode 3:
+  -------
+  Profiles are numbered 
+  0,1,2,...,i-1, 0,1,2,...,i-1, 0,1,2,...,i-1, ...
+  or
+  0,1,2,...,i-1, i,i+1,i+2,...,2*i-1, 2*i,2*i+1,2*i+2,...,3*i-1, ...
+  as given in h.idx.kspace_encode_step_1.
+  'i' should be explicitly specified in the Gadgetron configuration file, e.g.:
+  <property><name>profiles_per_frame</name><value>32</value></property>
+  If not it defaults to i=32.
+  'j' is explicitly set to '1' even if specified in the configuration file.
+*/
+
+namespace Gadgetron{
+  // Gadget consuming ISMRMRD acquisition headers paired with complex-valued sample arrays.
+  class EXPORTGADGETS_RADIAL gpuRadialSensePrepGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+    GADGET_DECLARE(gpuRadialSensePrepGadget);
+
+    gpuRadialSensePrepGadget();
+    virtual ~gpuRadialSensePrepGadget();
+
+  protected:
+    
+    virtual int process_config(ACE_Message_Block *mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2);
+
+  private:
+    // Exact element-wise comparison of two 3-component float vectors.
+    inline bool vec_equal(float *in1, float *in2) {
+      for (unsigned int i = 0; i < 3; i++) {
+	if (in1[i] != in2[i]) return false;
+      }
+      return true;
+    }
+    
+    boost::shared_array<bool> reconfigure_; // per set/slice flag; reset to false by reconfigure()
+    virtual void reconfigure(unsigned int set, unsigned int slice);
+
+    GadgetContainerMessage< hoNDArray< std::complex<float> > >*
+      duplicate_profile( GadgetContainerMessage< hoNDArray< std::complex<float> > > *profile );
+
+    boost::shared_ptr< hoNDArray<float_complext> > 
+      extract_samples_from_queue( ACE_Message_Queue<ACE_MT_SYNCH> *queue,
+				  bool acknowledge_sliding_window,
+				  unsigned int set, unsigned int slice );
+
+    // Compute trajectory/dcw for a reconstruction (to store internally)
+    //
+
+    int calculate_trajectory_for_reconstruction(long profile_offset, unsigned int set, unsigned int slice);
+    int calculate_density_compensation_for_reconstruction(unsigned int set, unsigned int slice);
+
+    // Compute trajectory/dcw for adding (usually undersampled) frames to the accumulation buffer
+    //
+
+    boost::shared_ptr< cuNDArray<floatd2> > 
+      calculate_trajectory_for_frame(long profile_offset, unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<float> >
+      calculate_density_compensation_for_frame(unsigned int set, unsigned int slice);
+
+    // Compute trajectory/dcw for the fully sampled accumulation buffer (iterative buffer mode only)
+    //
+
+    boost::shared_ptr< cuNDArray<floatd2> > 
+      calculate_trajectory_for_rhs(long profile_offset, unsigned int set, unsigned int slice);
+
+    boost::shared_ptr< cuNDArray<float> > 
+      calculate_density_compensation_for_rhs(unsigned int set, unsigned int slice);
+
+    int slices_; // the per set/slice arrays below are indexed by set*slices_+slice
+    int sets_;
+    int device_number_;
+    int mode_; // Trajectory mode (0-3), see the note above
+    long samples_per_profile_;
+
+    boost::shared_array<long> image_counter_;
+    boost::shared_array<long> profiles_per_frame_;  // for an undersampled frame
+    boost::shared_array<long> frames_per_rotation_; // representing a fully sampled frame
+
+    // The number of rotations to batch per reconstruction. 
+    // Set to '0' to reconstruct frames individually.
+    long rotations_per_reconstruction_; 
+
+    // The number of buffer cycles
+    long buffer_length_in_rotations_; 
+
+    boost::shared_array<long> buffer_frames_per_rotation_; // the number of buffer subcycles
+
+    // Internal book-keeping
+    boost::shared_array<long> previous_profile_;
+    boost::shared_array<long> profiles_counter_frame_;
+    boost::shared_array<long> profiles_counter_global_;
+    // The sliding window spans sliding_window_profiles_ profiles plus sliding_window_rotations_ full rotations
+    long sliding_window_profiles_;
+    long sliding_window_rotations_;
+
+    float kernel_width_;
+    float oversampling_factor_;
+
+    boost::shared_array<unsigned int> num_coils_;
+
+    boost::shared_array<float[3]> position_;
+    boost::shared_array<float[3]> read_dir_;
+    boost::shared_array<float[3]> phase_dir_;
+    boost::shared_array<float[3]> slice_dir_;
+
+    bool output_timing_;
+    bool buffer_using_solver_;
+
+    boost::shared_array<bool> buffer_update_needed_;
+
+    boost::shared_array< hoNDArray<floatd2> > host_traj_recon_;
+    boost::shared_array< hoNDArray<float> > host_weights_recon_;
+    
+    boost::shared_array< hoNDArray<float_complext> > csm_host_;
+    boost::shared_array< hoNDArray<float_complext> > reg_host_;
+    
+    boost::shared_array< cuSenseBuffer<float,2> > acc_buffer_;
+    boost::shared_array< cuSenseBufferCg<float,2> > acc_buffer_cg_;
+
+    std::vector<size_t> fov_;
+    std::vector<size_t> image_dimensions_;
+    std::vector<size_t> image_dimensions_recon_;
+    uint64d2 image_dimensions_recon_os_;
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > frame_profiles_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_profiles_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > image_headers_queue_;
+  };
+}
diff --git a/gadgets/sense/CMakeLists.txt b/gadgets/sense/CMakeLists.txt
new file mode 100644
index 0000000..3728dad
--- /dev/null
+++ b/gadgets/sense/CMakeLists.txt
@@ -0,0 +1,43 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUSENSE__)
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+
+include_directories(   
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/sense
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${HDF5_INCLUDE_DIR}
+  ${HDF5_INCLUDE_DIR}/cpp
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+)
+
+add_library(gadgetron_gpusense SHARED 
+  gpuCgSenseGadget.cpp 
+  gpuCgKtSenseGadget.cpp 
+  gpuSbSenseGadget.cpp 
+  gpuGenericSensePrepGadget.cpp
+  ${ISMRMRD_XSD_SOURCE}
+  )
+
+target_link_libraries(gadgetron_gpusense 
+  cpucore gpucore gpusolvers gpuoperators gpuparallelmri 
+  ${Boost_LIBRARIES} ${ISMRMRD_LIBRARIES} ${XERCESC_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES} 
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+install (TARGETS gadgetron_gpusense DESTINATION lib)
+
+install (FILES 
+  SenseJob.h
+  DESTINATION include)
+
+add_subdirectory(config)
diff --git a/gadgets/sense/SenseJob.h b/gadgets/sense/SenseJob.h
new file mode 100644
index 0000000..169a6ec
--- /dev/null
+++ b/gadgets/sense/SenseJob.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "vector_td.h"
+
+#include <ismrmrd.h>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+  // Bundle of image headers and shared pointers to host-side (hoNDArray) arrays.
+  class SenseJob
+  {
+  public:
+    
+    SenseJob() {}
+    ~SenseJob() {}
+
+    boost::shared_array<ISMRMRD::ImageHeader> image_headers_;
+    // NOTE(review): names suggest samples (dat), trajectory (tra), density compensation weights (dcw); csm/reg are not explained in this header - confirm
+    boost::shared_ptr< hoNDArray<float_complext> >  dat_host_;
+    boost::shared_ptr< hoNDArray<floatd2>        >  tra_host_;
+    boost::shared_ptr< hoNDArray<float>          >  dcw_host_;
+    boost::shared_ptr< hoNDArray<float_complext> >  csm_host_;
+    boost::shared_ptr< hoNDArray<float_complext> >  reg_host_;
+  };
+}
diff --git a/gadgets/sense/config/CMakeLists.txt b/gadgets/sense/config/CMakeLists.txt
new file mode 100644
index 0000000..fec681a
--- /dev/null
+++ b/gadgets/sense/config/CMakeLists.txt
@@ -0,0 +1,15 @@
+if (ARMADILLO_FOUND) # the generic trajectory configurations are only installed when Armadillo was found
+  install (FILES 
+    generic_gpusense_cg.xml 
+    generic_gpusense_cg_singleshot.xml 
+    generic_gpusense_sb_singleshot.xml 
+    generic_gpu_ktsense_singleshot.xml 
+    DESTINATION config)
+else (ARMADILLO_FOUND) # Armadillo missing: nothing is installed here, so tell the user why
+  MESSAGE("Armadillo not found, only unoptimized generic trajectory config files will be available")
+endif (ARMADILLO_FOUND)
+
+#install (FILES 
+#  generic_gpusense_cg_unoptimized.xml 
+#  generic_gpusense_sb_unoptimized.xml 
+#  DESTINATION config)
diff --git a/gadgets/sense/config/generic_gpu_ktsense_singleshot.xml b/gadgets/sense/config/generic_gpu_ktsense_singleshot.xml
new file mode 100644
index 0000000..f3fe738
--- /dev/null
+++ b/gadgets/sense/config/generic_gpu_ktsense_singleshot.xml
@@ -0,0 +1,115 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgKtSenseGadget_slice0</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgKtSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name><value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>50</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name>      <value>true</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/sense/config/generic_gpusense_cg.xml b/gadgets/sense/config/generic_gpusense_cg.xml
new file mode 100644
index 0000000..f2e2975
--- /dev/null
+++ b/gadgets/sense/config/generic_gpusense_cg.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>rotations_per_reconstruction</name><value>4</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>30</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/sense/config/generic_gpusense_cg_singleshot.xml b/gadgets/sense/config/generic_gpusense_cg_singleshot.xml
new file mode 100644
index 0000000..acd38e2
--- /dev/null
+++ b/gadgets/sense/config/generic_gpusense_cg_singleshot.xml
@@ -0,0 +1,115 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>30</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/sense/config/generic_gpusense_sb_singleshot.xml b/gadgets/sense/config/generic_gpusense_sb_singleshot.xml
new file mode 100644
index 0000000..5dbae87
--- /dev/null
+++ b/gadgets/sense/config/generic_gpusense_sb_singleshot.xml
@@ -0,0 +1,119 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <!-- Reader bound to slot 1008 (class GadgetIsmrmrdAcquisitionMessageReader). -->
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  <!-- Image writers on slots 1004-1006 (CPLX, FLOAT and USHORT writer classes). -->
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+  <!-- Gadget entries follow. NOTE(review): presumably they are chained in the order listed; confirm. -->
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+      <property><name>deviceno</name><value>0</value></property>
+      <property><name>readouts_per_frame</name><value>1</value></property>
+      <property><name>frames_per_rotation</name><value>1</value></property>
+      <property><name>rotations_per_reconstruction</name><value>50</value></property>
+      <property><name>buffer_using_solver</name><value>true</value></property>
+      <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+      <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+      <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+      <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  <!-- NOTE(review): the next gadget block is indented two columns deeper than its siblings; cosmetic only. -->
+    <gadget>
+      <name>gpuSbSenseGadget_slice0</name>
+      <dll>gadgetron_gpusense</dll>
+      <classname>gpuSbSenseGadget</classname>
+      <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+      <property><name>deviceno</name>                <value>0</value></property>
+      <property><name>number_of_sb_iterations</name> <value>20</value></property>
+      <property><name>number_of_cg_iterations</name> <value>10</value></property>
+      <property><name>cg_limit</name>                <value>1e-6</value></property>
+      <property><name>oversampling_factor</name>     <value>1.25</value></property>
+      <property><name>kernel_width</name>            <value>5.5</value></property>
+      <property><name>mu</name>                      <value>0.1</value></property>
+      <property><name>lambda</name>                  <value>0.2</value></property>
+      <property><name>alpha</name>                   <value>0.5</value></property>
+      <property><name>output_convergence</name><value>true</value></property>
+    </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/sense/gadgetron_gpusense_export.h b/gadgets/sense/gadgetron_gpusense_export.h
new file mode 100644
index 0000000..7957ab5
--- /dev/null
+++ b/gadgets/sense/gadgetron_gpusense_export.h
@@ -0,0 +1,14 @@
+#ifndef GADGETRON_GPUSENSE_EXPORT_H_
+#define GADGETRON_GPUSENSE_EXPORT_H_
+// EXPORTGADGETS_GPUSENSE: on WIN32 dllexport when __BUILD_GADGETRON_GPUSENSE__ is defined, dllimport otherwise; empty elsewhere.
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUSENSE__)
+#define EXPORTGADGETS_GPUSENSE __declspec(dllexport)
+#else /* !__BUILD_GADGETRON_GPUSENSE__ */
+#define EXPORTGADGETS_GPUSENSE __declspec(dllimport)
+#endif /* __BUILD_GADGETRON_GPUSENSE__ */
+#else /* !WIN32 */
+#define EXPORTGADGETS_GPUSENSE
+#endif /* WIN32 */
+
+#endif /* GADGETRON_GPUSENSE_EXPORT_H_ */
diff --git a/gadgets/sense/gpuCgKtSenseGadget.cpp b/gadgets/sense/gpuCgKtSenseGadget.cpp
new file mode 100644
index 0000000..78c7d58
--- /dev/null
+++ b/gadgets/sense/gpuCgKtSenseGadget.cpp
@@ -0,0 +1,370 @@
+#include "gpuCgKtSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDFFT.h"
+#include "Gadgetron.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "vector_td_utilities.h"
+
+//#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+  gpuCgKtSenseGadget::gpuCgKtSenseGadget()
+    : is_configured_(false), channels_(0), frame_counter_(0)
+  {
+    // Seed the property table with defaults; process_config() reads the values back.
+    set_parameter("deviceno", "0");
+    set_parameter("setno", "0");
+    set_parameter("sliceno", "0");
+    set_parameter("number_of_iterations", "5");
+    set_parameter("cg_limit", "1e-6");
+    set_parameter("oversampling_factor", "1.25");
+    set_parameter("kernel_width", "5.5");
+    set_parameter("kappa", "0.3");
+    const uint64d2 unset_size(0,0); // placeholder; the real sizes are assigned later
+    matrix_size_ = unset_size;
+    matrix_size_os_ = unset_size;
+    matrix_size_seq_ = unset_size;
+  }
+
+  // Nothing to release by hand.
+  gpuCgKtSenseGadget::~gpuCgKtSenseGadget() {}
+
+  int gpuCgKtSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    // Selects the CUDA device, reads the gadget properties and, on the first call, builds the
+    // CG solver. mb carries the ISMRMRD XML header. Returns GADGET_OK or GADGET_FAIL.
+    device_number_ = get_int_value("deviceno");
+
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GADGET_DEBUG1( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) { // wrap an out-of-range request onto an existing device
+      GADGET_DEBUG2("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    pass_on_undesired_data_ = get_bool_value("pass_on_undesired_data");
+    set_number_ = get_int_value("setno");
+    slice_number_ = get_int_value("sliceno");
+    number_of_iterations_ = get_int_value("number_of_iterations");
+    cg_limit_ = get_double_value("cg_limit");
+    oversampling_factor_ = get_double_value("oversampling_factor");
+    kernel_width_ = get_double_value("kernel_width");
+    kappa_ = get_double_value("kappa");
+    shutter_radius_ = get_double_value("training_data_shutter_radius");
+    rotations_to_discard_ = get_int_value("rotations_to_discard");
+    output_convergence_ = get_bool_value("output_convergence");
+
+    if( (rotations_to_discard_%2) == 1 ){ // process() drops half of this count at each end
+      GADGET_DEBUG1("#rotations to discard must be even.\n");
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    // The header must describe exactly one encoding space.
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size())); // %d expects an int, not a size_t
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    // Only the reconstruction space of that encoding is needed here.
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    // Its x/y matrix size is the size of the images that process() passes downstream.
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize().x(), r_space.matrixSize().y() );
+
+    if (!is_configured_) {
+      // One-time construction of the operators and the solver.
+      channels_ = (cfg->acquisitionSystemInformation().present() && cfg->acquisitionSystemInformation().get().receiverChannels().present())
+        ? cfg->acquisitionSystemInformation().get().receiverChannels().get() : 1;
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianKtSenseOperator<float,2> >( new cuNonCartesianKtSenseOperator<float,2>() );
+
+      // Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      // Allocate regularization image operator
+      R_ = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+      R_->set_weight( kappa_ );
+
+      // Setup solver
+      cg_.set_encoding_operator( E_ );        // encoding matrix
+      cg_.add_regularization_operator( R_ );  // regularization matrix
+      cg_.set_preconditioner( D_ );           // preconditioning matrix
+      cg_.set_max_iterations( number_of_iterations_ );
+      cg_.set_tc_tolerance( cg_limit_ );
+      cg_.set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT );
+
+      is_configured_ = true;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuCgKtSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<SenseJob> *m2)
+  {
+    // Reconstructs the Sense job in m2 and queues one complex image per retained frame.
+    // Jobs whose header (m1) belongs to another set/slice are forwarded untouched.
+    // Returns GADGET_OK, GADGET_FAIL, or the putq() result for forwarded messages.
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
+      // Not for this gadget: pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GADGET_DEBUG1("gpuCgKtSenseGadget::process\n");
+    //GPUTimer timer("gpuCgKtSenseGadget::process");
+
+    if (!is_configured_) {
+      GADGET_DEBUG1("Data received before configuration was completed\n");
+      return GADGET_FAIL;
+    }
+
+    SenseJob* j = m2->getObjectPtr();
+
+    // Basic validation: every host array dereferenced below must be present (reg_host_ fixes the matrix size).
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
+      GADGET_DEBUG1("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    const size_t num_coords = j->tra_host_->get_number_of_elements();
+    // Reject empty input before dividing: neither num_coords nor the rotation count may be zero below.
+    if( samples == 0 || num_coords == 0 || samples%num_coords ) {
+      GADGET_DEBUG2("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n", 
+                    samples, static_cast<unsigned int>(num_coords));
+      return GADGET_FAIL;
+    }
+    unsigned int rotations = samples / num_coords;
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+
+    unsigned int warp_size = deviceProp.warpSize;
+
+    matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+
+    matrix_size_os_ = // oversampled size, rounded up to the next multiple of the warp size
+      uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+               ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+    GADGET_DEBUG2("Matrix size    : [%d,%d] \n", static_cast<int>(matrix_size_[0]), static_cast<int>(matrix_size_[1]));
+    GADGET_DEBUG2("Matrix size OS : [%d,%d] \n", static_cast<int>(matrix_size_os_[0]), static_cast<int>(matrix_size_os_[1]));
+
+    std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+    image_dims.push_back(frames);
+
+    E_->set_domain_dimensions(&image_dims);
+    E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);
+
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+
+    R_->compute(compute_regularization_image(j).get());
+
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<float> > csm_energy = sum(abs_square(csm.get()).get(), 2);
+    boost::shared_ptr< cuNDArray<float> > weights_real = expand<float>( csm_energy.get(), frames );
+    boost::shared_ptr<cuNDArray<float> > R_diag = R_->get();
+    *R_diag *= float(kappa_);
+    *weights_real += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(weights_real.get());
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( weights_real.get() );
+    csm_energy.reset(); weights_real.reset();
+    D_->set_weights( precon_weights );
+
+    // Invoke solver
+    // 
+
+    boost::shared_ptr< cuNDArray<float_complext> > cgresult;
+
+    {
+      GPUTimer timer("gpuCgKtSenseGadget::solve()");
+      cgresult = cg_.solve(device_samples.get());
+    }
+
+    if (!cgresult.get()) {
+      GADGET_DEBUG1("Iterative_sense_compute failed\n");
+      return GADGET_FAIL;
+    }
+
+    // Goto from x-f to x-t space
+    cuNDFFT<float>::instance()->fft( cgresult.get(), 2 );
+
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "recon_%d.real", counter);
+    write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename );
+    counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() );
+
+    // Now pass on the reconstructed images
+    //
+
+    unsigned int frames_per_rotation = frames/rotations;
+
+    if( rotations == 1 ){ // this is the case for golden ratio
+      rotations = frames;
+      frames_per_rotation = 1;
+    }
+
+    for( unsigned int frame=0; frame<frames; frame++ ){
+
+      unsigned int rotation_idx = frame/frames_per_rotation;
+
+      // Check if we should discard this frame (half of rotations_to_discard_ at either end)
+      if( rotation_idx < (rotations_to_discard_>>1) || rotation_idx >= rotations-(rotations_to_discard_>>1) )
+        continue;
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m = 
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm = 
+        new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+      *m->getObjectPtr() = j->image_headers_[frame];
+      m->cont(cm);
+
+      std::vector<size_t> img_dims(2);
+      img_dims[0] = matrix_size_seq_[0];
+      img_dims[1] = matrix_size_seq_[1];
+
+      cm->getObjectPtr()->create(&img_dims);
+
+      size_t data_length = prod(matrix_size_seq_);
+
+      cudaMemcpy(cm->getObjectPtr()->get_data_ptr(),
+                 cgresult->get_data_ptr()+frame*data_length,
+                 data_length*sizeof(std::complex<float>),
+                 cudaMemcpyDeviceToHost);
+
+      cudaError_t err = cudaGetLastError();
+      if( err != cudaSuccess ){
+        GADGET_DEBUG2("Unable to copy result from device to host: %s\n", cudaGetErrorString(err));
+        m->release();
+        return GADGET_FAIL;
+      }
+
+      m->getObjectPtr()->matrix_size[0] = matrix_size_seq_[0];
+      m->getObjectPtr()->matrix_size[1] = matrix_size_seq_[1];
+      m->getObjectPtr()->matrix_size[2] = 1;
+      m->getObjectPtr()->channels       = 1;
+      m->getObjectPtr()->image_index    = frame_counter_ + frame;
+
+      if (this->next()->putq(m) < 0) {
+        GADGET_DEBUG1("Failed to put result image on to queue\n");
+        m->release();
+        return GADGET_FAIL;
+      }
+    }
+
+    frame_counter_ += frames;
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+  boost::shared_ptr< cuNDArray<float_complext> > gpuCgKtSenseGadget::
+  compute_regularization_image( SenseJob *job )
+  {
+    // Builds the regularization image for one job from its own samples (dat_host_) and dcw_host_.
+    // Estimate training data
+    // Uses E_'s plan plus the matrix_size_/matrix_size_os_ members; result dims are [matrix_size_, frames].
+
+    unsigned int num_samples = job->dat_host_->get_size(0);
+    unsigned int num_coils = job->dat_host_->get_size(1);
+    unsigned int num_rotations = num_samples / job->tra_host_->get_number_of_elements();
+    unsigned int frames_per_reconstruction = job->tra_host_->get_size(1)*num_rotations;
+
+    std::vector<size_t> dims = to_std_vector(matrix_size_os_);
+    dims.push_back(frames_per_reconstruction); 
+    dims.push_back(num_coils); 
+
+    cuNDArray<float_complext> image_os(&dims);    
+    cuNDArray<float_complext> data((job->dat_host_).get());
+    cuNDArray<float> dcw((job->dcw_host_).get());
+  
+    // Convolve to Cartesian k-space
+    // (data goes into the oversampled array image_os; dcw is handed to the convolution as well)
+
+    E_->get_plan()->convolve( &data, &image_os, &dcw, cuNFFT_plan<float,2>::NFFT_CONV_NC2C );
+
+    // Apply shutter
+    // (fill_border is called with shutter_radius_; NOTE(review): presumably it clears data outside that radius, confirm)
+
+    if( shutter_radius_ < 0.0001 ){ // If not specified in the configuration then try to make an estimation
+      // The estimate is written to the member, so later calls reuse it instead of re-estimating.
+      // #profiles/frame : this is just an estimate (we dont have the exact value at this stage)
+      unsigned int profiles_per_frame = num_samples / (frames_per_reconstruction*matrix_size_os_[0]);
+      shutter_radius_ = ((float)matrix_size_os_[0]/(float)matrix_size_[0])*(float)profiles_per_frame/(float)M_PI;
+      GADGET_DEBUG2("Estimated training data shutter radius: %f\n", shutter_radius_);
+    }
+
+    fill_border<float_complext,2>( shutter_radius_, &image_os );
+    E_->get_plan()->fft( &image_os, cuNFFT_plan<float,2>::NFFT_BACKWARDS );
+    E_->get_plan()->deapodize( &image_os );
+
+    // Remove oversampling
+    // (crop from matrix_size_os_ to matrix_size_, offset by half the size difference)
+
+    dims = to_std_vector(matrix_size_);
+    dims.push_back(frames_per_reconstruction); 
+    dims.push_back(num_coils);
+    cuNDArray<float_complext> image(&dims);
+    crop<float_complext,2>( (matrix_size_os_-matrix_size_)>>1, &image_os, &image );
+
+    // Compute regularization image
+    // (the num_coils entry is popped from dims, so reg_image has no coil axis)
+
+    dims.pop_back();
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image( new cuNDArray<float_complext>(&dims) );
+
+    E_->mult_csm_conj_sum( &image, reg_image.get() );
+    cuNDFFT<float>::instance()->ifft( reg_image.get(), 2, true ); // inverse FFT over dim index 2, the frames entry of dims
+
+    return reg_image;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuCgKtSenseGadget)
+}
diff --git a/gadgets/sense/gpuCgKtSenseGadget.h b/gadgets/sense/gpuCgKtSenseGadget.h
new file mode 100644
index 0000000..f2e06b6
--- /dev/null
+++ b/gadgets/sense/gpuCgKtSenseGadget.h
@@ -0,0 +1,71 @@
+#ifndef gpuCgKtSenseGadget_H
+#define gpuCgKtSenseGadget_H
+#pragma once
+
+#include "gadgetron_gpusense_export.h"
+#include "Gadget.h"
+#include "SenseJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianKtSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUSENSE gpuCgKtSenseGadget : public Gadget2<ISMRMRD::ImageHeader, SenseJob>
+  {
+    // Gadget taking (ImageHeader, SenseJob) message pairs and reconstructing them with cuCgSolver.
+  public:
+    GADGET_DECLARE(gpuCgKtSenseGadget);
+
+    gpuCgKtSenseGadget();
+    virtual ~gpuCgKtSenseGadget();
+
+  protected:
+    // Per-message reconstruction and configuration from the ISMRMRD XML header.
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1, GadgetContainerMessage< SenseJob > *m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+    // Builds the regularization image for a job from the job's own samples.
+    boost::shared_ptr< cuNDArray<float_complext> > compute_regularization_image( SenseJob *job );
+
+    int channels_;       // receiver channel count from the header (1 when absent)
+    int device_number_;  // "deviceno" property, wrapped to an existing CUDA device
+    int set_number_;     // "setno" property; other sets are forwarded unprocessed
+    int slice_number_;   // "sliceno" property; other slices are forwarded unprocessed
+
+    uint64d2 matrix_size_;      // recon matrix size, read from the job's reg_host_ array
+    uint64d2 matrix_size_os_;   // oversampled size, rounded up to a multiple of the warp size
+    uint64d2 matrix_size_seq_;  // recon-space matrix size from the header; size of the emitted images
+
+    unsigned int number_of_iterations_;  // "number_of_iterations" property
+    double cg_limit_;                    // "cg_limit" property
+    double oversampling_factor_;         // "oversampling_factor" property
+    double kernel_width_;                // "kernel_width" property
+    double kappa_;                       // "kappa" property; weight of the regularization operator
+    double shutter_radius_;              // "training_data_shutter_radius" property; estimated if below 0.0001
+    unsigned int rotations_to_discard_;  // "rotations_to_discard" property; must be even
+
+    bool output_convergence_;  // "output_convergence" property; selects verbose solver output
+    bool is_configured_;       // set once process_config() has built the solver
+
+    // Define conjugate gradient solver
+    cuCgSolver<float_complext> cg_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< cuNonCartesianKtSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Define regularization image operator
+    boost::shared_ptr< cuImageOperator<float_complext> > R_;
+    // Running total of frames handled so far; offsets image_index of emitted images.
+    int frame_counter_;
+  };
+}
+#endif //gpuCgKtSenseGadget
diff --git a/gadgets/sense/gpuCgSenseGadget.cpp b/gadgets/sense/gpuCgSenseGadget.cpp
new file mode 100644
index 0000000..b795ddc
--- /dev/null
+++ b/gadgets/sense/gpuCgSenseGadget.cpp
@@ -0,0 +1,321 @@
+#include "gpuCgSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "Gadgetron.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+
+namespace Gadgetron{
+
+  gpuCgSenseGadget::gpuCgSenseGadget()
+    : is_configured_(false), channels_(0), frame_counter_(0), matrix_size_reported_(false)
+  {
+    // Seed the property table with defaults; process_config() reads the values back.
+    set_parameter("deviceno", "0");
+    set_parameter("setno", "0");
+    set_parameter("sliceno", "0");
+    set_parameter("number_of_iterations", "5");
+    set_parameter("cg_limit", "1e-6");
+    set_parameter("oversampling_factor", "1.25");
+    set_parameter("kernel_width", "5.5");
+    set_parameter("kappa", "0.3");
+
+    const uint64d2 unset_size(0,0); // placeholder; the real sizes are assigned later
+    matrix_size_ = unset_size;
+    matrix_size_os_ = unset_size;
+    matrix_size_seq_ = unset_size;
+  }
+
+  // Nothing to release by hand.
+  gpuCgSenseGadget::~gpuCgSenseGadget() {}
+
+  int gpuCgSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    // Selects the CUDA device, reads the gadget properties and, on the first call, builds the
+    // CG solver. mb carries the ISMRMRD XML header. Returns GADGET_OK or GADGET_FAIL.
+    device_number_ = get_int_value("deviceno");
+
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GADGET_DEBUG1( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) { // wrap an out-of-range request onto an existing device
+      GADGET_DEBUG2("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    pass_on_undesired_data_ = get_bool_value("pass_on_undesired_data");
+    set_number_ = get_int_value("setno");
+    slice_number_ = get_int_value("sliceno");
+    number_of_iterations_ = get_int_value("number_of_iterations");
+    cg_limit_ = get_double_value("cg_limit");
+    oversampling_factor_ = get_double_value("oversampling_factor");
+    kernel_width_ = get_double_value("kernel_width");
+    kappa_ = get_double_value("kappa");
+    output_convergence_ = get_bool_value("output_convergence");
+    output_timing_ = get_bool_value("output_timing");
+    rotations_to_discard_ = get_int_value("rotations_to_discard");
+
+    if( (rotations_to_discard_%2) == 1 ){ // process() drops half of this count at each end
+      GADGET_DEBUG1("#rotations to discard must be even.\n");
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    // The header must describe exactly one encoding space.
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size())); // %d expects an int, not a size_t
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    // Only the reconstruction space of that encoding is needed here.
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    // Its x/y matrix size is the size of the images that process() passes downstream.
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize().x(), r_space.matrixSize().y() );
+
+    if (!is_configured_) {
+      // One-time construction of the operators and the solver.
+      channels_ = (cfg->acquisitionSystemInformation().present() && cfg->acquisitionSystemInformation().get().receiverChannels().present())
+        ? cfg->acquisitionSystemInformation().get().receiverChannels().get() : 1;
+
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );
+
+      // Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      // Allocate regularization image operator
+      R_ = boost::shared_ptr< cuImageOperator<float_complext> >( new cuImageOperator<float_complext>() );
+      R_->set_weight( kappa_ );
+
+      // Setup solver
+      cg_.set_encoding_operator( E_ );        // encoding matrix
+      cg_.add_regularization_operator( R_ );  // regularization matrix
+      cg_.set_preconditioner( D_ );           // preconditioning matrix
+      cg_.set_max_iterations( number_of_iterations_ );
+      cg_.set_tc_tolerance( cg_limit_ );
+      cg_.set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT);
+
+      is_configured_ = true;
+    }
+
+    return GADGET_OK;
+  }
+
+  int gpuCgSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<SenseJob> *m2)
+  {
+    // Reconstructs the Sense job in m2 and queues one complex image per retained frame.
+    // Jobs whose header (m1) belongs to another set/slice are forwarded untouched.
+    // Returns GADGET_OK, GADGET_FAIL, or the putq() result for forwarded messages.
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
+      // Not for this gadget: pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GADGET_DEBUG1("gpuCgSenseGadget::process\n");
+
+    boost::shared_ptr<GPUTimer> process_timer; // only instantiated when "output_timing" is enabled
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSenseGadget::process()") );
+
+    if (!is_configured_) {
+      GADGET_DEBUG1("Data received before configuration was completed\n");
+      return GADGET_FAIL;
+    }
+
+    SenseJob* j = m2->getObjectPtr();
+
+    // Basic validation: every host array dereferenced below must be present
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
+      GADGET_DEBUG1("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    unsigned int samples = j->dat_host_->get_size(0);
+    const size_t num_coords = j->tra_host_->get_number_of_elements();
+    // Reject empty input before dividing: neither num_coords nor the rotation count may be zero below.
+    if( samples == 0 || num_coords == 0 || samples%num_coords ) {
+      GADGET_DEBUG2("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n", 
+                    samples, static_cast<unsigned int>(num_coords));
+      return GADGET_FAIL;
+    }
+    unsigned int rotations = samples / num_coords;
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+
+    unsigned int warp_size = deviceProp.warpSize;
+
+    matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+
+    matrix_size_os_ = // oversampled size, rounded up to the next multiple of the warp size
+      uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+               ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+    if( !matrix_size_reported_ ) { // log the sizes once only; %d expects int, the components are 64-bit
+      GADGET_DEBUG2("Matrix size    : [%d,%d] \n", static_cast<int>(matrix_size_[0]), static_cast<int>(matrix_size_[1]));
+      GADGET_DEBUG2("Matrix size OS : [%d,%d] \n", static_cast<int>(matrix_size_os_[0]), static_cast<int>(matrix_size_os_[1]));
+      matrix_size_reported_ = true;
+    }
+
+    std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+    image_dims.push_back(frames);
+
+    E_->set_domain_dimensions(&image_dims);
+    E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);
+
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image(new cuNDArray<float_complext> (j->reg_host_.get()));
+    R_->compute(reg_image.get());
+
+    // Define preconditioning weights
+    boost::shared_ptr< cuNDArray<float> > weights_real = sum(abs_square(csm.get()).get(), 2);
+    boost::shared_ptr<cuNDArray<float> > R_diag = R_->get();
+    *R_diag *= float(kappa_);
+    *weights_real += *R_diag;
+    R_diag.reset();
+    reciprocal_sqrt_inplace(weights_real.get());
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( weights_real.get() );
+    weights_real.reset();
+    D_->set_weights( precon_weights );
+
+    // Invoke solver
+    // 
+
+    boost::shared_ptr< cuNDArray<float_complext> > cgresult;
+
+    {
+      boost::shared_ptr<GPUTimer> solve_timer;
+      if( output_timing_ )
+        solve_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSenseGadget::solve()") );
+
+      cgresult = cg_.solve(device_samples.get());
+
+      if( output_timing_ )
+        solve_timer.reset();
+    }
+
+    if (!cgresult.get()) {
+      GADGET_DEBUG1("Iterative_sense_compute failed\n");
+      return GADGET_FAIL;
+    }
+
+    /*
+      static int counter = 0;
+      char filename[256];
+      sprintf((char*)filename, "recon_%d.real", counter);
+      write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename );
+      counter++; 
+    */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() );
+
+    // Now pass on the reconstructed images
+    //
+
+    unsigned int frames_per_rotation = frames/rotations;
+
+    if( rotations == 1 ){ // this is the case for golden ratio
+      rotations = frames;
+      frames_per_rotation = 1;
+    }
+
+    for( unsigned int frame=0; frame<frames; frame++ ){
+
+      unsigned int rotation_idx = frame/frames_per_rotation;
+
+      // Check if we should discard this frame (half of rotations_to_discard_ at either end)
+      if( rotation_idx < (rotations_to_discard_>>1) || rotation_idx >= rotations-(rotations_to_discard_>>1) )
+        continue;
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m = 
+        new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm = 
+        new GadgetContainerMessage< hoNDArray< std::complex<float> > >();
+
+      *m->getObjectPtr() = j->image_headers_[frame];
+      m->cont(cm);
+
+      std::vector<size_t> img_dims(2);
+      img_dims[0] = matrix_size_seq_[0];
+      img_dims[1] = matrix_size_seq_[1];
+
+      cm->getObjectPtr()->create(&img_dims);
+
+      size_t data_length = prod(matrix_size_seq_);
+
+      cudaMemcpy(cm->getObjectPtr()->get_data_ptr(),
+                 cgresult->get_data_ptr()+frame*data_length,
+                 data_length*sizeof(std::complex<float>),
+                 cudaMemcpyDeviceToHost);
+
+      cudaError_t err = cudaGetLastError();
+      if( err != cudaSuccess ){
+        GADGET_DEBUG2("Unable to copy result from device to host: %s\n", cudaGetErrorString(err));
+        m->release();
+        return GADGET_FAIL;
+      }
+
+      m->getObjectPtr()->matrix_size[0] = matrix_size_seq_[0];
+      m->getObjectPtr()->matrix_size[1] = matrix_size_seq_[1];
+      m->getObjectPtr()->matrix_size[2] = 1;
+      m->getObjectPtr()->channels       = 1;
+      m->getObjectPtr()->image_index    = frame_counter_ + frame;
+
+      if (this->next()->putq(m) < 0) {
+        GADGET_DEBUG1("Failed to put result image on to queue\n");
+        m->release();
+        return GADGET_FAIL;
+      }
+    }
+
+    frame_counter_ += frames;
+
+    if( output_timing_ )
+      process_timer.reset();
+
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuCgSenseGadget)
+}
diff --git a/gadgets/sense/gpuCgSenseGadget.h b/gadgets/sense/gpuCgSenseGadget.h
new file mode 100644
index 0000000..fd954b7
--- /dev/null
+++ b/gadgets/sense/gpuCgSenseGadget.h
@@ -0,0 +1,71 @@
+#ifndef gpuCgSenseGadget_H
+#define gpuCgSenseGadget_H
+#pragma once
+
+#include "gadgetron_gpusense_export.h"
+#include "Gadget.h"
+#include "SenseJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+
+#include <ismrmrd.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUSENSE gpuCgSenseGadget : public Gadget2<ISMRMRD::ImageHeader, SenseJob>
+  {
+    // Gadget whose process() receives an ISMRMRD::ImageHeader message followed by a SenseJob message.
+  public:
+
+    GADGET_DECLARE(gpuCgSenseGadget);
+
+    gpuCgSenseGadget();
+    virtual ~gpuCgSenseGadget();
+
+  protected:
+    // m1 carries the image header, m2 the SenseJob; both return GADGET_OK or GADGET_FAIL.
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader > *m1, GadgetContainerMessage< SenseJob > *m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    int channels_;
+    int device_number_;
+    int set_number_;
+    int slice_number_;
+
+    uint64d2 matrix_size_;
+    uint64d2 matrix_size_os_;
+    uint64d2 matrix_size_seq_; // dimensions of the images that process() passes downstream
+
+    unsigned int number_of_iterations_;
+    double cg_limit_;
+    double oversampling_factor_;
+    double kernel_width_;
+    double kappa_;
+    unsigned int rotations_to_discard_; // process() skips frames in the first and last rotations_to_discard_/2 rotations
+
+    bool output_convergence_;
+    bool output_timing_;
+    bool matrix_size_reported_;
+    bool is_configured_;
+
+    // Define conjugate gradient solver
+    cuCgSolver<float_complext> cg_;
+
+    // Define non-Cartesian Sense Encoding operator
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Define regularization image operator
+    boost::shared_ptr< cuImageOperator<float_complext> > R_;
+    // Running frame count; process() adds it to the frame number to form each outgoing image_index
+    unsigned int frame_counter_;
+  };
+}
+#endif //gpuCgSenseGadget
diff --git a/gadgets/sense/gpuGenericSensePrepGadget.cpp b/gadgets/sense/gpuGenericSensePrepGadget.cpp
new file mode 100644
index 0000000..3815f23
--- /dev/null
+++ b/gadgets/sense/gpuGenericSensePrepGadget.cpp
@@ -0,0 +1,948 @@
+#include "gpuGenericSensePrepGadget.h"
+#include "Gadgetron.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "SenseJob.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "hoNDArray_utils.h"
+#include "vector_td_operators.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "check_CUDA.h"
+#include "hoNDArray_fileio.h"
+
+#include <algorithm>
+#include <vector>
+#include <cmath>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  gpuGenericSensePrepGadget::gpuGenericSensePrepGadget()
+    : slices_(-1), sets_(-1), device_number_(-1), samples_per_readout_(-1)
+  {
+    // Fallback settings, registered in this order, for parameters that the
+    // configuration may leave unspecified. Each row is { name, value }.
+    static const char* const fallback[][2] = {
+      { "deviceno", "0" },
+      { "rotations_per_reconstruction", "0" },
+      { "propagate_csm_from_set", "-1" },
+      { "buffer_length_in_rotations", "0" },
+      { "buffer_using_solver", "false" },
+      { "buffer_convolution_kernel_width", "5.5" },
+      { "buffer_convolution_oversampling_factor", "1.25" },
+      { "reconstruction_os_factor_x", "1.0" },
+      { "reconstruction_os_factor_y", "1.0" }
+    };
+    for( size_t i=0; i<sizeof(fallback)/sizeof(fallback[0]); ++i )
+      set_parameter(fallback[i][0], fallback[i][1]);
+  }
+  
+  gpuGenericSensePrepGadget::~gpuGenericSensePrepGadget() {}
+  
+  int gpuGenericSensePrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Reads the gadget parameters and the ISMRMRD XML header carried by 'mb', validates them,
+    // selects the CUDA device and allocates the per slice/set queues, counters and buffers.
+    // Returns GADGET_OK on success, GADGET_FAIL on any invalid or unsupported configuration.
+    device_number_ = get_int_value(std::string("deviceno").c_str());
+    rotations_per_reconstruction_ = get_int_value(std::string("rotations_per_reconstruction").c_str());
+    buffer_length_in_rotations_ = get_int_value(std::string("buffer_length_in_rotations").c_str());
+    buffer_using_solver_ = get_bool_value(std::string("buffer_using_solver").c_str());
+    output_timing_ = get_bool_value(std::string("output_timing").c_str());
+
+    // Currently there are some restrictions on the allowed sliding window configurations
+    // (checked right after the two window sizes have been read)
+    
+    sliding_window_readouts_ = get_int_value(std::string("sliding_window_readouts").c_str());
+    sliding_window_rotations_ = get_int_value(std::string("sliding_window_rotations").c_str());
+
+    if( sliding_window_readouts_>0 && sliding_window_rotations_>0 ){
+      GADGET_DEBUG1( "Error: Sliding window reconstruction is not yet supported for both readouts and frames simultaneously.\n" );
+      return GADGET_FAIL;
+    }
+
+    if( sliding_window_readouts_>0 && rotations_per_reconstruction_>0 ){
+      GADGET_DEBUG1( "Error: Sliding window reconstruction over readouts is not yet supported for multiframe reconstructions.\n" );
+      return GADGET_FAIL;
+    }
+    
+    if( sliding_window_rotations_ > 0 && sliding_window_rotations_ >= rotations_per_reconstruction_ ){
+      GADGET_DEBUG1( "Error: Illegal sliding window configuration.\n" );
+      return GADGET_FAIL;
+    }
+
+    // Setup and validate device configuration
+    // (a device number beyond the available devices wraps around)
+
+    int number_of_devices;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GADGET_DEBUG1( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GADGET_DEBUG2("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+    
+    unsigned int warp_size = deviceProp.warpSize;
+
+    // It is possible to specify one set to use for csm propagation, and then propagate this to all sets
+    // (a negative value disables propagation; only set 0 is accepted as the source for now)
+
+    propagate_csm_from_set_ = get_int_value(std::string("propagate_csm_from_set").c_str());
+
+    if( propagate_csm_from_set_ > 0 ){
+      GADGET_DEBUG2("Currently, only set 0 can propagate coil sensitivity maps. Set %d was specified.\n", propagate_csm_from_set_ );
+      return GADGET_FAIL;
+    }
+
+    if( propagate_csm_from_set_ >= 0 ){
+      GADGET_DEBUG2("Propagating csm from set %d to all sets\n", propagate_csm_from_set_ );
+    }
+
+    // Convolution kernel width and oversampling ratio (for the buffer)
+    // (oversampling_factor_ is re-derived below once the matrix sizes have been rounded)
+
+    kernel_width_ = get_double_value(std::string("buffer_convolution_kernel_width").c_str());
+    oversampling_factor_ = get_double_value(std::string("buffer_convolution_oversampling_factor").c_str());
+
+    // Get the Ismrmrd header
+    // (the XML text is taken from the message block's read pointer)
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+    
+    if( cfg.get() == 0x0 ){
+      GADGET_DEBUG1("Unable to parse Ismrmrd header\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size())); // %d expects an int
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    // Matrix sizes (as a multiple of the GPU's warp size)
+    // - the recon and oversampled recon sizes are rounded up to the next multiple of warp_size
+    
+    image_dimensions_.push_back(e_space.matrixSize().x());
+    image_dimensions_.push_back(e_space.matrixSize().y());
+
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize().x()*get_double_value(std::string("reconstruction_os_factor_x").c_str())))+warp_size-1)/warp_size)*warp_size);  
+
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize().y()*get_double_value(std::string("reconstruction_os_factor_y").c_str())))+warp_size-1)/warp_size)*warp_size);
+    
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+    
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]); 
+    
+    GADGET_DEBUG2("matrix_size_x : %d, recon: %d, recon_os: %d\n", static_cast<int>(image_dimensions_[0]),
+		  static_cast<int>(image_dimensions_recon_[0]), static_cast<int>(image_dimensions_recon_os_[0]));
+
+    GADGET_DEBUG2("matrix_size_y : %d, recon: %d, recon_os: %d\n", static_cast<int>(image_dimensions_[1]),
+		  static_cast<int>(image_dimensions_recon_[1]), static_cast<int>(image_dimensions_recon_os_[1]));
+    
+    fov_.push_back(r_space.fieldOfView_mm().x());
+    fov_.push_back(r_space.fieldOfView_mm().y());
+    fov_.push_back(r_space.fieldOfView_mm().z());
+
+    slices_ = e_limits.slice().present() ? e_limits.slice().get().maximum() + 1 : 1;
+    sets_ = e_limits.set().present() ? e_limits.set().get().maximum() + 1 : 1;
+    
+    // Allocate readout and trajectory queues
+    // - one queue for the currently incoming frame
+    // - one queue for the upcoming reconstruction
+
+    frame_readout_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_readout_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    frame_traj_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    recon_traj_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    image_headers_queue_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+    
+    size_t bsize = sizeof(GadgetContainerMessage< hoNDArray< std::complex<float> > >)*image_dimensions_[0]*10;
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      frame_readout_queue_[i].high_water_mark(bsize);
+      frame_readout_queue_[i].low_water_mark(bsize);
+      frame_traj_queue_[i].high_water_mark(bsize);
+      frame_traj_queue_[i].low_water_mark(bsize);
+    }
+    
+    bsize *= (rotations_per_reconstruction_+1);
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      recon_readout_queue_[i].high_water_mark(bsize);
+      recon_readout_queue_[i].low_water_mark(bsize);
+      recon_traj_queue_[i].high_water_mark(bsize);
+      recon_traj_queue_[i].low_water_mark(bsize);
+    }
+    
+    // Define various per slice/set variables
+    // (one array entry per slice/set combination)
+
+    previous_readout_no_ = boost::shared_array<long>(new long[slices_*sets_]);
+    acceleration_factor_ = boost::shared_array<long>(new long[slices_*sets_]);
+    image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+    readout_counter_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    readout_counter_global_= boost::shared_array<long>(new long[slices_*sets_]);
+    readouts_per_frame_= boost::shared_array<long>(new long[slices_*sets_]);
+    frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_frames_per_rotation_= boost::shared_array<long>(new long[slices_*sets_]);
+    buffer_update_needed_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    reconfigure_ = boost::shared_array<bool>(new bool[slices_*sets_]);
+    num_coils_ = boost::shared_array<unsigned int>(new unsigned int[slices_*sets_]);
+    
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+
+      previous_readout_no_[i] = -1;
+      acceleration_factor_[i] = -1;
+      image_counter_[i] = 0;
+      readout_counter_frame_[i] = 0;
+      readout_counter_global_[i] = 0;
+      readouts_per_frame_[i] = get_int_value(std::string("readouts_per_frame").c_str());
+      frames_per_rotation_[i] = get_int_value(std::string("frames_per_rotation").c_str());
+      buffer_frames_per_rotation_[i] = get_int_value(std::string("buffer_frames_per_rotation").c_str());
+      num_coils_[i] = 0;
+      buffer_update_needed_[i] = true;
+      reconfigure_[i] = true;
+
+      // Assign some default values ("upper bound estimates") of the (possibly) unknown entities
+      // (applied when the corresponding parameter reads as 0)
+      
+      if( readouts_per_frame_[i] == 0 ){
+	readouts_per_frame_[i] = image_dimensions_[0];
+      }
+      
+      if( frames_per_rotation_[i] == 0 ){
+	frames_per_rotation_[i] = image_dimensions_[0]/readouts_per_frame_[i];
+      }
+
+      // Also remember to set the high/low water marks of the ISMRMRD image header queue
+      // (100 header messages for every frame of a reconstruction, at least one frame)
+
+      bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*
+	std::max(1L, frames_per_rotation_[i]*rotations_per_reconstruction_);
+    
+      image_headers_queue_[i].high_water_mark(bsize);
+      image_headers_queue_[i].low_water_mark(bsize);
+    }
+
+    // If need be the following limitation can be lifted, but it would be a little tedious... 
+    // (like every other configuration error in this function, it aborts the configuration)
+
+    if( buffer_using_solver_ && rotations_per_reconstruction_ < 1 ) {
+      GADGET_DEBUG1("Error: when buffering using a cg solver, 'rotations_per_reconstruction' must be specified (and strictly positive).\n");
+      return GADGET_FAIL;
+    }
+    if( buffer_using_solver_ && ( buffer_frames_per_rotation_[0] > 0 || buffer_length_in_rotations_ > 0 ) ){
+      GADGET_DEBUG1("Error: when buffering using a cg solver, we currently do not support specification of 'buffer_frames_per_rotation' or 'buffer_length_in_rotations'. These values are instead automatically set to match the reconstruction settings.\n");
+      return GADGET_FAIL;
+    }
+            
+    position_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    read_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    phase_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+    slice_dir_ = boost::shared_array<float[3]>(new float[slices_*sets_][3]);
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      (position_[i])[0] = (position_[i])[1] = (position_[i])[2] = 0.0f;
+      (read_dir_[i])[0] = (read_dir_[i])[1] = (read_dir_[i])[2] = 0.0f;
+      (phase_dir_[i])[0] = (phase_dir_[i])[1] = (phase_dir_[i])[2] = 0.0f;
+      (slice_dir_[i])[0] = (slice_dir_[i])[1] = (slice_dir_[i])[2] = 0.0f;
+    }
+
+    // Allocate accumulation buffer
+    // (only the variant selected by 'buffer_using_solver' is allocated)
+
+    if( buffer_using_solver_ )
+      acc_buffer_cg_ = boost::shared_array< cuSenseBufferCg<float,2> >(new cuSenseBufferCg<float,2>[slices_*sets_]);
+    else
+      acc_buffer_ = boost::shared_array< cuSenseBuffer<float,2> >(new cuSenseBuffer<float,2>[slices_*sets_]);
+    
+    // Allocate remaining shared_arrays
+    // (host copies of the csm and regularization images; they are filled in process())
+    
+    csm_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+    reg_host_ = boost::shared_array< hoNDArray<float_complext> >(new hoNDArray<float_complext>[slices_*sets_]);
+
+    return GADGET_OK;
+  }
+
+  int gpuGenericSensePrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,           // header
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,   // data
+	  GadgetContainerMessage< hoNDArray<float> > *m3)                   // traj/dcw
+  {
+    // Buffers one readout (m1/m2/m3), updates the accumulation buffer at the end of each frame and
+    // passes a SenseJob downstream once a full reconstruction is queued. Returns GADGET_OK/GADGET_FAIL.
+    // Noise readouts are dropped: noise should have been consumed by the noise adjust (if in the chain).
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    if (is_noise) { 
+      m1->release();
+      return GADGET_OK;
+    }
+
+    // Setup timer if asked for
+    // (it is released again just before the successful return)
+
+    boost::shared_ptr<GPUTimer> process_timer;
+    if( output_timing_ )
+      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuGenericSensePrepGadget::process()") );
+
+    // Some convenient utility variables
+    // (idx selects the per slice/set state that belongs to this readout)
+
+    unsigned int set = m1->getObjectPtr()->idx.set;
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int readout = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    unsigned int idx = set*slices_+slice;
+
+    // Get a pointer to the accumulation buffer. 
+    // (either the cg-based or the plain one, as selected by buffer_using_solver_)
+
+    cuSenseBuffer<float,2> *acc_buffer = 
+      (buffer_using_solver_) ? &acc_buffer_cg_[idx] : &acc_buffer_[idx];
+
+    // Has the imaging plane changed?
+    // (position or any of the read/phase/slice directions differs from the stored values)
+
+    if( !vec_equal(position_[idx], m1->getObjectPtr()->position) ||
+	!vec_equal(read_dir_[idx], m1->getObjectPtr()->read_dir) || 
+	!vec_equal(phase_dir_[idx], m1->getObjectPtr()->phase_dir) ||
+	!vec_equal(slice_dir_[idx], m1->getObjectPtr()->slice_dir) ){
+      
+      // Yes indeed, clear the accumulation buffer and update structs
+      // (the csm/regularization images must then be recomputed as well)
+
+      acc_buffer->clear();
+      buffer_update_needed_[idx] = true;
+      
+      memcpy(position_[idx],m1->getObjectPtr()->position,3*sizeof(float));
+      memcpy(read_dir_[idx],m1->getObjectPtr()->read_dir,3*sizeof(float));
+      memcpy(phase_dir_[idx],m1->getObjectPtr()->phase_dir,3*sizeof(float));
+      memcpy(slice_dir_[idx],m1->getObjectPtr()->slice_dir,3*sizeof(float));
+    }
+    
+    // Only when the first readout arrives, do we know the #samples/readout
+    // (any later change of the readout length is treated as an error)
+
+    if( samples_per_readout_ == -1 )      
+      samples_per_readout_ = m1->getObjectPtr()->number_of_samples;
+    
+    if( samples_per_readout_ != m1->getObjectPtr()->number_of_samples ){
+      GADGET_DEBUG1("Unexpected change in the readout length\n");
+      return GADGET_FAIL;
+    }
+    
+    bool new_frame_detected = false;
+
+    // Reconfigure at first pass
+    // - or if the number of coil changes
+    // - or if the reconfigure_ flag is set
+
+    if( num_coils_[idx] != m1->getObjectPtr()->active_channels ){
+      GADGET_DEBUG1("Reconfiguring (the number of coils changed)\n");
+      num_coils_[idx] = m1->getObjectPtr()->active_channels;
+      reconfigure(set, slice);
+    }
+
+    if( reconfigure_[idx] ){
+      GADGET_DEBUG1("Reconfiguring (due to boolean indicator)\n");
+      reconfigure(set, slice);
+    }
+
+    // Keep track of the incoming readout ids
+    // - to determine the number of readouts per frame
+    // - to determine the number of frames per rotation
+
+    if (previous_readout_no_[idx] >= 0) {
+
+      if ( readout > previous_readout_no_[idx]) { 
+	// This is not the last readout in the frame.
+	// Make an estimate of the acceleration factor
+	// (the step between two consecutive readout ids)
+	
+	long tmp_accel = readout - previous_readout_no_[idx];
+
+	if( acceleration_factor_[idx] != tmp_accel )
+	  GADGET_DEBUG2("Detected an acceleration factor of %d\n", static_cast<int>(tmp_accel)); // %d expects an int
+	
+	acceleration_factor_[idx] = tmp_accel;
+      }
+      else{ 
+
+	// This is the first readout in a new frame
+	// (its readout id did not increase relative to the previous readout)
+
+	if( get_int_value(std::string("readouts_per_frame").c_str()) == 0 &&
+	    readout_counter_frame_[idx] > 0 &&
+	    readout_counter_frame_[idx] != readouts_per_frame_[idx] ){ 
+
+	  // A new acceleration factor is detected
+	  // (the frame just ended holds a different number of readouts than assumed so far)
+
+	  GADGET_DEBUG1("Reconfiguring (acceleration factor changed)\n");
+
+	  new_frame_detected = true;
+	  readouts_per_frame_[idx] = readout_counter_frame_[idx];
+
+	    // Assume that #frames/rotation equals the acceleration factor
+	    // If not, or if we cannot deduce the acceleration factor from the difference
+	    // of two subsequent readout ids, then 'frames_per_rotation' have to be specified in the config...
+	    //
+	    
+	  if( get_int_value(std::string("frames_per_rotation").c_str()) == 0 ) {
+	    frames_per_rotation_[idx] = acceleration_factor_[idx];
+	  }
+	  reconfigure(set, slice);
+	}
+      }
+    }
+    previous_readout_no_[idx] = readout;
+
+    // Enqueue readout
+    // - unless 'new_frame_detected', then the current readout does not belong to the current frame and we delay enqueuing
+
+    if( !new_frame_detected ) {
+      
+      // Memory handling is easier if we make copies for our internal queues
+      frame_readout_queue_[idx].enqueue_tail(duplicate_array(m2));
+      recon_readout_queue_[idx].enqueue_tail(duplicate_array(m2));
+      frame_traj_queue_[idx].enqueue_tail(duplicate_array(m3));
+      recon_traj_queue_[idx].enqueue_tail(duplicate_array(m3));
+    }
+
+    // If the readout is the last of a "true frame" (ignoring any sliding window readouts)
+    // - then update the accumulation buffer
+
+    bool is_last_readout_in_frame = (readout_counter_frame_[idx] == readouts_per_frame_[idx]-1);
+    is_last_readout_in_frame |= new_frame_detected;
+
+    cuNDArray<floatd2> traj;
+    cuNDArray<float> dcw;
+    
+    if( is_last_readout_in_frame ){
+
+      // Get ready to update the csm/regularization buffer
+      // (this drains the frame queues; the recon queues are left untouched here)
+
+      // Extract this frame's samples 
+      // (and upload them to the device)
+
+      boost::shared_ptr< hoNDArray<float_complext> > host_samples = 
+	extract_samples_from_queue( &frame_readout_queue_[idx], false, set, slice );
+            
+      cuNDArray<float_complext> samples( host_samples.get() );
+
+      // Extract this frame's trajectory and dcw.
+      // (a single frame of samples_per_readout_*readouts_per_frame_ samples)
+
+      extract_trajectory_and_dcw_from_queue( &frame_traj_queue_[idx], false, set, slice, 
+					     samples_per_readout_*readouts_per_frame_[idx], 1,
+					     &traj, &dcw );
+
+      // Scale dcw weights to the area of the oversampled recon matrix size
+      float scale_factor = float(prod(image_dimensions_recon_os_))/asum(&dcw);
+      dcw *= scale_factor;
+      
+      // Add this frame to the buffer
+      // (add_frame_data() reports whether the csm/regularization images need a refresh)
+
+      acc_buffer->set_dcw(boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(&dcw)));
+      buffer_update_needed_[idx] |= acc_buffer->add_frame_data( &samples, &traj );
+    }
+
+    // Are we ready to reconstruct (downstream)?
+    // (yes, once the recon queue holds exactly the readouts of one reconstruction)
+
+    long readouts_per_reconstruction = readouts_per_frame_[idx];
+
+    if( rotations_per_reconstruction_ > 0 )
+      readouts_per_reconstruction *= (frames_per_rotation_[idx]*rotations_per_reconstruction_);
+    
+    bool is_last_readout_in_reconstruction = ( recon_readout_queue_[idx].message_count() == readouts_per_reconstruction );
+
+    // Prepare the image header for this frame
+    // - if this is indeed the last profile of a new frame
+    // - or if we are about to reconstruct due to 'sliding_window_readouts_' > 0
+    
+    if( is_last_readout_in_frame || 
+	(is_last_readout_in_reconstruction && image_headers_queue_[idx].message_count() == 0) ){
+      
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *header = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+      ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+
+      {
+	// Initialize header to all zeroes (there are a few fields we do not set yet)
+	ISMRMRD::ImageHeader tmp = {0};
+	*(header->getObjectPtr()) = tmp;
+      }
+
+      header->getObjectPtr()->version = base_head->version;
+
+      header->getObjectPtr()->matrix_size[0] = image_dimensions_recon_[0];
+      header->getObjectPtr()->matrix_size[1] = image_dimensions_recon_[1];
+      header->getObjectPtr()->matrix_size[2] = std::max(1L,frames_per_rotation_[idx]*rotations_per_reconstruction_);
+
+      header->getObjectPtr()->field_of_view[0] = fov_[0];
+      header->getObjectPtr()->field_of_view[1] = fov_[1];
+      header->getObjectPtr()->field_of_view[2] = fov_[2];
+
+      header->getObjectPtr()->channels = num_coils_[idx];
+      header->getObjectPtr()->slice = base_head->idx.slice;
+      header->getObjectPtr()->set = base_head->idx.set;
+
+      header->getObjectPtr()->acquisition_time_stamp = base_head->acquisition_time_stamp;
+      memcpy(header->getObjectPtr()->physiology_time_stamp, base_head->physiology_time_stamp, sizeof(uint32_t)*ISMRMRD_PHYS_STAMPS);
+
+      memcpy(header->getObjectPtr()->position, base_head->position, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->read_dir, base_head->read_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->phase_dir, base_head->phase_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->slice_dir, base_head->slice_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+
+      header->getObjectPtr()->image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+      header->getObjectPtr()->image_index = image_counter_[idx]++; 
+      header->getObjectPtr()->image_series_index = idx;
+
+      image_headers_queue_[idx].enqueue_tail(header);
+    }
+    
+    // If it is time to reconstruct (downstream) then prepare the Sense job
+    // (samples, trajectory, dcw, csm, regularization image and the queued image headers)
+
+    if( is_last_readout_in_reconstruction ){
+      
+      // Update csm and regularization images if the buffer has changed (completed a cycle) 
+      // - and at the first pass
+
+      if( buffer_update_needed_[idx] || 
+	  csm_host_[idx].get_number_of_elements() == 0 || 
+	  reg_host_[idx].get_number_of_elements() == 0 ){
+
+	// Get the accumulated coil images
+	// (they are the input to the csm estimation below)
+
+	boost::shared_ptr< cuNDArray<float_complext> > csm_data = acc_buffer->get_accumulated_coil_images();
+
+ 	// Estimate CSM
+	// (unless this set is configured to reuse the csm propagated from another set)
+
+	if( propagate_csm_from_set_ < 0 || propagate_csm_from_set_ == set ){	  	  
+	  csm_ = estimate_b1_map<float,2>( csm_data.get() );
+	}
+	else{
+	  GADGET_DEBUG2("Set %d is reusing the csm from set %d\n", set, propagate_csm_from_set_);
+	  if( csm_.get() == 0x0 ){
+	    GADGET_DEBUG1("Error: csm has not been computed, cannot propagate\n");
+	    return GADGET_FAIL;
+	  }	  
+	}
+
+	acc_buffer->set_csm(csm_);
+	csm_host_[idx] = *(csm_->to_host());
+	
+	// Compute regularization image
+	// (the cg-based buffer first needs the trajectory and dcw of the whole reconstruction)
+
+	boost::shared_ptr< cuNDArray<float_complext> > reg_image;
+	std::vector<size_t> dims;
+    	
+	if( buffer_using_solver_ ){
+
+	  //GPUTimer timer("\n\n AVOIDABLE PREPROCESSING. HOW EXPENSIVE?\n\n");
+
+	  extract_trajectory_and_dcw_from_queue( &recon_traj_queue_[idx], true, set, slice, 
+						 samples_per_readout_*readouts_per_frame_[idx],
+						 std::max(1L, frames_per_rotation_[idx]*rotations_per_reconstruction_),
+						 &traj, &dcw );
+
+	  // Scale dcw weights to the area of the oversampled recon matrix size
+	  float scale_factor = float(prod(image_dimensions_recon_os_))/asum(&dcw);
+	  dcw *= scale_factor;
+
+	  dims = *traj.get_dimensions();
+
+ 	  std::vector<size_t> tmp_dims;
+	  tmp_dims.push_back(dims[0]*dims[1]);
+	  tmp_dims.push_back(1);
+	  
+	  traj.reshape(&tmp_dims);
+	  dcw.reshape(&tmp_dims);
+	  
+	  ((cuSenseBufferCg<float,2>*)acc_buffer)->preprocess(&traj);
+	  ((cuSenseBufferCg<float,2>*)acc_buffer)->set_dcw_for_rhs(boost::shared_ptr< cuNDArray<float> >(new cuNDArray<float>(&dcw)));
+	}
+
+	reg_image = acc_buffer->get_combined_coil_image();	
+	reg_host_[idx] = *(reg_image->to_host());
+	
+	if( buffer_using_solver_ ){
+	  traj.reshape(&dims);
+	  dcw.reshape(&dims);
+	}
+	
+	/*
+	static int counter = 0;
+	char filename[256];
+	sprintf((char*)filename, "reg_%d.cplx", counter);
+	write_nd_array<float_complext>( &reg_host_[idx], filename );
+	counter++; */
+
+	buffer_update_needed_[idx] = false;
+      }
+
+      // Prepare data array for the downstream reconstruction
+      // (in sliding window mode part of the readouts stay queued for the next reconstruction)
+      
+      boost::shared_ptr< hoNDArray<float_complext> > samples_host = 
+	extract_samples_from_queue( &recon_readout_queue_[idx], true, set, slice );
+      
+      // Prepare the trajectory and dcw arrays.
+      // When 'buffer_using_solver_' is true they were already extracted in the csm/regularization
+      // update above. NOTE(review): that only holds if the update branch was taken for this
+      // reconstruction -- confirm that this is always the case with a cg-based buffer.
+      
+      if( !(/*rotations_per_reconstruction_ == 0 ||*/ buffer_using_solver_) ){
+      	extract_trajectory_and_dcw_from_queue( &recon_traj_queue_[idx], true, set, slice, 
+					       samples_per_readout_*readouts_per_frame_[idx],
+					       std::max(1L, frames_per_rotation_[idx]*rotations_per_reconstruction_),
+					       &traj, &dcw );
+      }
+
+      // Set up the Sense job
+      // (everything is handed over as host-side copies)
+
+      GadgetContainerMessage< SenseJob > *sj = new GadgetContainerMessage<SenseJob>();
+      	
+      sj->getObjectPtr()->dat_host_ = samples_host;      
+      sj->getObjectPtr()->tra_host_ = traj.to_host();
+      sj->getObjectPtr()->dcw_host_ = dcw.to_host();
+      sj->getObjectPtr()->csm_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(csm_host_[idx]));
+      sj->getObjectPtr()->reg_host_ = boost::shared_ptr< hoNDArray<float_complext> >( new hoNDArray<float_complext>(reg_host_[idx]));
+      
+      // Pull the image headers out of the queue
+      // (exactly one header per frame of the reconstruction is expected)
+
+      long frames_per_reconstruction = 
+	std::max( 1L, frames_per_rotation_[idx]*rotations_per_reconstruction_ );
+      
+      if( image_headers_queue_[idx].message_count() != frames_per_reconstruction ){
+	sj->release();
+	GADGET_DEBUG2("Unexpected size of image header queue: %d, %d\n", // %d expects int arguments
+		      static_cast<int>(image_headers_queue_[idx].message_count()), static_cast<int>(frames_per_reconstruction));
+	return GADGET_FAIL;
+      }
+      
+      sj->getObjectPtr()->image_headers_ =
+      boost::shared_array<ISMRMRD::ImageHeader>( new ISMRMRD::ImageHeader[frames_per_reconstruction] );
+      
+      for( unsigned int i=0; i<frames_per_reconstruction; i++ ){	
+
+	ACE_Message_Block *mbq;
+
+	if( image_headers_queue_[idx].dequeue_head(mbq) < 0 ) {
+	  sj->release();
+	  GADGET_DEBUG1("Image header dequeue failed\n");
+	  return GADGET_FAIL;
+	}
+	
+	GadgetContainerMessage<ISMRMRD::ImageHeader> *m = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+	sj->getObjectPtr()->image_headers_[i] = *m->getObjectPtr();
+
+	// In sliding window mode the header might need to go back at the end of the queue for reuse
+	// (this applies to the headers of the last 'sliding_window_rotations_' rotations)
+	
+	if( i >= frames_per_reconstruction-sliding_window_rotations_*frames_per_rotation_[idx] ){
+	  image_headers_queue_[idx].enqueue_tail(m);
+	}
+	else {
+	  m->release();
+	}
+      }
+      
+      // The Sense Job needs an image header as well. 
+      // Let us just copy the initial one...
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m4 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+
+      *m4->getObjectPtr() = sj->getObjectPtr()->image_headers_[0];
+      m4->cont(sj);
+
+      // Pass the Sense job downstream
+      // (m4 is released here if the next gadget does not accept it)
+      
+      if (this->next()->putq(m4) < 0) {
+	GADGET_DEBUG1("Failed to put job on queue.\n");
+	m4->release();
+	return GADGET_FAIL;
+      }
+    }
+    
+    if( is_last_readout_in_frame )
+      readout_counter_frame_[idx] = 0;
+    else{
+      readout_counter_frame_[idx]++;
+    }
+
+    if( new_frame_detected ){
+
+      // The incoming profile was actually the first readout of the next frame, enqueue.
+      // (and count it as the first readout of that frame)
+
+      frame_readout_queue_[idx].enqueue_tail(duplicate_array(m2));
+      recon_readout_queue_[idx].enqueue_tail(duplicate_array(m2)); 
+      frame_traj_queue_[idx].enqueue_tail(duplicate_array(m3));
+      recon_traj_queue_[idx].enqueue_tail(duplicate_array(m3)); 
+
+      readout_counter_frame_[idx]++;
+    }
+
+    readout_counter_global_[idx]++;
+
+    if( output_timing_ )
+      process_timer.reset();
+    
+    m1->release(); // this is safe, the internal queues hold copies
+    return GADGET_OK;
+  }
+  
+  boost::shared_ptr< hoNDArray<float_complext> > 
+  gpuGenericSensePrepGadget::extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+							  bool sliding_window, unsigned int set, unsigned int slice )
+  {    
+    // Drains 'queue' into one host array of size (samples_per_readout_*#readouts) x #coils, with the
+    // readouts stored back to back for each coil. With 'sliding_window' set, the trailing readouts
+    // that belong to the sliding window are put back on the queue instead of being released.
+    // Throws std::runtime_error on a null queue, a failed dequeue or an unexpected message type.
+    if(!queue) {
+      GADGET_DEBUG1("Illegal queue pointer, cannot extract samples\n");
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_samples_from_queue: illegal queue pointer");	
+    }
+    const unsigned int idx = set*slices_+slice; // index of the per slice/set state, as in process()
+    unsigned int readouts_buffered = queue->message_count();
+    std::vector<size_t> dims;
+    dims.push_back(samples_per_readout_*readouts_buffered);
+    dims.push_back(num_coils_[idx]);
+    boost::shared_ptr< hoNDArray<float_complext> > host_samples(new hoNDArray<float_complext>(&dims));
+
+    // Number of trailing readouts that stay queued in sliding window mode (loop invariant)
+    long readouts_in_sliding_window = sliding_window_readouts_ + 
+	readouts_per_frame_[idx]*frames_per_rotation_[idx]*sliding_window_rotations_;
+    
+    for (unsigned int p=0; p<readouts_buffered; p++) {
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+	GADGET_DEBUG1("Message dequeue failed\n");
+	throw std::runtime_error("gpuGenericSensePrepGadget::extract_samples_from_queue: dequeing failed");	
+      }
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *daq = AsContainerMessage<hoNDArray< std::complex<float> > >(mbq);
+      if (!daq) {
+	GADGET_DEBUG1("Unable to interpret data on message queue\n");
+	mbq->release(); // the dequeued block is ours; do not leak it when bailing out
+	throw std::runtime_error("gpuGenericSensePrepGadget::extract_samples_from_queue: failed to interpret data");	
+      }
+	
+      // Copy this readout, coil by coil, into its slot of the output array
+      for (unsigned int c = 0; c < num_coils_[idx]; c++) {
+	float_complext *data_ptr = host_samples->get_data_ptr();
+	data_ptr += c*samples_per_readout_*readouts_buffered+p*samples_per_readout_;
+	std::complex<float> *r_ptr = daq->getObjectPtr()->get_data_ptr();
+	r_ptr += c*daq->getObjectPtr()->get_size(0);
+	memcpy(data_ptr, r_ptr, samples_per_readout_*sizeof(float_complext));
+      }
+
+      // In sliding window mode the readout might need to go back at the end of the queue
+      if( sliding_window && p >= (readouts_buffered-readouts_in_sliding_window) )
+	queue->enqueue_tail(mbq);
+      else
+	mbq->release();
+    } 
+    return host_samples;
+  }
+  
+  boost::shared_ptr< hoNDArray<float> > 
+  gpuGenericSensePrepGadget::extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+							     bool sliding_window, unsigned int set, unsigned int slice )
+  {
+    // Copies the trajectory/dcw readouts buffered in 'queue' into one host array of dimensions
+    // 3 x samples_per_readout_ x #readouts. Consumed messages are released, except in
+    // sliding window mode where the most recent readouts go back on the queue.
+    // Throws std::runtime_error on invalid input or if a message cannot be dequeued/interpreted.
+    if(!queue) {
+      GADGET_DEBUG1("Illegal queue pointer, cannot extract trajectory\n");
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: illegal queue pointer");
+    }
+
+    if(queue->message_count()==0) {
+      GADGET_DEBUG1("Empty queue, cannot extract trajectory\n");
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: empty queue");
+    }
+
+    if(samples_per_readout_ < 1) {
+      GADGET_DEBUG2("Invalid number of samples per readout (%d), cannot extract trajectory\n", static_cast<int>(samples_per_readout_));
+      throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: invalid number of samples per readout");
+    }
+
+    const unsigned int idx = set*slices_+slice;
+    const unsigned int readouts_buffered = queue->message_count();
+    const long readouts_in_sliding_window = sliding_window_readouts_ + 
+      readouts_per_frame_[idx]*frames_per_rotation_[idx]*sliding_window_rotations_; // loop invariant
+
+    std::vector<size_t> dims;
+    dims.push_back(3);
+    dims.push_back(samples_per_readout_);
+    dims.push_back(readouts_buffered);
+    boost::shared_ptr< hoNDArray<float> > host_samples(new hoNDArray<float>(&dims));
+
+    for (unsigned int p=0; p<readouts_buffered; p++) {
+      ACE_Message_Block* mbq;
+      if (queue->dequeue_head(mbq) < 0) {
+	GADGET_DEBUG1("Message dequeue failed\n");
+	throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: dequeing failed");
+      }
+
+      GadgetContainerMessage< hoNDArray<float> > *daq = AsContainerMessage<hoNDArray<float> >(mbq);
+      if (!daq) {
+	GADGET_DEBUG1("Unable to interpret data on message queue\n");
+	mbq->release(); // the block was dequeued and is ours; do not leak it when bailing out
+	throw std::runtime_error("gpuGenericSensePrepGadget::extract_trajectory_from_queue: failed to interpret data");
+      }
+
+      // Readout p occupies 3*samples_per_readout_ consecutive floats in the output
+      float *data_ptr = host_samples->get_data_ptr();
+      data_ptr += 3*samples_per_readout_*p;
+      float *r_ptr = daq->getObjectPtr()->get_data_ptr();
+      memcpy(data_ptr, r_ptr, 3*samples_per_readout_*sizeof(float));
+
+      // In sliding window mode the readout might need to go back at the end of the queue
+      if( sliding_window && p >= (readouts_buffered-readouts_in_sliding_window) )
+	queue->enqueue_tail(mbq);
+      else
+	mbq->release();
+    }
+
+    return host_samples;
+  }
+  
+  void gpuGenericSensePrepGadget::extract_trajectory_and_dcw_from_queue
+  ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, bool sliding_window, unsigned int set, unsigned int slice, 
+    unsigned int samples_per_frame, unsigned int num_frames,
+    cuNDArray<floatd2> *traj, cuNDArray<float> *dcw )
+  {
+    // Extract trajectory and dcw.
+    // They are stored as a float array of dimensions: 3 x #samples_per_readout x #readouts.
+    // We need
+    // - a floatd2 trajectory array 
+    // - a float dcw array 
+    // Both outputs ('traj' and 'dcw') are overwritten and reshaped to samples_per_frame x num_frames.
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj_dcw =
+      extract_trajectory_from_queue( queue, sliding_window, set, slice );
+    // Move the component dimension last: (3, samples, readouts) -> (samples, readouts, 3)
+    std::vector<size_t> order;
+    order.push_back(1); order.push_back(2); order.push_back(0);
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj_dcw_shifted =
+      permute( host_traj_dcw.get(), &order );
+    
+    std::vector<size_t> dims_1d;
+    dims_1d.push_back(host_traj_dcw_shifted->get_size(0)*host_traj_dcw_shifted->get_size(1));
+    // The third component plane holds the dcw values
+    {
+      hoNDArray<float> tmp(&dims_1d, host_traj_dcw_shifted->get_data_ptr()+2*dims_1d[0]);
+      *dcw = tmp;
+    }
+    // The first two component planes hold the trajectory coordinates
+    std::vector<size_t> dims_2d = dims_1d;
+    dims_2d.push_back(2);
+    
+    order.clear();
+    order.push_back(1); order.push_back(0);
+    // Copy into a cuNDArray, then interleave the two planes so each sample becomes one floatd2
+    hoNDArray<float> tmp(&dims_2d, host_traj_dcw_shifted->get_data_ptr());
+    cuNDArray<float> traj_planes(&tmp);
+    boost::shared_ptr< cuNDArray<float> > _traj = permute( &traj_planes, &order );
+    
+    cuNDArray<floatd2> tmp2(&dims_1d, (floatd2*)_traj->get_data_ptr());
+    
+    *traj = tmp2;
+    
+    // Present both outputs as samples_per_frame x num_frames
+    dims_2d.clear();
+
+    dims_2d.push_back(samples_per_frame);
+    dims_2d.push_back(num_frames);
+
+    dcw->reshape(&dims_2d);
+    traj->reshape(&dims_2d);
+  }
+
+  template<class T> GadgetContainerMessage< hoNDArray<T> >*
+  gpuGenericSensePrepGadget::duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array )
+  { // Returns a newly allocated message (owned by the caller) holding a copy of the array wrapped by 'array'.
+    GadgetContainerMessage< hoNDArray<T> > *duplicate = new GadgetContainerMessage< hoNDArray<T> >();
+    *duplicate->getObjectPtr() = *array->getObjectPtr(); // copy-assign the wrapped hoNDArray
+    return duplicate;
+  }
+
+  void gpuGenericSensePrepGadget::reconfigure(unsigned int set, unsigned int slice)
+  {
+    const unsigned int idx = set*slices_+slice; // linear index into the per set/slice arrays
+
+    GADGET_DEBUG2("\nReconfiguring:\n#readouts/frame:%d\n#frames/rotation: %d\n#rotations/reconstruction:%d\n", 
+		  readouts_per_frame_[idx], frames_per_rotation_[idx], rotations_per_reconstruction_);
+
+    // A parameter value of zero selects a fallback derived from the current frame/rotation counts
+    long &frames_in_buffer = buffer_frames_per_rotation_[idx];
+    frames_in_buffer = get_int_value("buffer_frames_per_rotation");
+    if( frames_in_buffer == 0 )
+      frames_in_buffer = frames_per_rotation_[idx];
+
+    if( get_int_value("buffer_length_in_rotations") == 0 )
+      buffer_length_in_rotations_ = std::max(1L, rotations_per_reconstruction_);
+
+    // Pick the accumulation buffer that is in use for this set/slice
+    cuSenseBuffer<float,2> *acc_buffer;
+    if( buffer_using_solver_ ) acc_buffer = &acc_buffer_cg_[idx]; else acc_buffer = &acc_buffer_[idx];
+
+    // Is this general enough to detect golden ratio type trajectories?
+    const bool single_frame = ( frames_in_buffer == 1 );
+    const long outer_count = single_frame ? 1 : buffer_length_in_rotations_;
+    const long inner_count = single_frame ? buffer_length_in_rotations_ : frames_in_buffer;
+
+    acc_buffer->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, 
+		       kernel_width_, num_coils_[idx], outer_count, inner_count );
+
+    reconfigure_[idx] = false;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuGenericSensePrepGadget)
+}
diff --git a/gadgets/sense/gpuGenericSensePrepGadget.h b/gadgets/sense/gpuGenericSensePrepGadget.h
new file mode 100644
index 0000000..28a8401
--- /dev/null
+++ b/gadgets/sense/gpuGenericSensePrepGadget.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include "gadgetron_gpusense_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+#include "cuCgPreconditioner.h"
+#include "cuSenseBufferCg.h"
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUSENSE gpuGenericSensePrepGadget :
+    public Gadget3< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> >, hoNDArray<float> >
+  {
+    // Gadget consuming (acquisition header, data, trajectory/dcw) triplets; see process() below.
+  public:
+    GADGET_DECLARE(gpuGenericSensePrepGadget);
+
+    gpuGenericSensePrepGadget();
+    virtual ~gpuGenericSensePrepGadget();
+
+  protected:
+    // process_config() receives the configuration message, process() one incoming readout.
+    virtual int process_config(ACE_Message_Block *mb);
+
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
+			GadgetContainerMessage< hoNDArray<float> > *m3 );                // traj/dcw
+
+  private:
+    // Returns true iff the first three elements of 'in1' and 'in2' compare equal.
+    inline bool vec_equal(float *in1, float *in2) {
+      for (unsigned int i = 0; i < 3; i++) {
+	if (in1[i] != in2[i]) return false;
+      }
+      return true;
+    }
+    // Per set/slice flags; reconfigure() clears the flag of the set/slice it has set up.
+    boost::shared_array<bool> reconfigure_;
+    virtual void reconfigure(unsigned int set, unsigned int slice);
+    // Returns a newly allocated message holding a copy of the array wrapped by 'array'.
+    template<class T> GadgetContainerMessage< hoNDArray<T> >* 
+      duplicate_array( GadgetContainerMessage< hoNDArray<T> > *array );
+    // Copies the data readouts buffered in 'queue' into one host array.
+    boost::shared_ptr< hoNDArray<float_complext> > 
+      extract_samples_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+				   bool sliding_window, unsigned int set, unsigned int slice );
+    // Copies the trajectory/dcw readouts buffered in 'queue' into one host array.
+    boost::shared_ptr< hoNDArray<float> > 
+      extract_trajectory_from_queue ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, 
+				      bool sliding_window, unsigned int set, unsigned int slice );
+    // As above, but split into a floatd2 trajectory ('traj') and a float array ('dcw'); both are output arguments.
+    void extract_trajectory_and_dcw_from_queue
+      ( ACE_Message_Queue<ACE_MT_SYNCH> *queue, bool sliding_window, unsigned int set, unsigned int slice, 
+	unsigned int samples_per_frame, unsigned int num_frames,
+	cuNDArray<floatd2> *traj, cuNDArray<float> *dcw );
+    
+    int slices_;
+    int sets_;
+    int device_number_;
+    long samples_per_readout_;
+
+    boost::shared_array<long> image_counter_;
+    boost::shared_array<long> readouts_per_frame_;  // for an undersampled frame
+    boost::shared_array<long> frames_per_rotation_; // representing a fully sampled frame
+
+    // The number of rotations to batch per reconstruction. 
+    // Set to '0' to reconstruct frames individually.
+    long rotations_per_reconstruction_; 
+
+    // The number of buffer cycles
+    long buffer_length_in_rotations_; 
+
+    boost::shared_array<long> buffer_frames_per_rotation_; // the number of buffer subcycles
+
+    // Internal book-keeping
+    boost::shared_array<long> previous_readout_no_;
+    boost::shared_array<long> acceleration_factor_;
+    boost::shared_array<long> readout_counter_frame_;
+    boost::shared_array<long> readout_counter_global_;
+
+    long sliding_window_readouts_;
+    long sliding_window_rotations_;
+
+    float kernel_width_;
+    float oversampling_factor_;
+
+    boost::shared_array<unsigned int> num_coils_; // indexed by set*slices_+slice
+
+    boost::shared_array<float[3]> position_;
+    boost::shared_array<float[3]> read_dir_;
+    boost::shared_array<float[3]> phase_dir_;
+    boost::shared_array<float[3]> slice_dir_;
+
+    bool output_timing_;
+    bool buffer_using_solver_; // selects acc_buffer_cg_ over acc_buffer_ in reconfigure()
+
+    int propagate_csm_from_set_;
+    boost::shared_ptr< cuNDArray<float_complext> > csm_;
+
+    boost::shared_array<bool> buffer_update_needed_;
+
+    boost::shared_array< hoNDArray<float_complext> > csm_host_;
+    boost::shared_array< hoNDArray<float_complext> > reg_host_;
+    
+    boost::shared_array< cuSenseBuffer<float,2> > acc_buffer_;
+    boost::shared_array< cuSenseBufferCg<float,2> > acc_buffer_cg_;
+
+    std::vector<size_t> fov_;
+    std::vector<size_t> image_dimensions_;
+    std::vector<size_t> image_dimensions_recon_;
+    uint64d2 image_dimensions_recon_os_;
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > frame_readout_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_readout_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > frame_traj_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > recon_traj_queue_;
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > image_headers_queue_;
+  };
+}
diff --git a/gadgets/sense/gpuSbSenseGadget.cpp b/gadgets/sense/gpuSbSenseGadget.cpp
new file mode 100644
index 0000000..39e6d3e
--- /dev/null
+++ b/gadgets/sense/gpuSbSenseGadget.cpp
@@ -0,0 +1,426 @@
+#include "gpuSbSenseGadget.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "Gadgetron.h"
+#include "GadgetMRIHeaders.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+
+#include <boost/thread/mutex.hpp>
+
+namespace Gadgetron{
+
+#define max_number_of_gpus 10 // number of entries in the mutex table below
+  static boost::mutex _mutex[max_number_of_gpus]; // indexed by CUDA device number; locked around the solver when exclusive_access is set
+  
+  gpuSbSenseGadget::gpuSbSenseGadget()
+    : channels_(0)
+    , is_configured_(false)
+    , prepared_(false)
+    , frame_counter_(0)
+  {
+    // Default parameter values, registered in this order; process_config() reads them back
+    static const char* const defaults[][2] = {
+      {"deviceno", "0"}, {"setno", "0"}, {"sliceno", "0"},
+      {"number_of_sb_iterations", "20"}, {"number_of_cg_iterations", "10"}, {"cg_limit", "1e-6"},
+      {"oversampling_factor", "1.5"}, {"kernel_width", "5.5"},
+      {"mu", "1.0"}, {"lambda", "2.0"}, {"alpha", "0.5"},
+      {"exclusive_access", "false"}
+    };
+    const size_t num_defaults = sizeof(defaults)/sizeof(defaults[0]);
+    for( size_t i=0; i<num_defaults; i++ )
+      set_parameter(defaults[i][0], defaults[i][1]);
+
+    // All matrix sizes start out as zero
+    matrix_size_ = uint64d2(0,0);
+    matrix_size_os_ = uint64d2(0,0);
+    matrix_size_seq_ = uint64d2(0,0);
+  }
+
+  gpuSbSenseGadget::~gpuSbSenseGadget() {} // empty body: no explicit cleanup is performed here
+
+  int gpuSbSenseGadget::process_config( ACE_Message_Block* mb )
+  {
+    GADGET_DEBUG1("gpuSbSenseGadget::process_config\n");
+    // Selects the CUDA device, reads the gadget parameters and the ISMRMRD header in 'mb', and (once) sets up the solver.
+    device_number_ = get_int_value("deviceno");
+
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GADGET_DEBUG1( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (device_number_ >= number_of_devices) {
+      GADGET_DEBUG2("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    pass_on_undesired_data_ = get_bool_value("pass_on_undesired_data");
+    set_number_ = get_int_value("setno");
+    slice_number_ = get_int_value("sliceno");
+    number_of_sb_iterations_ = get_int_value("number_of_sb_iterations");
+    number_of_cg_iterations_ = get_int_value("number_of_cg_iterations");
+    cg_limit_ = get_double_value("cg_limit");
+    oversampling_factor_ = get_double_value("oversampling_factor");
+    kernel_width_ = get_double_value("kernel_width");
+    mu_ = get_double_value("mu");
+    lambda_ = get_double_value("lambda");
+    alpha_ = get_double_value("alpha");
+    rotations_to_discard_ = get_int_value("rotations_to_discard");
+    output_convergence_ = get_bool_value("output_convergence");
+    exclusive_access_ = get_bool_value("exclusive_access");
+
+    if( (rotations_to_discard_%2) == 1 ){
+      GADGET_DEBUG1("#rotations to discard must be even.\n");
+      return GADGET_FAIL;
+    }
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    // Exactly one encoding space is supported
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", e_seq.size());
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+    
+    //ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    //ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    matrix_size_seq_ = uint64d2( r_space.matrixSize().x(), r_space.matrixSize().y() );
+
+    if (!is_configured_) {
+
+      channels_ = cfg->acquisitionSystemInformation().present() ?
+	(cfg->acquisitionSystemInformation().get().receiverChannels().present() ? cfg->acquisitionSystemInformation().get().receiverChannels().get() : 1) : 1;
+     
+      // Allocate encoding operator for non-Cartesian Sense
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >( new cuNonCartesianSenseOperator<float,2>() );
+      E_->set_weight(mu_);
+
+      // Allocate preconditioner
+      D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+
+      Rx1_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+	( new cuPartialDerivativeOperator<float_complext,3>(0) );
+      Rx1_->set_weight( (1.0-alpha_)*lambda_ );
+
+      Ry1_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+	( new cuPartialDerivativeOperator<float_complext,3>(1) );
+      Ry1_->set_weight( (1.0-alpha_)*lambda_ );
+
+      Rz1_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+	( new cuPartialDerivativeOperator<float_complext,3>(2) );
+      Rz1_->set_weight( (1.0-alpha_)*lambda_ );
+
+      Rx2_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+	( new cuPartialDerivativeOperator<float_complext,3>(0) );
+      Rx2_->set_weight( alpha_*lambda_ );
+
+      Ry2_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+	( new cuPartialDerivativeOperator<float_complext,3>(1) );
+      Ry2_->set_weight( alpha_*lambda_ );
+
+      Rz2_ = boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> >
+	( new cuPartialDerivativeOperator<float_complext,3>(2) );
+      Rz2_->set_weight( alpha_*lambda_ );
+
+      // Setup split-Bregman solver
+      sb_.set_encoding_operator( E_ );
+            
+      sb_.set_max_outer_iterations(number_of_sb_iterations_);
+      sb_.set_max_inner_iterations(1);
+      sb_.set_output_mode( (output_convergence_) ? cuSbcCgSolver<float_complext>::OUTPUT_VERBOSE : cuSbcCgSolver<float_complext>::OUTPUT_SILENT );
+      
+      sb_.get_inner_solver()->set_max_iterations( number_of_cg_iterations_ );
+      sb_.get_inner_solver()->set_tc_tolerance( cg_limit_ );
+      sb_.get_inner_solver()->set_output_mode( (output_convergence_) ? cuCgSolver<float_complext>::OUTPUT_VERBOSE : cuCgSolver<float_complext>::OUTPUT_SILENT );
+      sb_.get_inner_solver()->set_preconditioner( D_ );
+
+      is_configured_ = true;
+    }
+
+    GADGET_DEBUG1("gpuSbSenseGadget::end of process_config\n");
+
+    return GADGET_OK;
+  }
+
+  int gpuSbSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<SenseJob> *m2)
+  {
+    // Runs the split Bregman solver on the Sense job in 'm2' and passes one image per (non-discarded) frame downstream.
+    // Jobs for another set/slice are forwarded untouched. Returns GADGET_OK on success and GADGET_FAIL on error.
+
+    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {      
+      // Not this gadget's set/slice: pass it downstream...
+      return this->next()->putq(m1);
+    }
+
+    //GADGET_DEBUG1("gpuSbSenseGadget::process\n");
+    //GPUTimer timer("gpuSbSenseGadget::process");
+
+    if (!is_configured_) {
+      GADGET_DEBUG1("\nData received before configuration complete\n");
+      return GADGET_FAIL;
+    }
+
+    SenseJob* j = m2->getObjectPtr();
+
+    // Let's first check that this job has the required data (the regularization image is dereferenced below as well)...
+    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
+      GADGET_DEBUG1("Received an incomplete Sense job\n");
+      return GADGET_FAIL;
+    }
+
+    const unsigned int samples = j->dat_host_->get_size(0);
+    const size_t num_coordinates = j->tra_host_->get_number_of_elements();
+    // Validate before dividing: an empty trajectory or empty data would cause a division by zero below
+    if( num_coordinates == 0 || samples == 0 || samples%num_coordinates ) {
+      GADGET_DEBUG2("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n", 
+		    samples, j->tra_host_->get_number_of_elements());
+      return GADGET_FAIL;
+    }
+    unsigned int rotations = samples / num_coordinates;
+    unsigned int frames = j->tra_host_->get_size(1)*rotations;
+
+    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
+    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
+    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));
+    
+    if( !prepared_){
+
+      // Take the reconstruction matrix size from the regulariaztion image. 
+      // It could be oversampled from the sequence specified size...
+      
+      matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );
+      
+      cudaDeviceProp deviceProp;
+      if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+	GADGET_DEBUG1( "\nError: unable to query device properties.\n" );
+	return GADGET_FAIL;
+      }
+
+      unsigned int warp_size = deviceProp.warpSize;
+
+      matrix_size_os_ =
+	uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+	       ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+      
+      GADGET_DEBUG2("Matrix size    : [%d,%d] \n", matrix_size_[0], matrix_size_[1]);
+      GADGET_DEBUG2("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]);
+
+      std::vector<size_t> image_dims = to_std_vector(matrix_size_);
+      image_dims.push_back(frames);
+      
+      E_->set_domain_dimensions(&image_dims);
+      E_->set_codomain_dimensions(device_samples->get_dimensions().get());
+            
+      reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims));
+      
+      // These operators need their domain/codomain set before being added to the solver
+      //
+
+      Rx1_->set_domain_dimensions(&image_dims);
+      Rx1_->set_codomain_dimensions(&image_dims);
+      
+      Ry1_->set_domain_dimensions(&image_dims);
+      Ry1_->set_codomain_dimensions(&image_dims);
+      
+      Rz1_->set_domain_dimensions(&image_dims);
+      Rz1_->set_codomain_dimensions(&image_dims);
+      
+      Rx2_->set_domain_dimensions(&image_dims);
+      Rx2_->set_codomain_dimensions(&image_dims);
+      
+      Ry2_->set_domain_dimensions(&image_dims);
+      Ry2_->set_codomain_dimensions(&image_dims);
+      
+      Rz2_->set_domain_dimensions(&image_dims);
+      Rz2_->set_codomain_dimensions(&image_dims);
+      
+      // Add "TV" regularization
+      // 
+      
+      if( alpha_<1.0 ){
+	sb_.add_regularization_group_operator( Rx1_ ); 
+	sb_.add_regularization_group_operator( Ry1_ ); 
+	if(frames>1)
+	  sb_.add_regularization_group_operator( Rz1_ ); 
+	sb_.add_group();
+      }
+      
+      // Add "PICCS" regularization
+      //
+
+      if( alpha_ > 0.0 ){
+	sb_.add_regularization_group_operator( Rx2_ ); 
+	sb_.add_regularization_group_operator( Ry2_ ); 
+	if(frames>1)
+	  sb_.add_regularization_group_operator( Rz2_ ); 
+	sb_.add_group(reg_image_);
+      }
+      
+      prepared_ = true;
+    }
+    
+    E_->set_dcw(dcw);
+    E_->set_csm(csm);    
+    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
+    E_->preprocess(traj.get());
+
+    // Expand the average image to the number of frames
+    //
+
+    {
+      cuNDArray<float_complext> tmp(*j->reg_host_);
+      *reg_image_ = *expand( &tmp, frames );
+    }
+
+    // Define preconditioning weights
+    //
+
+    boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2);
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights );
+    precon_weights.reset();
+    
+    // Invoke solver
+    //
+
+    boost::shared_ptr< cuNDArray<float_complext> > sbresult;
+    {
+      GADGET_DEBUG1("Running split Bregman solver\n");
+      GPUTimer timer("Running split Bregman solver");
+
+      // Optionally, allow exclusive (per device) access to the solver
+      // This may not matter much in terms of speed, but it can in terms of memory consumption.
+      // The guard releases the mutex when this scope is left, also if the solver throws.
+
+      boost::mutex::scoped_lock guard(_mutex[device_number_%max_number_of_gpus], boost::defer_lock);
+      if( exclusive_access_ )
+	guard.lock();
+
+      sbresult = sb_.solve(device_samples.get());
+
+      // 'guard' (if locked) is released here, at the end of the scope
+    }
+
+    if (!sbresult.get()) {
+      GADGET_DEBUG1("\nSplit Bregman solver failed\n");
+      return GADGET_FAIL;
+    }
+
+    // Provide some info about the scaling between the regularization and reconstruction.
+    // If it is not close to one, PICCS does not work optimally...
+    // 
+    if( alpha_ > 0.0 ){
+      cuNDArray<float_complext> gpureg(j->reg_host_.get());
+      boost::shared_ptr< cuNDArray<float_complext> > gpurec = sum(sbresult.get(),2);
+      *gpurec /= float(sbresult->get_size(2));
+      float scale = abs(dot(gpurec.get(), gpurec.get())/dot(gpurec.get(),&gpureg));
+      GADGET_DEBUG2("Scaling factor between regularization and reconstruction is %f.\n", scale);
+    }
+
+    
+    /*
+    static int counter = 0;
+    char filename[256];
+    sprintf((char*)filename, "recon_sb_%d.cplx", counter);
+    write_nd_array<float_complext>( sbresult->to_host().get(), filename );
+    counter++; */
+
+    // If the recon matrix size exceeds the sequence matrix size then crop
+    if( matrix_size_seq_ != matrix_size_ )
+      sbresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, sbresult.get() );
+        
+    // Now pass on the reconstructed images
+    //
+
+    unsigned int frames_per_rotation = frames/rotations;
+
+    if( rotations == 1 ){ // this is the case for golden ratio
+      rotations = frames;
+      frames_per_rotation = 1;
+    }
+
+    for( unsigned int frame=0; frame<frames; frame++ ){
+      
+      unsigned int rotation_idx = frame/frames_per_rotation;
+
+      // Check if we should discard this frame
+      if( rotation_idx < (rotations_to_discard_>>1) || rotation_idx >= rotations-(rotations_to_discard_>>1) )
+	continue;
+
+      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm = 
+	new GadgetContainerMessage< hoNDArray< std::complex<float> > >();     
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *m = 
+	new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+
+      *m->getObjectPtr() = j->image_headers_[frame];
+      m->getObjectPtr()->matrix_size[0] = matrix_size_seq_[0];
+      m->getObjectPtr()->matrix_size[1] = matrix_size_seq_[1];      
+      m->cont(cm);
+      
+      std::vector<size_t> img_dims(2);
+      img_dims[0] = matrix_size_seq_[0];
+      img_dims[1] = matrix_size_seq_[1];
+
+      cm->getObjectPtr()->create(&img_dims);
+
+      size_t data_length = prod(matrix_size_seq_);
+
+      cudaMemcpy(cm->getObjectPtr()->get_data_ptr(),
+		 sbresult->get_data_ptr()+frame*data_length,
+		 data_length*sizeof(std::complex<float>),
+		 cudaMemcpyDeviceToHost);
+
+      cudaError_t err = cudaGetLastError();
+      if( err != cudaSuccess ){
+	GADGET_DEBUG2("\nUnable to copy result from device to host: %s", cudaGetErrorString(err));
+	m->release();
+	return GADGET_FAIL;
+      }
+
+      m->getObjectPtr()->matrix_size[0] = img_dims[0];
+      m->getObjectPtr()->matrix_size[1] = img_dims[1];
+      m->getObjectPtr()->matrix_size[2] = 1;
+      m->getObjectPtr()->channels       = 1;
+      m->getObjectPtr()->image_index    = frame_counter_ + frame;
+
+      if (this->next()->putq(m) < 0) {
+	GADGET_DEBUG1("\nFailed to result image on to Q\n");
+	m->release();
+	return GADGET_FAIL;
+      }
+    }
+
+    frame_counter_ += frames;
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuSbSenseGadget)
+}
+
diff --git a/gadgets/sense/gpuSbSenseGadget.h b/gadgets/sense/gpuSbSenseGadget.h
new file mode 100644
index 0000000..9355692
--- /dev/null
+++ b/gadgets/sense/gpuSbSenseGadget.h
@@ -0,0 +1,85 @@
+#ifndef gpuSbSenseGadget_H
+#define gpuSbSenseGadget_H
+#pragma once
+
+#include <ace/Synch.h>
+#include <ace/Mutex.h>
+
+#include "gadgetron_gpusense_export.h"
+#include "Gadget.h"
+#include "SenseJob.h"
+#include "GadgetMRIHeaders.h"
+#include "cuSbcCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuNFFT.h"
+#include "cuImageOperator.h"
+#include "ismrmrd.h"
+
+#include <complex>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_GPUSENSE gpuSbSenseGadget : public Gadget2< ISMRMRD::ImageHeader, SenseJob >
+  {
+    // Gadget that reconstructs incoming Sense jobs for one set/slice with a split Bregman solver.
+  public:
+    GADGET_DECLARE(gpuSbSenseGadget);
+
+    gpuSbSenseGadget();
+    virtual ~gpuSbSenseGadget();
+
+  protected:
+    // process() handles one (image header, Sense job) pair; process_config() receives the configuration message.
+    virtual int process( GadgetContainerMessage< ISMRMRD::ImageHeader >* m1, GadgetContainerMessage< SenseJob > * m2 );
+    virtual int process_config( ACE_Message_Block* mb );
+
+    int channels_;      // receiver channel count read from the header in process_config() (1 if absent)
+    int device_number_; // CUDA device ("deviceno" parameter)
+    int set_number_;    // "setno" parameter: jobs for other sets are passed on untouched
+    int slice_number_;  // "sliceno" parameter: jobs for other slices are passed on untouched
+
+    uint64d2 matrix_size_;     // reconstruction matrix size, taken from the regularization image
+    uint64d2 matrix_size_os_;  // oversampled matrix size, rounded up to a multiple of the warp size
+    uint64d2 matrix_size_seq_; // recon space matrix size from the header; results are cropped to it
+
+    unsigned int number_of_cg_iterations_;
+    unsigned int number_of_sb_iterations_;
+    double cg_limit_;
+    double oversampling_factor_;
+    double kernel_width_;
+    double mu_;
+    double lambda_;
+    double alpha_;
+    unsigned int rotations_to_discard_; // must be even (checked in process_config())
+
+    bool output_convergence_;
+    bool exclusive_access_;
+    bool is_configured_;
+    bool prepared_;
+
+    // Define constrained split Bregman solver
+    cuSbcCgSolver<float_complext> sb_;
+
+    // Define non-Cartesian Sense encoding operator
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;
+
+    // Define preconditioner
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;
+
+    // Average image for regularization
+    boost::shared_ptr< cuNDArray<float_complext> > reg_image_;
+
+    // Define regularization operators
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rx1_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rx2_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Ry1_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Ry2_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rz1_;
+    boost::shared_ptr< cuPartialDerivativeOperator<float_complext,3> > Rz2_;
+	
+    int frame_counter_; // frames processed so far; offsets image_index of outgoing images
+  };
+}
+#endif //gpuSbSenseGadget
diff --git a/gadgets/spiral/CMakeLists.txt b/gadgets/spiral/CMakeLists.txt
new file mode 100644
index 0000000..32bd768
--- /dev/null
+++ b/gadgets/spiral/CMakeLists.txt
@@ -0,0 +1,39 @@
+if (WIN32)  # NOTE(review): define presumably consumed by gadgetron_spiral_export.h - confirm
+  add_definitions(-D__BUILD_GADGETRON_SPIRAL__)
+endif (WIN32)
+
+find_package(Ismrmrd REQUIRED)
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+
+include_directories(  # search order is significant; keep it as listed
+  ${CMAKE_SOURCE_DIR}/gadgets/mri_core
+  ${CMAKE_SOURCE_DIR}/gadgets/sense
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${ISMRMRD_XSD_INCLUDE_DIR}
+  ${CUDA_INCLUDE_DIRS}
+  )
+
+add_library(gadgetron_spiral SHARED  # shared library holding the spiral gadgets
+  vds.cpp
+  gpuSpiralSensePrepGadget.cpp
+  SpiralToGenericGadget.cpp
+  ${ISMRMRD_XSD_SOURCE})
+
+target_link_libraries(gadgetron_spiral
+  cpucore gpucore gpunfft gpusolvers gpuoperators gpuparallelmri
+  ${ISMRMRD_LIBRARIES} ${XERCESC_LIBRARIES} ${FFTW3_LIBRARIES} ${CUDA_LIBRARIES}
+  optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY}
+  )
+
+install(TARGETS gadgetron_spiral DESTINATION lib)
+install(FILES vds.h DESTINATION include)
+
+add_subdirectory(config)  # stream configuration XML files
diff --git a/gadgets/spiral/SpiralToGenericGadget.cpp b/gadgets/spiral/SpiralToGenericGadget.cpp
new file mode 100644
index 0000000..831f127
--- /dev/null
+++ b/gadgets/spiral/SpiralToGenericGadget.cpp
@@ -0,0 +1,225 @@
+#include "SpiralToGenericGadget.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "vds.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace Gadgetron{
+
+  // Zero every scalar member, in declaration order, so that none holds an
+  // indeterminate value before process_config() assigns the real settings.
+  SpiralToGenericGadget::SpiralToGenericGadget()
+    : samples_to_skip_start_(0), samples_to_skip_end_(0), samples_per_interleave_(0)
+    , interleaves_(0), Tsamp_ns_(0), Nints_(0), acceleration_factor_(0)
+    , gmax_(0.0), smax_(0.0), krmax_(0.0), fov_(0.0), prepared_(false)
+  {}
+
+  SpiralToGenericGadget::~SpiralToGenericGadget() {}
+
+  int SpiralToGenericGadget::process_config(ACE_Message_Block* mb)
+  {
+    // Parse the ISMRMRD XML header in mb and cache the "HargreavesVDS2000" spiral parameters.
+    // Returns GADGET_OK, or GADGET_FAIL when the header or a required parameter is unusable.
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    if( cfg.get() == 0x0 ){
+      GADGET_DEBUG1("Unable to parse Ismrmrd header\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size())); // %d consumes an int
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    //ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    //
+    // Setup the spiral trajectory
+    //
+
+    if (!(*e_seq.begin()).trajectoryDescription().present()) {
+      GADGET_DEBUG1("Trajectory description needed to calculate trajectory\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::trajectoryDescriptionType traj_desc = (*e_seq.begin()).trajectoryDescription().get();
+
+    if (std::strcmp(traj_desc.identifier().c_str(), "HargreavesVDS2000")) {
+      GADGET_DEBUG1("Expected trajectory description identifier 'HargreavesVDS2000', not found.\n");
+      return GADGET_FAIL;
+    }
+
+    long interleaves = -1;        // negative means "not supplied"; all seven are validated below
+    long fov_coefficients = -1;
+    long sampling_time_ns = -1;
+    double max_grad = -1.0;
+    double max_slew = -1.0;
+    double fov_coeff = -1.0;
+    double kr_max = -1.0;
+
+    for (ISMRMRD::trajectoryDescriptionType::userParameterLong_sequence::iterator i (traj_desc.userParameterLong().begin ()); i != traj_desc.userParameterLong().end(); ++i) {
+      if (std::strcmp(i->name().c_str(),"interleaves") == 0) {
+        interleaves = i->value();
+      } else if (std::strcmp(i->name().c_str(),"fov_coefficients") == 0) {
+        fov_coefficients = i->value();
+      } else if (std::strcmp(i->name().c_str(),"SamplingTime_ns") == 0) {
+        sampling_time_ns = i->value();
+      } else {
+        GADGET_DEBUG2("WARNING: unused trajectory parameter %s found\n", i->name().c_str());
+      }
+    }
+
+    for (ISMRMRD::trajectoryDescriptionType::userParameterDouble_sequence::iterator i (traj_desc.userParameterDouble().begin ()); i != traj_desc.userParameterDouble().end(); ++i) {
+      if (std::strcmp(i->name().c_str(),"MaxGradient_G_per_cm") == 0) {
+        max_grad = i->value();
+      } else if (std::strcmp(i->name().c_str(),"MaxSlewRate_G_per_cm_per_s") == 0) {
+        max_slew = i->value();
+      } else if (std::strcmp(i->name().c_str(),"FOVCoeff_1_cm") == 0) {
+        fov_coeff = i->value();
+      } else if (std::strcmp(i->name().c_str(),"krmax_per_cm") == 0) {
+        kr_max= i->value();
+      } else {
+        GADGET_DEBUG2("WARNING: unused trajectory parameter %s found\n", i->name().c_str());
+      }
+    }
+
+    if ((interleaves < 0) || (fov_coefficients < 0) || (sampling_time_ns < 0) || (max_grad < 0) || (max_slew < 0) || (fov_coeff < 0) || (kr_max < 0)) {
+      GADGET_DEBUG1("Appropriate parameters for calculating spiral trajectory not found in XML configuration\n");
+      return GADGET_FAIL;
+    }
+
+    Tsamp_ns_ = sampling_time_ns;
+    Nints_ = interleaves;
+    interleaves_ = static_cast<int>(Nints_);
+
+    gmax_ = max_grad;
+    smax_ = max_slew;
+    krmax_ = kr_max;
+    fov_ = fov_coeff;
+
+    samples_to_skip_start_  =  0; //n.get<int>(std::string("samplestoskipstart.value"))[0];
+    samples_to_skip_end_    = -1; //n.get<int>(std::string("samplestoskipend.value"))[0];
+
+    GADGET_DEBUG2("smax:                    %f\n", smax_);
+    GADGET_DEBUG2("gmax:                    %f\n", gmax_);
+    GADGET_DEBUG2("Tsamp_ns:                %d\n", static_cast<int>(Tsamp_ns_)); // long member, %d consumes an int
+    GADGET_DEBUG2("Nints:                   %d\n", interleaves_);                // int copy of Nints_
+    GADGET_DEBUG2("fov:                     %f\n", fov_);
+    GADGET_DEBUG2("krmax:                   %f\n", krmax_);
+    GADGET_DEBUG2("samples_to_skip_start_ : %d\n", samples_to_skip_start_);
+    GADGET_DEBUG2("samples_to_skip_end_   : %d\n", samples_to_skip_end_);
+
+    return GADGET_OK;
+  }
+
+  // Attaches the precomputed spiral trajectory/weights for m1's interleave behind m2 and forwards m1.
+  int SpiralToGenericGadget::process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+                                     GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust, but just in case...
+    //
+
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    // Compute hoNDArray of trajectory and weights at first pass
+    // (three floats per sample: -x/2, -y/2, weight)
+
+    if (!prepared_) {
+
+      int     nfov   = 1;         /*  number of fov coefficients.             */
+      int     ngmax  = 1e5;       /*  maximum number of gradient samples      */
+      double  *xgrad;             /*  x-component of gradient.                */
+      double  *ygrad;             /*  y-component of gradient.                */
+      double  *x_trajectory;
+      double  *y_trajectory;
+      double  *weighting;
+      int     ngrad;
+      double sample_time = (1.0*Tsamp_ns_) * 1e-9;
+
+      // Calculate gradients 
+      calc_vds(smax_,gmax_,sample_time,sample_time,Nints_,&fov_,nfov,krmax_,ngmax,&xgrad,&ygrad,&ngrad);
+
+      samples_per_interleave_ = std::min(ngrad,static_cast<int>(m1->getObjectPtr()->number_of_samples));
+      GADGET_DEBUG2("Using %d samples per interleave\n", samples_per_interleave_);
+
+      // Calculate the trajectory and weights
+      calc_traj(xgrad, ygrad, samples_per_interleave_, Nints_, sample_time, krmax_, &x_trajectory, &y_trajectory, &weighting);
+
+      std::vector<size_t> trajectory_dimensions;
+      trajectory_dimensions.push_back(3);
+      trajectory_dimensions.push_back(samples_per_interleave_*Nints_);
+
+      host_traj_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>(&trajectory_dimensions));
+
+      {
+        float* co_ptr = host_traj_->get_data_ptr();
+
+        for (int i = 0; i < (samples_per_interleave_*Nints_); i++) {
+          co_ptr[i*3+0] = -x_trajectory[i]/2;
+          co_ptr[i*3+1] = -y_trajectory[i]/2;
+          co_ptr[i*3+2] = weighting[i];
+        }
+      }
+
+      delete [] xgrad;
+      delete [] ygrad;
+      delete [] x_trajectory;
+      delete [] y_trajectory;
+      delete [] weighting;
+
+      prepared_ = true;
+    }
+
+    // Adjustments based on the incoming data: resolve the -1 placeholder set by process_config()
+    //
+
+    if (samples_to_skip_end_ == -1) {
+      samples_to_skip_end_ = m1->getObjectPtr()->number_of_samples-samples_per_interleave_;
+      GADGET_DEBUG2("Adjusting samples_to_skip_end_ = %d\n", samples_to_skip_end_);
+    }
+
+    // The interleave index comes from the incoming header; reject values that would
+    // read past the end of the precomputed table (Nints_ interleaves in host_traj_).
+
+    unsigned int interleave = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    if (interleave >= static_cast<unsigned int>(Nints_)) {
+      GADGET_DEBUG2("Interleave index %d is outside the computed trajectory\n", static_cast<int>(interleave));
+      return GADGET_FAIL;
+    }
+
+    // Stack-allocated view of this interleave's slice (assumes the (dims, ptr) ctor does not own the buffer)
+    std::vector<size_t> trajectory_dimensions;
+    trajectory_dimensions.push_back(3);
+    trajectory_dimensions.push_back(samples_per_interleave_);
+    
+    hoNDArray<float> traj_source
+      (&trajectory_dimensions, host_traj_->get_data_ptr()+3*samples_per_interleave_*interleave);
+    
+    // Copy the slice into a new message, chain it after m2, and pass m1 along
+
+    GadgetContainerMessage< hoNDArray<float> > *cont = new GadgetContainerMessage< hoNDArray<float> >();
+    *(cont->getObjectPtr()) = traj_source;
+    m2->cont(cont);
+    
+    if (this->next()->putq(m1) < 0) {
+      GADGET_DEBUG1("Failed to put job on queue.\n");
+      return GADGET_FAIL;
+    }
+    
+    return GADGET_OK;
+  }
+  
+  GADGET_FACTORY_DECLARE(SpiralToGenericGadget) // NOTE(review): macro defined outside this file; presumably emits the factory entry point for this class - confirm
+}
diff --git a/gadgets/spiral/SpiralToGenericGadget.h b/gadgets/spiral/SpiralToGenericGadget.h
new file mode 100644
index 0000000..27764ab
--- /dev/null
+++ b/gadgets/spiral/SpiralToGenericGadget.h
@@ -0,0 +1,50 @@
+#ifndef SpiralToGenericGadget_H
+#define SpiralToGenericGadget_H
+#pragma once
+
+#include "gadgetron_spiral_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_SPIRAL SpiralToGenericGadget : // gadget that appends a spiral trajectory/weight array to each readout
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+
+  public:
+    GADGET_DECLARE(SpiralToGenericGadget);
+
+    SpiralToGenericGadget();
+    virtual ~SpiralToGenericGadget();
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block* mb); // reads the spiral parameters from the ISMRMRD XML header
+    
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1, // per readout: chain trajectory after m2, forward m1
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+    
+  private:
+    int samples_to_skip_start_;   // set to 0 by process_config()
+    int samples_to_skip_end_;     // -1 after process_config(); process() replaces it with number_of_samples - samples_per_interleave_
+    int samples_per_interleave_;  // min(gradient samples from calc_vds, number_of_samples of the first readout)
+    int interleaves_;             // Nints_ narrowed to int
+    long    Tsamp_ns_;            // "SamplingTime_ns" trajectory parameter
+    long    Nints_;               // "interleaves" trajectory parameter
+    long    acceleration_factor_; // not read anywhere in SpiralToGenericGadget.cpp
+    double  gmax_;                // "MaxGradient_G_per_cm" trajectory parameter
+    double  smax_;                // "MaxSlewRate_G_per_cm_per_s" trajectory parameter
+    double  krmax_;               // "krmax_per_cm" trajectory parameter
+    double  fov_;                 // "FOVCoeff_1_cm" trajectory parameter
+    bool prepared_;               // true once host_traj_ has been computed on the first non-noise readout
+    
+    boost::shared_ptr< hoNDArray<float> > host_traj_; // 3 x (samples_per_interleave_*Nints_): -x/2, -y/2, weight per sample
+  };
+}
+#endif //SpiralToGenericGadget_H
diff --git a/gadgets/spiral/config/CMakeLists.txt b/gadgets/spiral/config/CMakeLists.txt
new file mode 100644
index 0000000..5c3861f
--- /dev/null
+++ b/gadgets/spiral/config/CMakeLists.txt
@@ -0,0 +1,16 @@
+if (ARMADILLO_FOUND)  # these stream configurations are installed only when Armadillo was found
+  install (FILES 
+    spiral_flow_gpusense_cg.xml 
+    spiral_flow_gpusense_sb.xml 
+    spiral_flow_generic_gpusense_cg.xml 
+    spiral_flow_generic_gpusense_sb.xml 
+    spiral_interactive.xml 
+    DESTINATION config)
+else (ARMADILLO_FOUND)  # plain else: report exactly when the install above is skipped
+  MESSAGE("Armadillo not found, only unoptimized spiral config files will be available")
+endif (ARMADILLO_FOUND)
+
+install (FILES 
+  spiral_flow_gpusense_cg_unoptimized.xml 
+  spiral_flow_gpusense_sb_unoptimized.xml 
+  DESTINATION config)
diff --git a/gadgets/spiral/config/spiral_flow_generic_gpusense_cg.xml b/gadgets/spiral/config/spiral_flow_generic_gpusense_cg.xml
new file mode 100644
index 0000000..7a5ae1b
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_generic_gpusense_cg.xml
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>SpiralToGenericGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>SpiralToGenericGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>16</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.1</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_generic_gpusense_sb.xml b/gadgets/spiral/config/spiral_flow_generic_gpusense_sb.xml
new file mode 100644
index 0000000..2313eef
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_generic_gpusense_sb.xml
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>SpiralToGenericGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>SpiralToGenericGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>gpuGenericSensePrepGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuGenericSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>rotations_per_reconstruction</name><value>16</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_using_solver</name><value>true</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>0.05</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+    <property><name>exclusive_access</name><value>true</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>20</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>0.05</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+    <property><name>exclusive_access</name><value>true</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_cg.xml b/gadgets/spiral/config/spiral_flow_gpusense_cg.xml
new file mode 100644
index 0000000..18bbb9e
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_cg.xml
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_cg_ecg.xml b/gadgets/spiral/config/spiral_flow_gpusense_cg_ecg.xml
new file mode 100644
index 0000000..09dd515
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_cg_ecg.xml
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PhysioInterpolation</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PhysioInterpolationGadget</classname>
+    <property><name>phases</name><value>30</value></property>
+  </gadget>
+
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_cg_unoptimized.xml b/gadgets/spiral/config/spiral_flow_gpusense_cg_unoptimized.xml
new file mode 100644
index 0000000..b174cc2
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_cg_unoptimized.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+  
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget_unoptimized</classname>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+  </gadget>
+  
+    <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_sb.xml b/gadgets/spiral/config/spiral_flow_gpusense_sb.xml
new file mode 100644
index 0000000..61e63bf
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_sb.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>16</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+    <property><name>propagate_csm_from_set</name><value>0</value></property>
+    <property><name>buffer_using_solver</name><value>true</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>5.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+    <property><name>reconstruction_os_factor_x</name><value>1.5</value></property>
+    <property><name>reconstruction_os_factor_y</name><value>1.5</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+    <property><name>output_convergence</name><value>true</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+  
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_flow_gpusense_sb_unoptimized.xml b/gadgets/spiral/config/spiral_flow_gpusense_sb_unoptimized.xml
new file mode 100644
index 0000000..cbcad54
--- /dev/null
+++ b/gadgets/spiral/config/spiral_flow_gpusense_sb_unoptimized.xml
@@ -0,0 +1,129 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget_unoptimized</classname>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>use_multiframe_grouping</name><value>true</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSbSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuSbSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_cg_iterations</name> <value>10</value></property>
+    <property><name>number_of_sb_iterations</name> <value>10</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>5.5</value></property>
+    <property><name>mu</name>                      <value>1.0</value></property>
+    <property><name>lambda</name>                  <value>2.0</value></property>
+    <property><name>alpha</name>                   <value>0.5</value></property>
+  </gadget>
+
+  <gadget>
+    <name>PhaseSubtraction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FlowPhaseSubtractionGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>MaxwellCorrection</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>MaxwellCorrectionGadget</classname>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+    <property><name>extract_mask</name><value>9</value></property>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/config/spiral_interactive.xml b/gadgets/spiral/config/spiral_interactive.xml
new file mode 100644
index 0000000..2c138fd
--- /dev/null
+++ b/gadgets/spiral/config/spiral_interactive.xml
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+			      xmlns="http://gadgetron.sf.net/gadgetron"
+			      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  
+  <reader>
+    <slot>1008</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+  </reader>
+  
+  <writer>
+    <slot>1004</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterCPLX</classname>
+  </writer>
+  <writer>
+    <slot>1005</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterFLOAT</classname>
+  </writer>
+  <writer>
+    <slot>1006</slot>
+    <dll>gadgetron_mricore</dll>
+    <classname>MRIImageWriterUSHORT</classname>
+  </writer>
+
+  <gadget>
+    <name>NoiseAdjust</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>NoiseAdjustGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>PCA</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>PCACoilGadget</classname>
+  </gadget>
+  
+  <gadget>
+    <name>CoilReduction</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>CoilReductionGadget</classname>
+    <property><name>coils_out</name><value>8</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuSpiralSensePrepGadget</name>
+    <dll>gadgetron_spiral</dll>
+    <classname>gpuSpiralSensePrepGadget</classname>
+    <property><name>deviceno</name><value>0</value></property>
+    <property><name>buffer_convolution_kernel_width</name><value>4.5</value></property>
+    <property><name>buffer_convolution_oversampling_factor</name><value>1.25</value></property>
+  </gadget>
+  
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>0</value></property>
+    <property><name>number_of_iterations</name>    <value>5</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>4.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>gpuCgSenseGadget</name>
+    <dll>gadgetron_gpusense</dll>
+    <classname>gpuCgSenseGadget</classname>
+    <property><name>pass_on_undesired_data</name>  <value>true</value></property>
+    <property><name>deviceno</name>                <value>0</value></property>
+    <property><name>setno</name>                   <value>1</value></property>
+    <property><name>number_of_iterations</name>    <value>5</value></property>
+    <property><name>cg_limit</name>                <value>1e-6</value></property>
+    <property><name>oversampling_factor</name>     <value>1.25</value></property>
+    <property><name>kernel_width</name>            <value>4.5</value></property>
+    <property><name>kappa</name>                   <value>0.3</value></property>
+  </gadget>
+
+  <gadget>
+    <name>Extract</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ExtractGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageWrite</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageWriterGadgetFLOAT</classname>
+      </gadget>
+  -->    
+  
+  <gadget>
+    <name>AutoScaleGadget</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>AutoScaleGadget</classname>
+  </gadget> 
+
+  <gadget>
+    <name>FloatToShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>FloatToUShortGadget</classname>
+  </gadget>
+  
+  <!--
+      <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+      </gadget>
+  -->
+  
+  <gadget>
+    <name>ImageFinishShort</name>
+    <dll>gadgetron_mricore</dll>
+    <classname>ImageFinishGadgetUSHORT</classname>
+  </gadget>    
+
+</gadgetronStreamConfiguration>
diff --git a/gadgets/spiral/gadgetron_spiral_export.h b/gadgets/spiral/gadgetron_spiral_export.h
new file mode 100644
index 0000000..5052304
--- /dev/null
+++ b/gadgets/spiral/gadgetron_spiral_export.h
@@ -0,0 +1,11 @@
+#pragma once
+
+// EXPORTGADGETS_SPIRAL: dllexport when WIN32 and __BUILD_GADGETRON_SPIRAL__ are
+// both defined, dllimport when only WIN32 is defined, and empty otherwise.
+#if defined(WIN32) && defined(__BUILD_GADGETRON_SPIRAL__)
+#define EXPORTGADGETS_SPIRAL __declspec(dllexport)
+#elif defined(WIN32)
+#define EXPORTGADGETS_SPIRAL __declspec(dllimport)
+#else
+#define EXPORTGADGETS_SPIRAL
+#endif
diff --git a/gadgets/spiral/gpuSpiralSensePrepGadget.cpp b/gadgets/spiral/gpuSpiralSensePrepGadget.cpp
new file mode 100644
index 0000000..852670d
--- /dev/null
+++ b/gadgets/spiral/gpuSpiralSensePrepGadget.cpp
@@ -0,0 +1,700 @@
+#include "gpuSpiralSensePrepGadget.h"
+#include "SenseJob.h"
+#include "Gadgetron.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "vector_td_utilities.h"
+#include "hoNDArray_fileio.h"
+#include "vector_td.h"
+#include "vector_td_operators.h"
+#include "check_CUDA.h"
+#include "b1_map.h"
+#include "GPUTimer.h"
+#include "GadgetIsmrmrdReadWrite.h"
+#include "vds.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace Gadgetron{
+
+  // Zero-initializes the sample counters and state flags, then registers a
+  // default value for each optional gadget property read in process_config().
+  gpuSpiralSensePrepGadget::gpuSpiralSensePrepGadget()
+    : samples_to_skip_start_(0), samples_to_skip_end_(0), samples_per_interleave_(0)
+    , prepared_(false), use_multiframe_grouping_(false), acceleration_factor_(0)
+  {
+    GADGET_DEBUG1("Initializing Spiral\n");
+    // Property defaults (name, value as string). Presumably overridden by the
+    // <property> entries of the stream configuration -- TODO confirm ordering.
+    set_parameter("buffer_using_solver", "false");
+    set_parameter("propagate_csm_from_set", "-1");
+    set_parameter("buffer_convolution_kernel_width", "5.5");
+    set_parameter("buffer_convolution_oversampling_factor", "1.25");
+    set_parameter("reconstruction_os_factor_x", "1.0");
+    set_parameter("reconstruction_os_factor_y", "1.0");
+  }
+
+  gpuSpiralSensePrepGadget::~gpuSpiralSensePrepGadget() {} // Empty body: no explicit cleanup is done here.
+
+  // Configures the gadget from its properties and the ISMRMRD XML header held in
+  // 'mb': selects the CUDA device, reads the buffering options, derives the
+  // warp-size-aligned reconstruction matrix sizes, extracts the spiral trajectory
+  // parameters and allocates the per slice/set message queues.
+  // Returns GADGET_OK on success, GADGET_FAIL on any missing or invalid setting.
+  int gpuSpiralSensePrepGadget::process_config(ACE_Message_Block* mb)
+  {
+    int number_of_devices = 0;
+    if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query number of CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    if (number_of_devices == 0) {
+      GADGET_DEBUG1( "Error: No available CUDA devices.\n" );
+      return GADGET_FAIL;
+    }
+
+    device_number_ = get_int_value("deviceno");
+    // A too-large device index is wrapped onto the available devices.
+    if (device_number_ >= number_of_devices) {
+      GADGET_DEBUG2("Adjusting device number from %d to %d\n", device_number_,  (device_number_%number_of_devices));
+      device_number_ = (device_number_%number_of_devices);
+    }
+
+    if (cudaSetDevice(device_number_)!= cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to set CUDA device.\n" );
+      return GADGET_FAIL;
+    }
+
+    cudaDeviceProp deviceProp;
+    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
+      GADGET_DEBUG1( "Error: unable to query device properties.\n" );
+      return GADGET_FAIL;
+    }
+
+    unsigned int warp_size = deviceProp.warpSize;
+
+    propagate_csm_from_set_ = get_int_value("propagate_csm_from_set");
+
+    if( propagate_csm_from_set_ > 0 ){
+      GADGET_DEBUG2("Currently, only set 0 can propagate coil sensitivity maps. Set %d was specified.\n", propagate_csm_from_set_ );
+      return GADGET_FAIL;
+    }
+
+    if( propagate_csm_from_set_ >= 0 ){
+      GADGET_DEBUG2("Propagating csm from set %d to all sets\n", propagate_csm_from_set_ );
+    }
+
+    buffer_using_solver_ = get_bool_value("buffer_using_solver");
+    use_multiframe_grouping_ = get_bool_value("use_multiframe_grouping");
+
+    if( buffer_using_solver_ && !use_multiframe_grouping_ ){
+      GADGET_DEBUG1("Enabling 'buffer_using_solver' requires also enabling 'use_multiframe_grouping'.\n" );
+      return GADGET_FAIL;
+    }
+
+    // Start parsing the ISMRMRD XML header
+
+    boost::shared_ptr<ISMRMRD::ismrmrdHeader> cfg = parseIsmrmrdXMLHeader(std::string(mb->rd_ptr()));
+
+    if( cfg.get() == 0x0 ){
+      GADGET_DEBUG1("Unable to parse Ismrmrd header\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::ismrmrdHeader::encoding_sequence e_seq = cfg->encoding();
+
+    if (e_seq.size() != 1) {
+      GADGET_DEBUG2("Number of encoding spaces: %d\n", static_cast<int>(e_seq.size()));
+      GADGET_DEBUG1("This Gadget only supports one encoding space\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::encodingSpaceType e_space = (*e_seq.begin()).encodedSpace();
+    ISMRMRD::encodingSpaceType r_space = (*e_seq.begin()).reconSpace();
+    ISMRMRD::encodingLimitsType e_limits = (*e_seq.begin()).encodingLimits();
+
+    // Determine reconstruction matrix sizes
+
+    kernel_width_ = get_double_value("buffer_convolution_kernel_width");
+    oversampling_factor_ = get_double_value("buffer_convolution_oversampling_factor");
+    // Scale the encoded matrix size, then round up to a multiple of warp_size.
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize().x()*get_double_value("reconstruction_os_factor_x")))+warp_size-1)/warp_size)*warp_size);
+    image_dimensions_recon_.push_back(((static_cast<unsigned int>(std::ceil(e_space.matrixSize().y()*get_double_value("reconstruction_os_factor_y")))+warp_size-1)/warp_size)*warp_size);
+
+    image_dimensions_recon_os_ = uint64d2
+      (((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
+       ((static_cast<unsigned int>(std::ceil(image_dimensions_recon_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);
+
+    // In case the warp_size constraint kicked in
+    oversampling_factor_ = float(image_dimensions_recon_os_[0])/float(image_dimensions_recon_[0]);
+
+    // Setup the spiral trajectory
+
+    if (!(*e_seq.begin()).trajectoryDescription().present()) {
+      GADGET_DEBUG1("Trajectory description needed to calculate trajectory\n");
+      return GADGET_FAIL;
+    }
+
+    ISMRMRD::trajectoryDescriptionType traj_desc = (*e_seq.begin()).trajectoryDescription().get();
+
+    if (std::strcmp(traj_desc.identifier().c_str(), "HargreavesVDS2000")) {
+      GADGET_DEBUG1("Expected trajectory description identifier 'HargreavesVDS2000', not found.\n");
+      return GADGET_FAIL;
+    }
+
+    long interleaves = -1;
+    long fov_coefficients = -1;
+    long sampling_time_ns = -1;
+    double max_grad = -1.0;
+    double max_slew = -1.0;
+    double fov_coeff = -1.0;
+    double kr_max = -1.0;
+
+    for (ISMRMRD::trajectoryDescriptionType::userParameterLong_sequence::iterator i (traj_desc.userParameterLong().begin ()); i != traj_desc.userParameterLong().end(); ++i) {
+      if (std::strcmp(i->name().c_str(),"interleaves") == 0) {
+	interleaves = i->value();
+      } else if (std::strcmp(i->name().c_str(),"fov_coefficients") == 0) {
+	fov_coefficients = i->value();
+      } else if (std::strcmp(i->name().c_str(),"SamplingTime_ns") == 0) {
+	sampling_time_ns = i->value();
+      } else {
+	GADGET_DEBUG2("WARNING: unused trajectory parameter %s found\n", i->name().c_str());
+      }
+    }
+
+    for (ISMRMRD::trajectoryDescriptionType::userParameterDouble_sequence::iterator i (traj_desc.userParameterDouble().begin ()); i != traj_desc.userParameterDouble().end(); ++i) {
+      if (std::strcmp(i->name().c_str(),"MaxGradient_G_per_cm") == 0) {
+	max_grad = i->value();
+      } else if (std::strcmp(i->name().c_str(),"MaxSlewRate_G_per_cm_per_s") == 0) {
+	max_slew = i->value();
+      } else if (std::strcmp(i->name().c_str(),"FOVCoeff_1_cm") == 0) {
+	fov_coeff = i->value();
+      } else if (std::strcmp(i->name().c_str(),"krmax_per_cm") == 0) {
+	kr_max= i->value();
+      } else {
+	GADGET_DEBUG2("WARNING: unused trajectory parameter %s found\n", i->name().c_str());
+      }
+    }
+
+    if ((interleaves < 0) || (fov_coefficients < 0) || (sampling_time_ns < 0) || (max_grad < 0) || (max_slew < 0) || (fov_coeff < 0) || (kr_max < 0)) {
+      GADGET_DEBUG1("Appropriate parameters for calculating spiral trajectory not found in XML configuration\n");
+      return GADGET_FAIL;
+    }
+
+    Tsamp_ns_ = sampling_time_ns;
+    Nints_ = interleaves;
+    interleaves_ = static_cast<int>(Nints_);
+
+    gmax_ = max_grad;
+    smax_ = max_slew;
+    krmax_ = kr_max;
+    fov_ = fov_coeff;
+
+    samples_to_skip_start_  = 0; //n.get<int>(std::string("samplestoskipstart.value"))[0];
+    samples_to_skip_end_    = -1; //n.get<int>(std::string("samplestoskipend.value"))[0];
+
+    fov_vec_.push_back(r_space.fieldOfView_mm().x());
+    fov_vec_.push_back(r_space.fieldOfView_mm().y());
+    fov_vec_.push_back(r_space.fieldOfView_mm().z());
+
+    slices_ = e_limits.slice().present() ? e_limits.slice().get().maximum() + 1 : 1;
+    sets_ = e_limits.set().present() ? e_limits.set().get().maximum() + 1 : 1;
+
+    buffer_ = boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    image_headers_queue_ = 
+      boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> >(new ACE_Message_Queue<ACE_MT_SYNCH>[slices_*sets_]);
+
+    size_t bsize = sizeof(GadgetContainerMessage<ISMRMRD::ImageHeader>)*100*Nints_;
+
+    for( unsigned int i=0; i<slices_*sets_; i++ ){
+      image_headers_queue_[i].high_water_mark(bsize);
+      image_headers_queue_[i].low_water_mark(bsize);
+    }
+    // Log the resolved settings; integral values are cast to int to match '%d'.
+    GADGET_DEBUG2("smax:                    %f\n", smax_);
+    GADGET_DEBUG2("gmax:                    %f\n", gmax_);
+    GADGET_DEBUG2("Tsamp_ns:                %d\n", static_cast<int>(Tsamp_ns_));
+    GADGET_DEBUG2("Nints:                   %d\n", static_cast<int>(Nints_));
+    GADGET_DEBUG2("fov:                     %f\n", fov_);
+    GADGET_DEBUG2("krmax:                   %f\n", krmax_);
+    GADGET_DEBUG2("samples_to_skip_start_ : %d\n", samples_to_skip_start_);
+    GADGET_DEBUG2("samples_to_skip_end_   : %d\n", samples_to_skip_end_);
+    GADGET_DEBUG2("recon matrix_size_x    : %d\n", static_cast<int>(image_dimensions_recon_[0]));
+    GADGET_DEBUG2("recon matrix_size_y    : %d\n", static_cast<int>(image_dimensions_recon_[1]));
+
+    return GADGET_OK;
+  }
+
+  int gpuSpiralSensePrepGadget::
+  process(GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *m1,
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2)
+  {
+    // Noise should have been consumed by the noise adjust, but just in case...
+    //
+
+    bool is_noise = ISMRMRD::FlagBit(ISMRMRD::ACQ_IS_NOISE_MEASUREMENT).isSet(m1->getObjectPtr()->flags);
+    if (is_noise) {
+      m1->release();
+      return GADGET_OK;
+    }
+
+    if (!prepared_) {
+
+      int     nfov   = 1;         /*  number of fov coefficients.             */
+      int     ngmax  = 1e5;       /*  maximum number of gradient samples      */
+      double  *xgrad;             /*  x-component of gradient.                */
+      double  *ygrad;             /*  y-component of gradient.                */
+      double  *x_trajectory;
+      double  *y_trajectory;
+      double  *weighting;
+      int     ngrad;
+      //int     count;
+      double sample_time = (1.0*Tsamp_ns_) * 1e-9;
+
+      /*	call c-function here to calculate gradients */
+      calc_vds(smax_,gmax_,sample_time,sample_time,Nints_,&fov_,nfov,krmax_,ngmax,&xgrad,&ygrad,&ngrad);
+      samples_per_interleave_ = std::min(ngrad,static_cast<int>(m1->getObjectPtr()->number_of_samples));
+
+      GADGET_DEBUG2("Using %d samples per interleave\n", samples_per_interleave_);
+
+      /* Calculate the trajectory and weights */
+      calc_traj(xgrad, ygrad, samples_per_interleave_, Nints_, sample_time, krmax_, &x_trajectory, &y_trajectory, &weighting);
+
+      host_traj_ = boost::shared_ptr< hoNDArray<floatd2> >(new hoNDArray<floatd2>);
+      host_weights_ = boost::shared_ptr< hoNDArray<float> >(new hoNDArray<float>);
+
+      std::vector<size_t> trajectory_dimensions;
+      trajectory_dimensions.push_back(samples_per_interleave_*Nints_);
+
+      host_traj_->create(&trajectory_dimensions);
+      host_weights_->create(&trajectory_dimensions);
+
+      {
+	float* co_ptr = reinterpret_cast<float*>(host_traj_->get_data_ptr());
+	float* we_ptr =  reinterpret_cast<float*>(host_weights_->get_data_ptr());
+	
+	for (int i = 0; i < (samples_per_interleave_*Nints_); i++) {
+	  co_ptr[i*2]   = -x_trajectory[i]/2;
+	  co_ptr[i*2+1] = -y_trajectory[i]/2;
+	  we_ptr[i] = weighting[i];
+	}
+      }
+
+      delete [] xgrad;
+      delete [] ygrad;
+      delete [] x_trajectory;
+      delete [] y_trajectory;
+      delete [] weighting;
+
+      // Setup the NFFT plan
+      //
+
+      cuNDArray<floatd2> traj(*host_traj_);
+      dcw_buffer_ = boost::shared_ptr< cuNDArray<float> >( new cuNDArray<float>(*host_weights_) );
+	
+      nfft_plan_.setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, kernel_width_ );
+      nfft_plan_.preprocess(&traj, cuNFFT_plan<float,2>::NFFT_PREP_NC2C);
+
+      // Setup the non-Cartesian Sense encoding operator 
+      //
+      
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<float,2> >(new cuNonCartesianSenseOperator<float,2>);
+      E_->setup( from_std_vector<size_t,2>(image_dimensions_recon_), image_dimensions_recon_os_, kernel_width_ );
+      
+      // Setup cg solver if the csm/regularization image is to be based hereon
+      //
+
+      if( buffer_using_solver_ ){
+
+	E_->set_dcw(dcw_buffer_);
+
+	D_ = boost::shared_ptr< cuCgPreconditioner<float_complext> >( new cuCgPreconditioner<float_complext>() );
+	cg_.set_encoding_operator( E_ );
+	cg_.set_preconditioner( D_ );
+	cg_.set_max_iterations( 2 );
+	cg_.set_tc_tolerance( 1e-6 );
+	cg_.set_output_mode( cuCgSolver<float_complext>::OUTPUT_SILENT);
+      }
+
+      prepared_ = true;
+    }
+
+    // Allocate host data buffer if it is NULL
+    //
+
+    if (!host_data_buffer_.get()) {
+
+      std::vector<size_t> data_dimensions;
+      data_dimensions.push_back(samples_per_interleave_*interleaves_);
+      data_dimensions.push_back(m1->getObjectPtr()->active_channels);
+
+      host_data_buffer_ = boost::shared_array< hoNDArray<float_complext> >
+	(new hoNDArray<float_complext>[slices_*sets_]);
+      
+      if (!host_data_buffer_.get()) {
+	GADGET_DEBUG1("Unable to allocate array for host data buffer\n");
+	return GADGET_FAIL;
+      }
+
+      for (unsigned int i = 0; i < slices_*sets_; i++) {
+	host_data_buffer_[i].create(&data_dimensions);
+	host_data_buffer_[i].fill(0.0f);
+      }
+    }
+
+    // Allocate various counters if they are NULL
+    //
+
+    if( !image_counter_.get() ){
+      image_counter_ = boost::shared_array<long>(new long[slices_*sets_]);
+      for( unsigned int i=0; i<slices_*sets_; i++ )
+	image_counter_[i] = 0;
+    }
+
+    if( !interleaves_counter_singleframe_.get() ){
+      interleaves_counter_singleframe_ = boost::shared_array<long>(new long[slices_*sets_]);
+      for( unsigned int i=0; i<slices_*sets_; i++ )
+	interleaves_counter_singleframe_[i] = 0;
+    }
+
+    if( !interleaves_counter_multiframe_.get() ){
+      interleaves_counter_multiframe_ = boost::shared_array<long>(new long[slices_*sets_]);
+      for( unsigned int i=0; i<slices_*sets_; i++ )
+	interleaves_counter_multiframe_[i] = 0;
+    }
+
+    // Define some utility variables
+    //
+
+    unsigned int samples_to_copy = m1->getObjectPtr()->number_of_samples-samples_to_skip_end_;
+    unsigned int interleave = m1->getObjectPtr()->idx.kspace_encode_step_1;
+    unsigned int slice = m1->getObjectPtr()->idx.slice;
+    unsigned int set = m1->getObjectPtr()->idx.set;
+    unsigned int samples_per_channel =  host_data_buffer_[set*slices_+slice].get_size(0);
+
+    // Some book-keeping to keep track of the frame count
+    //
+
+    interleaves_counter_singleframe_[set*slices_+slice]++;
+    interleaves_counter_multiframe_[set*slices_+slice]++;
+
+    // Duplicate the profile to avoid double deletion in case problems are encountered below.
+    // Enque profile until all profiles for the reconstruction have been received.
+    //
+    
+    buffer_[set*slices_+slice].enqueue_tail(duplicate_profile(m1));
+    
+    // Copy profile into the accumulation buffer for csm/regularization estimation
+    //
+
+    ISMRMRD::AcquisitionHeader base_head = *m1->getObjectPtr();
+
+    if (samples_to_skip_end_ == -1) {
+      samples_to_skip_end_ = m1->getObjectPtr()->number_of_samples-samples_per_interleave_;
+      GADGET_DEBUG2("Adjusting samples_to_skip_end_ = %d\n", samples_to_skip_end_);
+    }
+
+    std::complex<float>* data_ptr = reinterpret_cast< std::complex<float>* >
+      (host_data_buffer_[set*slices_+slice].get_data_ptr());
+
+    std::complex<float>* profile_ptr = m2->getObjectPtr()->get_data_ptr();
+
+    for (unsigned int c = 0; c < m1->getObjectPtr()->active_channels; c++) {
+      memcpy(data_ptr+c*samples_per_channel+interleave*samples_to_copy,
+	     profile_ptr+c*m1->getObjectPtr()->number_of_samples, samples_to_copy*sizeof(std::complex<float>));
+    }
+
+    // Have we received sufficient data for a new frame?
+    //
+
+    bool is_last_scan_in_slice = 
+      ISMRMRD::FlagBit(ISMRMRD::ACQ_LAST_IN_SLICE).isSet(m1->getObjectPtr()->flags);
+
+    if (is_last_scan_in_slice) {
+
+      // This was the final profile of a frame
+      //
+
+      if( Nints_%interleaves_counter_singleframe_[set*slices_+slice] ){
+	GADGET_DEBUG1("Unexpected number of interleaves encountered in frame\n");
+	return GADGET_FAIL;
+      }
+
+      // Has the acceleration factor changed?
+      //
+
+      if( acceleration_factor_ != Nints_/interleaves_counter_singleframe_[set*slices_+slice] ){
+
+	GADGET_DEBUG1("Change of acceleration factor detected\n");
+	acceleration_factor_ =  Nints_/interleaves_counter_singleframe_[set*slices_+slice];
+
+	// The encoding operator needs to have its domain/codomain dimensions set accordingly
+	//
+	
+	if( buffer_using_solver_ ){
+
+	  std::vector<size_t> domain_dims = image_dimensions_recon_;
+	  
+	  std::vector<size_t> codomain_dims = *host_traj_->get_dimensions();
+	  codomain_dims.push_back(m1->getObjectPtr()->active_channels);
+	  
+	  E_->set_domain_dimensions(&domain_dims);
+	  E_->set_codomain_dimensions(&codomain_dims);
+
+	  cuNDArray<floatd2> traj(*host_traj_);
+	  E_->preprocess(&traj);
+	}
+      }
+
+      // Prepare an image header for this frame
+      //
+
+      GadgetContainerMessage<ISMRMRD::ImageHeader> *header = new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+      ISMRMRD::AcquisitionHeader *base_head = m1->getObjectPtr();
+
+      {
+	// Initialize header to all zeroes (there is a few fields we do not set yet)
+	ISMRMRD::ImageHeader tmp = {0};
+	*(header->getObjectPtr()) = tmp;
+      }
+
+      header->getObjectPtr()->version = base_head->version;
+
+      header->getObjectPtr()->matrix_size[0] = image_dimensions_recon_[0];
+      header->getObjectPtr()->matrix_size[1] = image_dimensions_recon_[1];
+      header->getObjectPtr()->matrix_size[2] = acceleration_factor_;
+
+      header->getObjectPtr()->field_of_view[0] = fov_vec_[0];
+      header->getObjectPtr()->field_of_view[1] = fov_vec_[1];
+      header->getObjectPtr()->field_of_view[2] = fov_vec_[2];
+
+      header->getObjectPtr()->channels = base_head->active_channels;
+      header->getObjectPtr()->slice = base_head->idx.slice;
+      header->getObjectPtr()->set = base_head->idx.set;
+
+      header->getObjectPtr()->acquisition_time_stamp = base_head->acquisition_time_stamp;
+      memcpy(header->getObjectPtr()->physiology_time_stamp, base_head->physiology_time_stamp, sizeof(uint32_t)*ISMRMRD_PHYS_STAMPS);
+
+      memcpy(header->getObjectPtr()->position, base_head->position, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->read_dir, base_head->read_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->phase_dir, base_head->phase_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->slice_dir, base_head->slice_dir, sizeof(float)*3);
+      memcpy(header->getObjectPtr()->patient_table_position, base_head->patient_table_position, sizeof(float)*3);
+
+      header->getObjectPtr()->image_data_type = ISMRMRD::DATA_COMPLEX_FLOAT;
+      header->getObjectPtr()->image_index = image_counter_[set*slices_+slice]++; 
+      header->getObjectPtr()->image_series_index = set*slices_+slice;
+
+      // Enque header until we are ready to assemble a Sense job
+      //
+
+      image_headers_queue_[set*slices_+slice].enqueue_tail(header);
+
+      // Check if it is time to reconstruct.
+      // I.e. prepare and pass a Sense job downstream...
+      //
+
+      if( !use_multiframe_grouping_ || 
+	  (use_multiframe_grouping_ && interleaves_counter_multiframe_[set*slices_+slice] == Nints_) ){
+
+	unsigned int num_coils = m1->getObjectPtr()->active_channels;
+	
+	// Compute coil images from the fully sampled data buffer
+	//
+
+	std::vector<size_t> image_dims;
+	image_dims.push_back(image_dimensions_recon_[0]);
+	image_dims.push_back(image_dimensions_recon_[1]);
+	image_dims.push_back(num_coils);
+	
+	cuNDArray<float_complext> image(&image_dims);
+	cuNDArray<float_complext> data(&host_data_buffer_[set*slices_+slice]);
+	
+	nfft_plan_.compute( &data, &image, dcw_buffer_.get(), cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C );
+
+	// Check if we need to compute a new csm
+	//
+	
+	if( propagate_csm_from_set_ < 0 || propagate_csm_from_set_ == set ){	  	  
+	  csm_ = estimate_b1_map<float,2>( &image ); // Estimates csm
+	}
+	else{
+	  //GADGET_DEBUG2("Set %d is reusing the csm from set %d\n", set, propagate_csm_from_set_);
+	  if( csm_.get() == 0x0 ){
+	    GADGET_DEBUG1("Error, csm has not been computed\n");
+	    return GADGET_FAIL;
+	  }	  
+	}
+	E_->set_csm(csm_);
+
+	// Compute regularization using basic coil combination
+	//
+	
+	image_dims.pop_back();
+	cuNDArray<float_complext> reg_image(&image_dims);
+	E_->mult_csm_conj_sum( &image, &reg_image );
+	
+	if( buffer_using_solver_ ){
+	  
+	  // Compute regularization using cg solver
+	  //
+	  
+	  // Define preconditioning weights
+	  boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm_.get()).get(), 2);
+	  reciprocal_sqrt_inplace(_precon_weights.get());	
+	  boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
+	  _precon_weights.reset();
+	  D_->set_weights( precon_weights );
+	  
+	  // Solve from the plain coil combination
+	  reg_image = *cg_.solve_from_rhs(&reg_image);
+	}
+
+	// Get ready to fill in the Sense job
+	//
+
+	boost::shared_ptr< hoNDArray<float_complext> > csm_host = csm_->to_host();
+	boost::shared_ptr< hoNDArray<float_complext> > reg_host = reg_image.to_host();
+
+	unsigned int profiles_buffered = buffer_[set*slices_+slice].message_count();
+
+	std::vector<size_t> ddimensions;
+	ddimensions.push_back(samples_per_interleave_*interleaves_counter_singleframe_[set*slices_+slice]*
+			      ((use_multiframe_grouping_) ? acceleration_factor_ : 1));
+	ddimensions.push_back(num_coils);
+	
+	boost::shared_ptr< hoNDArray<float_complext> > data_host(new hoNDArray<float_complext>(&ddimensions));
+
+	ddimensions.clear();
+	ddimensions.push_back(samples_per_interleave_*interleaves_counter_singleframe_[set*slices_+slice]);
+	ddimensions.push_back((use_multiframe_grouping_) ? acceleration_factor_ : 1);
+
+	boost::shared_ptr< hoNDArray<floatd2> > traj_host(new hoNDArray<floatd2>(&ddimensions));
+	boost::shared_ptr< hoNDArray<float> > dcw_host(new hoNDArray<float>(&ddimensions));
+	
+	for (unsigned int p = 0; p < profiles_buffered; p++) {
+	  ACE_Message_Block* mbq;
+	  if (buffer_[set*slices_+slice].dequeue_head(mbq) < 0) {
+	    GADGET_DEBUG1("Message dequeue failed\n");
+	    return GADGET_FAIL;
+	  }
+
+	  GadgetContainerMessage<ISMRMRD::AcquisitionHeader>* acq = 
+	    AsContainerMessage<ISMRMRD::AcquisitionHeader>(mbq);
+
+	  GadgetContainerMessage< hoNDArray< std::complex<float> > >* daq = 
+	    AsContainerMessage<hoNDArray< std::complex<float> > >(mbq->cont());
+
+	  if (!acq || !daq) {
+	    GADGET_DEBUG1("Unable to interpret data on message Q\n");
+	    return GADGET_FAIL;
+	  }
+
+	  for (unsigned int c = 0; c < num_coils; c++) {
+	    float_complext* data_ptr = data_host->get_data_ptr();
+	    data_ptr += c*samples_per_interleave_*profiles_buffered+p*samples_per_interleave_;
+
+	    std::complex<float>* r_ptr = daq->getObjectPtr()->get_data_ptr();
+	    r_ptr += c*daq->getObjectPtr()->get_size(0);
+
+	    memcpy(data_ptr,r_ptr,samples_per_interleave_*sizeof(float_complext));
+	  }
+
+	  floatd2* traj_ptr = traj_host->get_data_ptr();
+	  traj_ptr += p*samples_per_interleave_;
+
+	  floatd2* t_ptr = host_traj_->get_data_ptr();
+	  t_ptr += acq->getObjectPtr()->idx.kspace_encode_step_1*samples_per_interleave_;
+
+	  memcpy(traj_ptr,t_ptr,samples_per_interleave_*sizeof(floatd2));
+
+	  float* dcw_ptr = dcw_host->get_data_ptr();
+	  dcw_ptr += p*samples_per_interleave_;
+
+	  float* d_ptr = host_weights_->get_data_ptr();
+	  d_ptr += acq->getObjectPtr()->idx.kspace_encode_step_1*samples_per_interleave_;
+
+	  memcpy(dcw_ptr,d_ptr,samples_per_interleave_*sizeof(float));
+
+	  mbq->release();
+	}
+
+	GadgetContainerMessage< SenseJob >* m4 = new GadgetContainerMessage< SenseJob >();
+
+	m4->getObjectPtr()->dat_host_ = data_host;
+	m4->getObjectPtr()->csm_host_ = csm_host;
+	m4->getObjectPtr()->reg_host_ = reg_host;
+	m4->getObjectPtr()->tra_host_ = traj_host;
+	m4->getObjectPtr()->dcw_host_ = dcw_host;
+
+	// Pull the image headers out of the queue
+	//
+	
+	long frames_per_reconstruction = (use_multiframe_grouping_) ? acceleration_factor_ : 1;
+      
+	if( image_headers_queue_[set*slices_+slice].message_count() != frames_per_reconstruction ){
+	  m4->release();
+	  GADGET_DEBUG2("Unexpected size of image header queue: %d, %d\n", 
+			image_headers_queue_[set*slices_+slice].message_count(), frames_per_reconstruction);
+	  return GADGET_FAIL;
+	}
+	
+	m4->getObjectPtr()->image_headers_ =
+	  boost::shared_array<ISMRMRD::ImageHeader>( new ISMRMRD::ImageHeader[frames_per_reconstruction] );
+	
+	for( unsigned int i=0; i<frames_per_reconstruction; i++ ){	
+	  
+	  ACE_Message_Block *mbq;
+	  
+	  if( image_headers_queue_[set*slices_+slice].dequeue_head(mbq) < 0 ) {
+	    m4->release();
+	    GADGET_DEBUG1("Image header dequeue failed\n");
+	    return GADGET_FAIL;
+	  }
+	  
+	  GadgetContainerMessage<ISMRMRD::ImageHeader> *m = AsContainerMessage<ISMRMRD::ImageHeader>(mbq);
+	  m4->getObjectPtr()->image_headers_[i] = *m->getObjectPtr();
+	  m->release();
+	}
+
+	// The Sense Job needs an image header as well. 
+	// Let us just copy the initial one...
+	
+	GadgetContainerMessage<ISMRMRD::ImageHeader> *m3 = new GadgetContainerMessage<ISMRMRD::ImageHeader>;
+	*m3->getObjectPtr() = m4->getObjectPtr()->image_headers_[0];
+	m3->cont(m4);
+	
+	if (this->next()->putq(m3) < 0) {
+	  GADGET_DEBUG1("Failed to put job on queue.\n");
+	  m3->release();
+	  return GADGET_FAIL;
+	}
+	interleaves_counter_multiframe_[set*slices_+slice] = 0;
+      }
+      interleaves_counter_singleframe_[set*slices_+slice] = 0;
+    }
+    m1->release();
+    return GADGET_OK;
+  }
+
+  GadgetContainerMessage<ISMRMRD::AcquisitionHeader>*
+  gpuSpiralSensePrepGadget::duplicate_profile( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *profile )
+  {
+    // Copy the acquisition header and the sample array chained to it into
+    // newly allocated messages, and return the head of the new chain.
+    typedef GadgetContainerMessage<ISMRMRD::AcquisitionHeader> HeaderMessage;
+    typedef GadgetContainerMessage< hoNDArray< std::complex<float> > > SamplesMessage;
+    HeaderMessage *header_dup = new HeaderMessage();
+    SamplesMessage *samples_dup = new SamplesMessage();
+    SamplesMessage *samples_src = AsContainerMessage<hoNDArray< std::complex<float> > >(profile->cont());
+    *header_dup->getObjectPtr() = *profile->getObjectPtr();
+    *samples_dup->getObjectPtr() = *samples_src->getObjectPtr();
+    header_dup->cont(samples_dup);
+    return header_dup;
+  }
+
+  GADGET_FACTORY_DECLARE(gpuSpiralSensePrepGadget)
+}
diff --git a/gadgets/spiral/gpuSpiralSensePrepGadget.h b/gadgets/spiral/gpuSpiralSensePrepGadget.h
new file mode 100644
index 0000000..277927a
--- /dev/null
+++ b/gadgets/spiral/gpuSpiralSensePrepGadget.h
@@ -0,0 +1,92 @@
+#ifndef gpuSpiralSensePrepGadget_H
+#define gpuSpiralSensePrepGadget_H
+#pragma once
+
+#include "gadgetron_spiral_export.h"
+#include "Gadget.h"
+#include "GadgetMRIHeaders.h"
+#include "cuCgSolver.h"
+#include "cuNonCartesianSenseOperator.h"
+#include "cuCgPreconditioner.h"
+#include "cuNFFT.h"
+#include "hoNDArray.h"
+#include "vector_td.h"
+#include "cuNFFT.h"
+
+#include <ismrmrd.h>
+#include <complex>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+
+namespace Gadgetron{
+
+  class EXPORTGADGETS_SPIRAL gpuSpiralSensePrepGadget :
+    public Gadget2< ISMRMRD::AcquisitionHeader, hoNDArray< std::complex<float> > >
+  {
+    // Buffers incoming spiral profiles per (set, slice) and passes SenseJob messages downstream.
+  public:
+    GADGET_DECLARE(gpuSpiralSensePrepGadget);
+
+    gpuSpiralSensePrepGadget();
+    virtual ~gpuSpiralSensePrepGadget();
+
+  protected:
+
+    virtual int process_config(ACE_Message_Block* mb);
+    // Handles one acquisition: m1 is the acquisition header, m2 the sample data.
+    virtual int process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader >* m1,
+			GadgetContainerMessage< hoNDArray< std::complex<float> > > * m2);
+    // Returns newly allocated copies of the profile header message and of its attached data message.
+    virtual GadgetContainerMessage<ISMRMRD::AcquisitionHeader>*
+      duplicate_profile( GadgetContainerMessage<ISMRMRD::AcquisitionHeader> *profile );
+    
+  private:
+    int samples_to_skip_start_;
+    int samples_to_skip_end_;     // trailing samples not copied per profile; -1 = derive from the first profile
+    int samples_per_interleave_;  // min(designed gradient samples, samples in the first profile)
+    int interleaves_;             // sizes the accumulation buffer (samples_per_interleave_*interleaves_)
+    int slices_;                  // per-(set,slice) state is indexed as set*slices_+slice
+    int sets_;                    // per-(set,slice) arrays hold slices_*sets_ entries
+    boost::shared_array<long> image_counter_;  // next image_index for each (set,slice)
+    int device_number_;
+
+    long    Tsamp_ns_;            // sample period in ns (scaled by 1e-9 before calling calc_vds)
+    long    Nints_;               // number of interleaves passed to calc_vds / calc_traj
+    boost::shared_array<long> interleaves_counter_singleframe_;  // profiles since the last ACQ_LAST_IN_SLICE, per (set,slice)
+    boost::shared_array<long> interleaves_counter_multiframe_;   // profiles since the last job was passed on, per (set,slice)
+    long    acceleration_factor_; // Nints_ divided by the interleaves received in a frame
+    double  gmax_;                // passed to calc_vds as gradmax
+    double  smax_;                // passed to calc_vds as slewmax
+    double  krmax_;               // passed to calc_vds / calc_traj as krmax
+    double  fov_;                 // the single FOV coefficient passed to calc_vds
+
+    bool prepared_;               // set once trajectory, weights and NFFT plan have been set up
+    bool use_multiframe_grouping_; // if set, a job is only assembled once Nints_ interleaves were counted
+    bool buffer_using_solver_;    // if set, the regularization image is refined with the cg solver
+
+    int propagate_csm_from_set_;  // <0: estimate csm for every job; otherwise only when this set arrives
+
+    float kernel_width_;          // passed to the NFFT plan and Sense operator setup
+    float oversampling_factor_;
+
+    boost::shared_ptr< hoNDArray<floatd2> > host_traj_;   // trajectory for all interleaves (host)
+    boost::shared_ptr< hoNDArray<float> > host_weights_;  // per-sample weights from calc_traj (host)
+    
+    boost::shared_array< hoNDArray<float_complext> > host_data_buffer_;  // per-(set,slice) accumulation buffer for csm/regularization estimation
+    boost::shared_ptr< cuNDArray<float> > dcw_buffer_;    // device copy of host_weights_
+
+    std::vector<size_t> fov_vec_;                 // copied into ImageHeader::field_of_view
+    std::vector<size_t> image_dimensions_recon_;  // image matrix size; also the operator domain dimensions
+    uint64d2 image_dimensions_recon_os_;          // second size argument to the NFFT / Sense setup (presumably the oversampled matrix size)
+
+    cuNFFT_plan<float,2> nfft_plan_;              // computes coil images from the accumulation buffer
+    cuCgSolver<float_complext> cg_;               // used only when buffer_using_solver_ is set
+    boost::shared_ptr< cuNDArray<float_complext> > csm_;  // most recently estimated coil sensitivity maps
+    boost::shared_ptr< cuNonCartesianSenseOperator<float,2> > E_;  // encoding operator
+    boost::shared_ptr< cuCgPreconditioner<float_complext> > D_;    // preconditioner handed to cg_
+
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > buffer_;  // per-(set,slice) queue of duplicated profiles
+    boost::shared_array< ACE_Message_Queue<ACE_MT_SYNCH> > image_headers_queue_;  // per-(set,slice) queue of image headers
+  };
+}
+#endif //gpuSpiralSensePrepGadget_H
diff --git a/gadgets/spiral/vds.cpp b/gadgets/spiral/vds.cpp
new file mode 100644
index 0000000..0591509
--- /dev/null
+++ b/gadgets/spiral/vds.cpp
@@ -0,0 +1,495 @@
+#include "vds.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#define GAMMA 	4258.0		/* Hz/G */
+#define PI	3.141592	/* pi */
+
+#define DEBUG_VDS	0
+/* #define TESTCODE 	For testing as regular C code... */
+
+/*
+  %
+  %	VARIABLE DENSITY SPIRAL GENERATION:
+  %	----------------------------------
+  %
+  %	This is a general description of how the following C code
+  %	works.  This text is taken from a matlab script, vds.m, from
+  %	which the C code was derived.  However, note that the C code
+  %	runs considerably faster.
+  %
+  %
+  %	Function generates variable density spiral which traces
+  %	out the trajectory
+  %				 
+  %			k(t) = r(t) exp(i*q(t)), 		[1]
+  %
+  %	Where q IS THE SAME AS theta, and r IS THE SAME AS kr.
+  %
+  %		r and q are chosen to satisfy:
+  %
+  %		1) Maximum gradient amplitudes and slew rates.
+  %		2) Maximum gradient due to FOV, where FOV can
+  %		   vary with k-space radius r, as
+  %
+  %			FOV(r) = F0 + F1*r + F2*r*r 		[2]
+  %
+  %
+  %	INPUTS:
+  %	-------
+  %	smax = maximum slew rate G/cm/s
+  %	gmax = maximum gradient G/cm (limited by Gmax or FOV)
+  %	T = sampling period (s) for gradient AND acquisition.
+  %	N = number of interleaves.
+  %	F0,F1,F2 = FOV coefficients with respect to r - see above.
+  %	rmax= value of k-space radius at which to stop (cm^-1).
+  %		rmax = 1/(2*resolution);
+  %
+  %
+  %	OUTPUTS:
+  %	--------
+  %	k = k-space trajectory (kx+iky) in cm-1.
+  %	g = gradient waveform (Gx+iGy) in G/cm.
+  %	s = derivative of g (Sx+iSy) in G/cm/s.
+  %	time = time points corresponding to above (s).
+  %	r = k-space radius vs time (used to design spiral)
+  %	theta = atan2(ky,kx) = k-space angle vs time.
+  %
+  %
+  %	METHODS:
+  %	--------
+  %	Let r1 and r2 be the first and second derivatives of r in [1].	
+  %	Let q1 and q2 be the first and second derivatives of theta in [1].	
+  %	Also, r0 = r, and q0 = theta - sometimes both are used.
+  %	F = F(r) defined by F0,F1,F2.
+  %
+  %	Differentiating [1], we can get G = a(r0,r1,q0,q1,F)	
+  %	and differentiating again, we get S = b(r0,r1,r2,q0,q1,q2,F)
+  %
+  %	(functions a() and b() are reasonably easy to obtain.)
+  %
+  %	FOV limits put a constraint between r and q:
+  %
+  %		dr/dq = N/(2*pi*F)				[3]	
+  %
+  %	We can use [3] and the chain rule to give 
+  %
+  %		q1 = 2*pi*F/N * r1				[4]
+  %
+  %	and
+  %
+  %		q2 = 2*pi/N*dF/dr*r1^2 + 2*pi*F/N*r2		[5]
+  %
+  %
+  %
+  %	Now using [4] and [5], we can substitute for q1 and q2
+  %	in functions a() and b(), giving
+  %
+  %		G = c(r0,r1,F)
+  %	and 	S = d(r0,r1,r2,F,dF/dr)
+  %
+  %
+  %	Using the fact that the spiral should be either limited
+  %	by amplitude (Gradient or FOV limit) or slew rate, we can
+  %	solve 
+  %		|c(r0,r1,F)| = |Gmax|  				[6]
+  %
+  %	analytically for r1, or
+  %	
+  %	  	|d(r0,r1,r2,F,dF/dr)| = |Smax|	 		[7]
+  %
+  %	analytically for r2.
+  %
+  %	[7] is a quadratic equation in r2.  The smaller of the 
+  %	roots is taken, and the real part of the root is used to
+  %	avoid possible numeric errors - the roots should be real
+  %	always.
+  %
+  %	The choice of whether or not to use [6] or [7], and the
+  %	solving for r2 or r1 is done by calcthetadotdot().
+  %
+  %	Once the second derivative of theta(q) or r is obtained,
+  %	it can be integrated to give q1 and r1, and then integrated
+  %	again to give q and r.  The gradient waveforms follow from
+  %	q and r. 	
+  %
+  %	Brian Hargreaves -- Sept 2000.
+  %
+  %
+*/
+
+namespace Gadgetron{
+
+
+  /* ----------------------------------------------------------------------- */
+  void calcthetadotdot(double slewmax, double gradmax, double kr,
+		       double krdot, double Tgsample, double Tdsample, int Ninterleaves,
+		       double* fov, int numfov, double* thetadotdot, double* krdotdot)
+  /*
+   * Inner step of calc_vds(): from the current k-space radius (kr) and its
+   * first derivative (krdot), compute the second derivatives of kr and
+   * theta for the next integration step.  The derivation is given in the
+   * text at the top of this file.
+   *
+   *   slewmax       Maximum slew rate, G/cm/s
+   *   gradmax       Maximum gradient amplitude, G/cm
+   *   kr            Current kr
+   *   krdot         Current krdot
+   *   Tgsample      Gradient sample period (s)
+   *   Tdsample      Data sample period (s)
+   *   Ninterleaves  Number of interleaves
+   *   fov           FOV coefficients
+   *   numfov        Number of FOV coefficients
+   *   thetadotdot   [output] 2nd derivative of theta
+   *   krdotdot      [output] 2nd derivative of kr
+   *
+   * Nothing is returned; both results are written through the output
+   * pointers.
+   */
+
+  /* ----------------------------------------------------------------------- */
+  {
+    if (DEBUG_VDS>1)
+      {
+	printf("calcthetadotdot:  slewmax=%8.2f, gmax=%6.2f, \n",
+	       slewmax,gradmax);
+	printf("        kr=%8.4f, Tg=%9.6f, N=%d, nfov=%d \n", 
+	       kr,Tgsample,Ninterleaves,numfov);
+      }
+
+    /* The FOV is a polynomial in kr: evaluate it and its derivative with
+     * respect to kr at the current radius. */
+
+    double fov_at_kr = 0;	/* FOV for this value of kr */
+    double dfov_dkr = 0;	/* dFOV/dkr for this value of kr */
+
+    for (int term = 0; term < numfov; term++)
+      {
+	fov_at_kr = fov_at_kr + fov[term]*pow(kr,term);
+	if (term > 0)
+	  dfov_dkr = dfov_dkr + term*fov[term]*pow(kr,term-1);
+      }
+
+
+    /* FOV limit on the gradient amplitude.  As the original author notes,
+     * this is the rate of motion along the trajectory and really should
+     * not be a limitation, so this clamp could reasonably be disabled.
+     * The smaller of the two amplitude limits is used from here on. */
+
+    const double fov_grad_limit = 1/GAMMA / fov_at_kr / Tdsample;
+    if (fov_grad_limit < gradmax)
+      gradmax = fov_grad_limit;
+
+
+    /* Maximum dkr/dt, based on gradient amplitude. */
+
+    const double krdot_limit =
+      sqrt(pow(GAMMA*gradmax,2) / (1+pow(2*PI*fov_at_kr*kr/Ninterleaves,2)));
+    if (DEBUG_VDS>1)
+      printf("calcthetadotdot:  maxkrdot = %g \n",krdot_limit);
+
+    /* Shorthand for 2*PI*FOV/N and its square, used repeatedly below. */
+
+    const double tpf_val = 2*PI*fov_at_kr/Ninterleaves;
+    const double tpf_sq = pow(tpf_val,2);
+    if (DEBUG_VDS>1)
+      printf("calcthetadotdot:  tpf = %8.4f,  tpfsq = %8.4f  \n",tpf_val,tpf_sq);
+
+
+    double kr_accel;		/* 2nd derivative of kr chosen for this step */
+
+    if (!(krdot > krdot_limit))
+      {
+	/* Within the amplitude limit: choose krdotdot from the slew rate
+	 * limit by solving the quadratic (see [7] in the header text). */
+
+	const double quad_a = 1+tpf_sq*kr*kr;
+	const double quad_b = 2*tpf_sq*kr*krdot*krdot + 
+	  2*tpf_sq/fov_at_kr*dfov_dkr*kr*kr*krdot*krdot;
+	const double quad_c = pow(tpf_sq*kr*krdot*krdot,2) + 4*tpf_sq*pow(krdot,4) +
+	  pow(tpf_val*dfov_dkr/fov_at_kr*kr*krdot*krdot,2) +
+	  4*tpf_sq*dfov_dkr/fov_at_kr*kr*pow(krdot,4) -
+	  pow(GAMMA*slewmax,2);
+
+	if (DEBUG_VDS>1)
+	  printf("calcthetadotdot:  qdfA, qdfB, qdfC = %g, %g, %g \n",
+		 quad_a, quad_b, quad_c);
+
+	/* Roots of a*x^2 + b*x + c = 0, written as center +/- sqrt(discr). */
+
+	const double root_center = -quad_b/(2*quad_a);
+	const double root_discr = quad_b*quad_b/(4*quad_a*quad_a) - quad_c/quad_a;
+	if (DEBUG_VDS>1)
+	  printf("calcthetadotdot:  rootparta, rootpartb = %g, %g \n",
+		 root_center, root_discr);
+
+	/* Safety check - a negative discriminant means complex roots, in
+	 * which case only the real part is kept.  Otherwise the '+' root
+	 * is taken. */
+
+	kr_accel = (root_discr < 0) ? root_center : root_center + sqrt(root_discr);
+
+	/* Could check resulting slew rate here, as in q2r21.m. */
+      }
+    else
+      {
+	/* krdot exceeds the amplitude limit: choose krdotdot so that one
+	 * gradient sample period brings krdot back to the limit. */
+
+	kr_accel = (krdot_limit - krdot)/Tgsample;
+      }
+
+
+    *krdotdot = kr_accel;
+
+    /* Calculate thetadotdot (equation [5] in the header text). */
+
+    *thetadotdot = tpf_val*dfov_dkr/fov_at_kr*krdot*krdot + tpf_val*kr_accel;
+
+    if (DEBUG_VDS>1)
+      printf("calcthetadot:  r=%8.4f,  r'=%8.4f,  r''=%g  q''=%g \n",
+	     kr,krdot,*krdotdot,*thetadotdot);
+
+  }
+
+
+  /* ----------------------------------------------------------------------- */
+  void EXPORTGADGETS_SPIRAL 
+  calc_vds(double slewmax,double gradmax,double Tgsample,double Tdsample,int Ninterleaves,
+	   double* fov, int numfov,double krmax,
+	   int ngmax, double** xgrad,double** ygrad,int* numgrad)
+
+  /*	Function designs a variable-density spiral gradient waveform
+   *	that is defined by a number of interleaves, resolution (or max number
+   *	of samples), and field-of-view.  
+   *	The field-of-view is a polynomial function of the
+   *	k-space radius, so fov is an array of coefficients so that
+   *
+   *	FOV = fov[0]+fov[1]*kr+fov[2]*kr^2+ ... +fov[numfov-1]*kr^(numfov-1)
+   *
+   * 	Gradient design is subject to a constant-slew-rate-limit model,
+   * 	with maximum slew rate slewmax, and maximum gradient amplitude
+   * 	of gradmax.  
+   *
+   * 	Tgsample is the gradient sampling period, and Tdsample is the data
+   * 	sampling period.  It is highly recommended to OVERSAMPLE the gradient
+   * 	in the design to make the integration more stable.
+   *
+   *	The waveform is integrated twice: the first pass only counts the
+   *	samples, the second pass stores them.  *xgrad and *ygrad are
+   *	allocated here with new[] (*numgrad doubles each); the caller owns
+   *	them and must release them with delete[].
+   * */
+
+  //double slewmax;		/*	Maximum slew rate, G/cm/s		*/
+  //double gradmax;		/* 	maximum gradient amplitude, G/cm	*/
+  //double Tgsample;	/*	Gradient Sample period (s)		*/
+  //double Tdsample;	/*	Data Sample period (s)			*/
+  //int Ninterleaves;	/*	Number of interleaves			*/
+  //double *fov;		/*	FOV coefficients		*/
+  //int numfov;		/*	Number of FOV coefficients		*/
+  //double krmax;		/*	Maximum k-space extent (/cm)		*/
+  //int ngmax;		/*	Maximum number of gradient samples	*/
+  //double **xgrad;		/* 	[output] X-component of gradient (G/cm) */
+  //double **ygrad;		/*	[output] Y-component of gradient (G/cm)	*/
+  //int *numgrad;		/* 	[output] Number of gradient samples */
+
+  /* ----------------------------------------------------------------------- */
+  {
+    int gradcount=0;
+
+    double kr=0;			/* Current value of kr	*/
+    double krdot = 0;		/* Current value of 1st derivative of kr */
+    double krdotdot = 0;		/* Current value of 2nd derivative of kr */
+
+    double theta=0;			/* Current value of theta */
+    double thetadot=0;		/* Current value of 1st derivative of theta */
+    double thetadotdot=0;		/* Current value of 2nd derivative of theta */
+
+    double lastkx=0;		/* x-component of last k-location. */
+    double lastky=0;		/* y-component of last k-location */
+    double kx, ky;			/* x and y components of current k-location */
+
+    double *gxptr, *gyptr;		/* Pointers to gradient variables. */
+
+    if (DEBUG_VDS>0)
+      printf("calc_vds:  First run. \n");
+
+    /* First pass: just find the gradient length. */
+
+    while ((kr < krmax) && (gradcount < ngmax))
+      {
+	calcthetadotdot(slewmax,gradmax,kr,krdot,Tgsample,Tdsample,
+			Ninterleaves, fov,numfov, &thetadotdot, &krdotdot);
+
+	/* Integrate to obtain new values of kr, krdot, theta and thetadot:*/
+
+	thetadot = thetadot + thetadotdot * Tgsample;
+	theta = theta + thetadot * Tgsample;
+
+	krdot = krdot + krdotdot * Tgsample;
+	kr = kr + krdot * Tgsample;
+
+	gradcount++;
+
+      }
+
+    /* Allocate memory for gradients. */
+
+    *numgrad = gradcount;
+    if (DEBUG_VDS>0)
+      printf("Allocating for %d gradient points. \n",*numgrad);
+
+    /* new[] takes an element count, not a byte count, so the size must not
+     * be multiplied by sizeof(double).  The arrays are zero-initialised so
+     * that every element is defined even if the second pass ends early. */
+
+    *xgrad = new double[*numgrad]();
+    *ygrad = new double[*numgrad]();
+
+    /* Reset parameters */
+
+    kr=0;
+    krdot=0;
+    theta=0;
+    thetadot=0;
+    gradcount=0;
+    gxptr = *xgrad;
+    gyptr = *ygrad;
+
+    /* Second pass: repeat the integration and store the gradient samples.
+     * Bounded by *numgrad so the writes can never run past the arrays. */
+
+    if (DEBUG_VDS>0)
+      printf("calc_vds:  Second run. \n");
+
+    while ((kr < krmax) && (gradcount < *numgrad))
+      {
+	calcthetadotdot(slewmax,gradmax,kr,krdot,Tgsample,Tdsample,
+			Ninterleaves, fov,numfov, &thetadotdot, &krdotdot);
+
+	/* Integrate to obtain new values of kr, krdot, theta and thetadot:*/
+
+	thetadot = thetadot + thetadotdot * Tgsample;
+	theta = theta + thetadot * Tgsample;
+
+	krdot = krdot + krdotdot * Tgsample;
+	kr = kr + krdot * Tgsample;
+
+	/* Define current gradient values from kr and theta. */
+
+	kx = kr * cos(theta);
+	ky = kr * sin(theta);
+	*gxptr++ = (1/GAMMA/Tgsample) * (kx-lastkx);
+	*gyptr++ = (1/GAMMA/Tgsample) * (ky-lastky);
+	lastkx = kx;
+	lastky = ky;
+
+	if (DEBUG_VDS>0)
+	  printf("Current kr is %6.3f \n",kr);
+
+	gradcount++;
+      }
+
+  }
+ 
+
+
+  /* ----------------------------------------------------------------------- */
+  void EXPORTGADGETS_SPIRAL 
+  calc_traj(double* xgrad, double* ygrad, int ngrad, int Nints, double Tgsamp, double krmax,
+	    double** x_trajectory, double** y_trajectory,
+	    double** weights) //, double** y_weights)
+  /*
+   * Integrates a single-interleave gradient waveform into a k-space
+   * trajectory, produces Nints copies of it rotated by 2*PI*i/Nints, and
+   * computes one weight per trajectory sample.
+   * The input waveforms xgrad and ygrad are only read.
+   *
+   *inputs: 
+   *      xgrad   X gradient waveform
+   *      ygrad   Y gradient waveform
+   *      ngrad   number of gradient samples
+   *      Nints   number of interleaves
+   *      Tgsamp  sampling time for gradients
+   *      krmax   value the trajectory coordinates are divided by
+   *
+   *outputs (each allocated here with new[] and holding ngrad*Nints
+   *         doubles, interleave after interleave; release with delete[]):
+   *      x_trajectory    X position in k-space, divided by krmax
+   *      y_trajectory    Y position in k-space, divided by krmax
+   *      weights         abs(g) .* abs(sin(angle(g) - angle(k)))
+   *
+   **/
+  {
+    int     gradcount   =0;
+    double  x_tr        =0.0;		/* Current value of x_traj	*/
+    double  y_tr        =0.0;     /* Current value of y_traj */
+    double  rotation    =0.0;       /* rotation of trajectory */
+    
+    double  abs_w       =0.0;     /* gradient magnitude */
+    double  ang_g       =0.0;     /* angle of gradient */
+    double  ang_t       =0.0;     /* angle of trajectory */
+    double  tp_w        =0.0;     /* current weight value */
+    
+    double  *txptr, *typtr;		/* Pointers to trajectory variables. */
+    double  *wptr;      		/* Pointers to weight variables. */
+    
+    /* new[] takes an element count, not a byte count, so the size is not
+     * multiplied by sizeof(double). */
+    *x_trajectory   = new double[ngrad*Nints];
+    *y_trajectory   = new double[ngrad*Nints];
+    *weights        = new double[ngrad*Nints];
+    
+    txptr       = *x_trajectory;
+    typtr       = *y_trajectory;
+    wptr        = *weights;
+    
+    for(int inter = 0; inter < Nints; inter++)
+      {
+        rotation = (inter * 2 * PI)/Nints;
+        /* The rotation is fixed per interleave: evaluate cos/sin once. */
+        const double cos_rot = cos(rotation);
+        const double sin_rot = sin(rotation);
+
+        x_tr = 0;
+        y_tr = 0;
+        /* NOTE(review): these floats narrow the rotated coordinates before
+         * they are stored as double; kept so output values are unchanged. */
+        float x_temp, y_temp;
+
+        for(gradcount = 0; gradcount < ngrad; gradcount++)
+	  {
+            /* Running sum of earlier gradient samples: sample 0 is at k=0. */
+            if (gradcount > 0)
+	      {
+                x_tr += (GAMMA)*xgrad[gradcount-1]*Tgsamp;
+                y_tr += (GAMMA)*ygrad[gradcount-1]*Tgsamp;
+	      }
+
+            x_temp = (x_tr * cos_rot) + (y_tr * sin_rot);
+            y_temp = -(x_tr * sin_rot) + (y_tr * cos_rot);
+            *(txptr++) = x_temp/krmax;
+            *(typtr++) = y_temp/krmax;      
+  
+            //abs(g(:)
+            abs_w   = sqrt((pow(xgrad[gradcount],2)) + (pow(ygrad[gradcount],2)));
+
+            /* Both angles fall back to PI/2 when the x component is exactly
+             * zero instead of calling atan2. */
+            ang_g = (xgrad[gradcount] == 0.0) ? PI/2 : atan2(ygrad[gradcount], xgrad[gradcount]);
+            ang_t = (x_tr == 0.0) ? PI/2 : atan2(y_tr, x_tr);
+
+	    //g = gradients, k = trajectory
+	    //        weights = abs(g(:)) .* abs(sin(angle(g(:))-angle(k(:))));
+	    tp_w    = sin(ang_g-ang_t);
+	    tp_w    = sqrt(pow(tp_w, 2));    //abs(tp_w);
+	    tp_w    = abs_w * tp_w;
+
+            *wptr++ = tp_w;
+
+	  }    
+      }    
+  }
+}
diff --git a/gadgets/spiral/vds.h b/gadgets/spiral/vds.h
new file mode 100644
index 0000000..1399464
--- /dev/null
+++ b/gadgets/spiral/vds.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "gadgetron_spiral_export.h"
+
+namespace Gadgetron{
+  // NOTE(review): calc_vds is defined elsewhere; presumably it designs the spiral gradients returned via xgrad/ygrad/numgrad -- confirm.
+  void EXPORTGADGETS_SPIRAL 
+  calc_vds(double slewmax,double gradmax,double Tgsample,double Tdsample,int Ninterleaves,
+	   double* fov, int numfov,double krmax,
+	   int ngmax, double** xgrad,double** ygrad,int* numgrad);
+  // calc_traj writes ngrad samples per interleave (Nints interleaves) to the three outputs; *weights is new[]-allocated inside (caller presumably frees -- confirm).
+  void EXPORTGADGETS_SPIRAL 
+  calc_traj(double* xgrad, double* ygrad, int ngrad, int Nints, double Tgsamp, double krmax,
+	    double** x_trajectory, double** y_trajectory, double** weights);  
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..556bca1
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,62 @@
+ENABLE_TESTING()  # enables CTest registration for add_test() calls in this directory
+# The Boost library directory is only added to the link search path on Windows.
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+# Header search paths: core toolbox sources plus the Boost, Armadillo and GoogleTest headers.
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${Boost_INCLUDE_DIR}
+  ${ARMADILLO_INCLUDE_DIRS}
+  ${GTEST_INCLUDE_DIRS}
+  )
+# CUDA headers are only added when CUDA was found.
+if (CUDA_FOUND)
+    include_directories(${CUDA_INCLUDE_DIRS})
+endif (CUDA_FOUND)
+
+link_libraries(cpucore
+    cpucore_math
+    ${Boost_LIBRARIES}   # CMake variables are case-sensitive; FindBoost sets Boost_LIBRARIES
+    ${GTEST_LIBRARIES}
+    ${ARMADILLO_LIBRARIES}
+    )
+    
+if ( CUDA_FOUND )
+    # CUDA build: host-array tests plus the cuNDArray tests and the vector_td test with its kernels.
+    cuda_add_executable(test_all
+      tests.cpp
+      hoNDArray_elemwise_test.cpp
+      hoNDArray_operators_test.cpp
+      hoNDArray_blas_test.cpp
+      hoNDArray_utils_test.cpp
+      vector_td_test.cpp
+      cuNDArray_elemwise_test.cpp
+      cuNDArray_operators_test.cpp
+      cuNDArray_blas_test.cpp
+      cuNDArray_utils_test.cpp
+      cuVector_td_test_kernels.h
+      cuVector_td_test_kernels.cu
+      )
+else ( CUDA_FOUND )
+    add_executable(test_all
+      tests.cpp
+      hoNDArray_elemwise_test.cpp
+      hoNDArray_operators_test.cpp
+      hoNDArray_blas_test.cpp
+      hoNDArray_utils_test.cpp
+      )
+endif ( CUDA_FOUND )
+
+if ( CUDA_FOUND )
+  target_link_libraries(test_all
+    gpucore
+    )
+else ( CUDA_FOUND )
+  # CPU-only build: there is no GPU library to link.
+endif ( CUDA_FOUND )
+
+add_test(test_all test_all)
diff --git a/test/cuNDArray_Vector_td_test.cpp b/test/cuNDArray_Vector_td_test.cpp
new file mode 100644
index 0000000..f5b2279
--- /dev/null
+++ b/test/cuNDArray_Vector_td_test.cpp
@@ -0,0 +1,50 @@
+/*
+ * cuNDArray_Vector_td_test.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+#include "gtest/gtest.h"
+
+
+#include <vector>
+#include "complext.h"
+#include "cuNDArray.h"
+#include "vector_td_utilities.h"
+
+using namespace Gadgetron;
+using testing::Types;
+// Typed fixture: one cleared cuNDArray holding 37 three-component vector_td<T,3> elements.
+template <typename T> class cuNDArray_vector_td_Test : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    // A single prime-length dimension keeps the element count deliberately awkward
+    const unsigned int extents[] = {37};
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    cuData = cuNDArray<vector_td<T,3> >(&dims);
+    cuData.clear();
+  }
+  cuNDArray<vector_td<T,3> > cuData;
+  std::vector<unsigned int> dims;
+};
+
+//typedef Types<float,double,float_complext,double_complext> Implementations;
+typedef Types<float,double> Implementations;
+
+TYPED_TEST_CASE(cuNDArray_vector_td_Test, Implementations);
+
+TYPED_TEST(cuNDArray_vector_td_Test,absTest){ // in-place abs() of a vector_td(-2) fill must give vector_td(2)
+  typedef vector_td<TypeParam,3> vec3;
+  this->cuData.fill(vec3(-2));
+  this->cuData.abs();
+  vec3 sampled = this->cuData.get_device_ptr()[2];
+  EXPECT_EQ(vec3(2),sampled);
+}
+
+TYPED_TEST(cuNDArray_vector_td_Test,sqrtTest){ // in-place sqrt() of a 12.1 fill, checked on component 1 of element 2
+  this->cuData.fill(vector_td<TypeParam,3>(12.1));
+  this->cuData.sqrt();
+  vector_td<TypeParam,3> expected(TypeParam(3.478505426)); // sqrt(12.1)
+  vector_td<TypeParam,3> result = this->cuData.get_device_ptr()[2];
+  EXPECT_FLOAT_EQ(expected[1],result[1]); // expected value first, so a failure is labelled correctly
+}
diff --git a/test/cuNDArray_blas_test.cpp b/test/cuNDArray_blas_test.cpp
new file mode 100644
index 0000000..6aec76e
--- /dev/null
+++ b/test/cuNDArray_blas_test.cpp
@@ -0,0 +1,156 @@
+#include "cuNDArray_blas.h"
+#include "cuNDArray_elemwise.h"
+
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+// Typed fixture for the real-valued BLAS tests: two cuNDArrays built from the same 4D shape.
+template <typename T> class cuNDArray_blas_Real : public ::testing::Test {
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+
+TYPED_TEST_CASE(cuNDArray_blas_Real, realImplementations);
+
+TYPED_TEST(cuNDArray_blas_Real,dotTest){ // dot of constant arrays: <1,1> = element count, <1,2> = twice that
+  fill(&this->Array,TypeParam(1));
+  fill(&this->Array2,TypeParam(2));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array)));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2)));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,axpyTest){ // y = a*x + y with a=11, x=71, y=97, spot-checked at element 10
+  const TypeParam a(11), x(71), y(97);
+  fill(&this->Array,x);
+  fill(&this->Array2,y);
+  axpy(a,&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(878,real(TypeParam(this->Array2[10]))); // 11*71 + 97
+}
+
+TYPED_TEST(cuNDArray_blas_Real,nrm2Test){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3));
+  EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,asumTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+  fill(&this->Array,TypeParam(-3));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,aminTest){ // amin must report the index of the smallest-magnitude element
+  fill(&this->Array,TypeParam(100));
+  // |-50| < |100| makes index 23 the minimum; the later |2| at index 48 then takes over
+  const TypeParam smaller(-50), smallest(2);
+  CUDA_CALL(cudaMemcpy(this->Array.get_data_ptr()+23, &smaller, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amin(&this->Array));
+  CUDA_CALL(cudaMemcpy(this->Array.get_data_ptr()+48, &smallest, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(48,amin(&this->Array));
+}
+
+TYPED_TEST(cuNDArray_blas_Real,amaxTest){
+  fill(&this->Array,TypeParam(1));
+  TypeParam tmp(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amax(&this->Array));
+  tmp = TypeParam(-50);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(48,amax(&this->Array));
+}
+
+
+template <typename T> class cuNDArray_blas_Cplx : public ::testing::Test { // fixture: two same-shaped 2D arrays
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(cuNDArray_blas_Cplx, cplxImplementations);
+
+TYPED_TEST(cuNDArray_blas_Cplx,dotTest){
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam res = dot(&this->Array,&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  fill(&this->Array2,TypeParam(2,2));
+  res = dot(&this->Array2,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(2,-2)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  res = dot(&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),imag(res));
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,axpyTest){ // y = a*x + y on complex data, spot-checked at element 546
+  TypeParam a(11.4), x(71.1,23.3), y(97.9,654.2);
+  fill(&this->Array,x);
+  fill(&this->Array2,y);
+  axpy(a,&this->Array,&this->Array2);
+  TypeParam reference = x*a+y, actual = this->Array2[546]; // reference: same arithmetic on the host
+  EXPECT_FLOAT_EQ(real(reference),real(actual));
+  EXPECT_FLOAT_EQ(imag(reference),imag(actual));
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,nrm2Test){
+  fill(&this->Array,TypeParam(1,1));
+  EXPECT_FLOAT_EQ(std::sqrt(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3.24,7.4));
+  // There will be rounding errors from the sum, so loosen comparison
+  EXPECT_NEAR(std::sqrt(real(TypeParam(3.24,-7.4)*TypeParam(3.24,7.4))*this->Array.get_number_of_elements()),nrm2(&this->Array),0.001);
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,asumTest){
+  fill(&this->Array,TypeParam(-3,1));
+  EXPECT_NEAR(4*this->Array.get_number_of_elements(),asum(&this->Array),0.1);
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,aminTest){
+  fill(&this->Array,TypeParam(100,101));
+  TypeParam tmp(-50,-51);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amin(&this->Array));
+  tmp = TypeParam(2,100);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(23,amin(&this->Array));
+  tmp = TypeParam(-2,-76);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[1000], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(1000,amin(&this->Array));
+}
+
+TYPED_TEST(cuNDArray_blas_Cplx,amaxTest){
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam tmp(4,4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[768], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(768,amax(&this->Array));
+  tmp = TypeParam(6,1);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[48], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(768,amax(&this->Array));
+  tmp = TypeParam(-3,-6);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[999], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_EQ(999,amax(&this->Array));
+}
diff --git a/test/cuNDArray_elemwise_test.cpp b/test/cuNDArray_elemwise_test.cpp
new file mode 100644
index 0000000..d5c6b80
--- /dev/null
+++ b/test/cuNDArray_elemwise_test.cpp
@@ -0,0 +1,379 @@
+#include "cuNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_elemwise_TestReal : public ::testing::Test { // fixture: two same-shaped 4D arrays
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx : public ::testing::Test { // fixture: two same-shaped 4D arrays
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx2 : public ::testing::Test { // NOTE(review): no TYPED_TEST_CASE in this file uses this fixture
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx3 : public ::testing::Test { // fixture: two same-shaped 4D arrays
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_elemwise_TestCplx4 : public ::testing::Test { // fixture: a T array plus a same-shaped realType<T> array
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<typename realType<T>::Type>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<typename realType<T>::Type> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestReal, realImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,fillTest){
+  fill(&this->Array,TypeParam(1.1));
+  EXPECT_FLOAT_EQ(1.1,TypeParam(this->Array[5]));
+  fill(&this->Array,TypeParam(27.45));
+  EXPECT_FLOAT_EQ(27.45,TypeParam(this->Array[3242]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clearTest){
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(1,TypeParam(this->Array[5324]));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,TypeParam(this->Array[5324]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,absTest){
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array[13]));
+  EXPECT_FLOAT_EQ(TypeParam(5.5),TypeParam(abs(&this->Array)->at(13)));
+  fill(&this->Array,TypeParam(-1.3));
+  EXPECT_FLOAT_EQ(TypeParam(-1.3),TypeParam(this->Array[2454]));
+  abs_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1.3),TypeParam(this->Array[2454]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,absSquareTest){
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array[13]));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5*-5.5),TypeParam(abs_square(&this->Array)->at(13)));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,sqrtTest){
+  fill(&this->Array,TypeParam(17.9));
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(17.9)),TypeParam(sqrt(&this->Array)->at(23433)));
+  fill(&this->Array,TypeParam(3.14));
+  sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(3.14)),TypeParam(this->Array[32343]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,squareTest){
+  fill(&this->Array,TypeParam(1.7));
+  EXPECT_FLOAT_EQ(TypeParam(1.7)*TypeParam(1.7),TypeParam(square(&this->Array)->at(22542)));
+  fill(&this->Array,TypeParam(31.4));
+  square_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(31.4)*TypeParam(31.4),TypeParam(this->Array[652252]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,reciprocalTest){
+  fill(&this->Array,TypeParam(11.7));
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(11.7),TypeParam(reciprocal(&this->Array)->at(45452)));
+  fill(&this->Array,TypeParam(314.114));
+  reciprocal_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(314.114),TypeParam(this->Array[43432]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,reciprocal_sqrtTest){
+  fill(&this->Array,TypeParam(1.9));
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.9)),TypeParam(reciprocal_sqrt(&this->Array)->at(12345)));
+  fill(&this->Array,TypeParam(1.14));
+  reciprocal_sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.14)),TypeParam(this->Array[0]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,sgnTest){
+  fill(&this->Array,TypeParam(-5.7));
+  TypeParam tmp(101.1);
+  TypeParam tmp2(0.0);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[19100], &tmp2, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(sgn(&this->Array)->at(28)));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(sgn(&this->Array)->at(91)));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(sgn(&this->Array)->at(19100)));
+  fill(&this->Array,TypeParam(-5.7));
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[9100], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[19100], &tmp2, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  sgn_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(this->Array[2800]));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(this->Array[9100]));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(this->Array[19100]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clampTest){
+  fill(&this->Array,TypeParam(-5.7));
+  TypeParam tmp(101.3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[354222], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  clamp(&this->Array,TypeParam(4.9),TypeParam(100.0));
+  EXPECT_FLOAT_EQ(TypeParam(4.9),this->Array[3435]);
+  EXPECT_FLOAT_EQ(TypeParam(100.0),this->Array[354222]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clamp_minTest){
+  fill(&this->Array,TypeParam(-5.7));
+  TypeParam tmp(-101.3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  clamp_min(&this->Array,TypeParam(-10.6));
+  EXPECT_FLOAT_EQ(TypeParam(-5.7),this->Array[28]);
+  EXPECT_FLOAT_EQ(TypeParam(-10.6),this->Array[91]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,clamp_maxTest){
+  fill(&this->Array,TypeParam(5.7));
+  TypeParam tmp(101.3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  clamp_max(&this->Array,TypeParam(10.6));
+  EXPECT_FLOAT_EQ(TypeParam(5.7),this->Array[28]);
+  EXPECT_FLOAT_EQ(TypeParam(10.6),this->Array[91]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,normalizeTest){
+  fill(&this->Array,TypeParam(50));
+  TypeParam tmp(-200);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(TypeParam(50)*TypeParam(110)/abs(TypeParam(-200)),this->Array[12345]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,shrink1Test){
+  fill(&this->Array,TypeParam(1.2));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/abs(TypeParam(1.2))*std::max(abs(TypeParam(1.2))-0.75,0.0),this->Array[125]);
+  fill(&this->Array,TypeParam(1));
+  shrink1(&this->Array,2.0);
+  EXPECT_FLOAT_EQ(0.0,this->Array[125]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,shrinkdTest){
+  fill(&this->Array,TypeParam(1.2));
+  fill(&this->Array2,TypeParam(4.0));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/TypeParam(4.0)*std::max(4.0-1.0,0.0),this->Array[125]);
+  shrinkd(&this->Array,&this->Array2,8.0);
+  EXPECT_FLOAT_EQ(0.0,this->Array[125]);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,realTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,imagTest){
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(0.0),imag(&this->Array)->at(125));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestReal,conjTest){ // conj() of a real-valued array must leave the values unchanged
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),conj(&this->Array)->at(125));
+}
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestCplx, cplxImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,fillTest){
+  fill(&this->Array,TypeParam(1.1,2.2));
+  EXPECT_FLOAT_EQ(1.1,real(TypeParam(this->Array[52323])));
+  EXPECT_FLOAT_EQ(2.2,imag(TypeParam(this->Array[52323])));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clearTest){
+  fill(&this->Array,TypeParam(1,1));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,real(TypeParam(this->Array[325])));
+  EXPECT_FLOAT_EQ(0,imag(TypeParam(this->Array[325])));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,absTest){
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(std::sqrt(5.5*5.5+7.7*7.7),abs(&this->Array)->at(32113));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,absSquareTest){
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(5.5*5.5+7.7*7.7,abs_square(&this->Array)->at(32113));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,sqrtTest){
+  fill(&this->Array,TypeParam(17.9,3.5));
+  EXPECT_NEAR(real(sqrt(TypeParam(17.9,3.5))),real(sqrt(&this->Array)->at(2131)),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(17.9,3.5))),imag(sqrt(&this->Array)->at(2131)),0.00001);
+  fill(&this->Array,TypeParam(3.14,4.13));
+  sqrt_inplace(&this->Array);
+  EXPECT_NEAR(real(sqrt(TypeParam(3.14,4.13))),real(this->Array[120000]),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(3.14,4.13))),imag(this->Array[120000]),0.00001);
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,squareTest){
+  fill(&this->Array,TypeParam(1.7,7.1));
+  EXPECT_FLOAT_EQ(real(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),real(square(&this->Array)->at(22123)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),imag(square(&this->Array)->at(22123)));
+  fill(&this->Array,TypeParam(31.4,4.31));
+  square_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),real(this->Array[51234]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),imag(this->Array[51234]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,reciprocalTest){
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.9,2.7)),real(reciprocal(&this->Array)->at(11232)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.9,2.7)),imag(reciprocal(&this->Array)->at(11232)));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.14,4.32)),real(this->Array[10]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.14,4.32)),imag(this->Array[10]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,reciprocal_sqrtTest){
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),real(reciprocal_sqrt(&this->Array)->at(12543)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),imag(reciprocal_sqrt(&this->Array)->at(12543)));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_sqrt_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),real(this->Array[10000]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),imag(this->Array[10000]));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,realImagTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(&this->Array)->at(33425));
+  EXPECT_FLOAT_EQ(4.2,imag(&this->Array)->at(45));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,conjTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(conj(&this->Array)->at(33425)));
+  EXPECT_FLOAT_EQ(-4.2,imag(conj(&this->Array)->at(45)));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,normalizeTest){
+  fill(&this->Array,TypeParam(50,50));
+  TypeParam tmp(-200,-200);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[23], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(real(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),real(&this->Array)->at(12345));
+  EXPECT_FLOAT_EQ(imag(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),imag(&this->Array)->at(12345));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clampTest){
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  TypeParam tmp(101.3,203.4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[354222], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));  
+  clamp(&this->Array,real(TypeParam(4.9,0)),real(TypeParam(100.0,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(4.9,0)),real(&this->Array)->at(3435));
+  EXPECT_FLOAT_EQ(real(TypeParam(100.0,0)),real(&this->Array)->at(354222));
+  EXPECT_FLOAT_EQ(imag(TypeParam(4.9,0)),imag(&this->Array)->at(3435));
+  EXPECT_FLOAT_EQ(imag(TypeParam(100.0,0)),imag(&this->Array)->at(354222));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clamp_minTest){
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  TypeParam tmp(-101.3,-203.4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));  
+  clamp_min(&this->Array, real(TypeParam(-10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(-5.7,0)),real(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(real(TypeParam(-10.6,0)),real(&this->Array)->at(91));
+  EXPECT_FLOAT_EQ(imag(TypeParam(-5.7,0)),imag(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(imag(TypeParam(-10.6,0)),imag(&this->Array)->at(91));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,clamp_maxTest){
+  fill(&this->Array,TypeParam(5.7, 4.6));
+  TypeParam tmp(101.3,203.4);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[91], &tmp, sizeof(TypeParam), cudaMemcpyHostToDevice));  
+  clamp_max(&this->Array,real(TypeParam(10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(5.7,0)),real(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(real(TypeParam(10.6,0)),real(&this->Array)->at(91));
+  EXPECT_FLOAT_EQ(imag(TypeParam(5.7,0)),imag(&this->Array)->at(28));
+  EXPECT_FLOAT_EQ(imag(TypeParam(10.6,0)),imag(&this->Array)->at(91));
+}
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx,shrink1Test){
+  fill(&this->Array,TypeParam(1.2,1.4));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),imag(&this->Array)->at(125));
+  fill(&this->Array,TypeParam(1,1));
+  shrink1(&this->Array,2.0);
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->at(23125));
+}
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestCplx4, cplxImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx4,shrinkdTest){ // expected: x/s*max(s-gamma,0), with s the real fill value of Array2
+  fill(&this->Array,TypeParam(1.2,1.4));
+  fill(&this->Array2,real(TypeParam(4.0,4.0)));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/real(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/real(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),imag(&this->Array)->at(125)); // divisor is Array2's real fill value
+  shrinkd(&this->Array,&this->Array2,8.0); // gamma above s, so max(s-gamma,0) collapses to zero
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->at(23125));
+}
+
+TYPED_TEST_CASE(cuNDArray_elemwise_TestCplx3, cplxtImplementations);
+
+TYPED_TEST(cuNDArray_elemwise_TestCplx3,realToCplxTest){
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(real_to_complex<TypeParam>(real(&this->Array).get())->at(33425)));
+  EXPECT_FLOAT_EQ(0.0,imag(real_to_complex<TypeParam>(real(&this->Array).get())->at(33425)));
+}
diff --git a/test/cuNDArray_operators_test.cpp b/test/cuNDArray_operators_test.cpp
new file mode 100644
index 0000000..de22a09
--- /dev/null
+++ b/test/cuNDArray_operators_test.cpp
@@ -0,0 +1,243 @@
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_operators_TestReal : public ::testing::Test { // fixture: two same-shaped 4D arrays
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_operators_TestCplx : public ::testing::Test { // fixture: two same-shaped 4D arrays
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents (note: 49 = 7*7 is not prime)
+    dims.assign(extents, extents + sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(cuNDArray_operators_TestReal, realImplementations);
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsAddTest1){ // array += array, spot-checked at a single element
+  const TypeParam lhs(46865.35435);
+  const TypeParam rhs(13784.34);
+  const unsigned int probe = 73243;
+  fill(&this->Array,lhs);
+  fill(&this->Array2,rhs);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(lhs+rhs,this->Array[probe]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4);
+  TypeParam v2 = TypeParam(2.2);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(98475334.34);
+  TypeParam v2 = TypeParam(2452.234);
+  unsigned int idx = 124999;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(4.4);
+  TypeParam v2 = TypeParam(9212.21);
+  unsigned int idx = 122131;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(342.145);
+  TypeParam v2 = TypeParam(43545.43);
+  unsigned int idx = 12344;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(43534.443);
+  TypeParam v2 = TypeParam(92.842);
+  unsigned int idx = 96735;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsDivideTest1){
+  TypeParam v1 = TypeParam(644.24);
+  TypeParam v2 = TypeParam(38564.64);
+  unsigned int idx = 98322;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array[idx]);
+}
+
+TYPED_TEST(cuNDArray_operators_TestReal,equalsDivideTest2){
+  TypeParam v1 = TypeParam(56342.24);
+  TypeParam v2 = TypeParam(23434.34);
+  unsigned int idx = 91;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array[idx]);
+}
+
+TYPED_TEST_CASE(cuNDArray_operators_TestCplx, cplxImplementations);
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsAddTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsAddTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsAddTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array += real(v2);
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsSubtractTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsSubtractTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsSubtractTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array -= real(v2);
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsMultiplyTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsMultiplyTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsMultiplyTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array *= real(v2);
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsDivideTest1){
+  TypeParam v1 = TypeParam(46865.35435, 534544.534523);
+  TypeParam v2 = TypeParam(13784.34, 54543543.1243);
+  unsigned int idx = 73243;
+  fill(&this->Array,v1);
+  fill(&this->Array2,v2);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsDivideTest2){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,3.23);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array[idx]));
+}
+
+TYPED_TEST(cuNDArray_operators_TestCplx,equalsDivideTest3){
+  TypeParam v1 = TypeParam(98.4, 45.34);
+  TypeParam v2 = TypeParam(2.2,0.0);
+  unsigned int idx = 1295;
+  fill(&this->Array,v1);
+  this->Array /= real(v2);
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array[idx]));
+}
diff --git a/test/cuNDArray_test.cpp b/test/cuNDArray_test.cpp
new file mode 100644
index 0000000..9e7b69f
--- /dev/null
+++ b/test/cuNDArray_test.cpp
@@ -0,0 +1,83 @@
+/*
+ * cuNDArray_test.cpp
+ *
+ *  Created on: Mar 1, 2013
+ *      Author: Dae
+ */
+
+
+#include "gtest/gtest.h"
+#include "cuNDArray.h"
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+
+template <typename T> class cuNDArray_Test : public ::testing::Test {
+	// Typed fixture: SetUp() gives every test two freshly constructed arrays of the same shape.
+	protected:
+	 virtual void SetUp() {
+		 // Odd, awkward extents on purpose (note that 49 = 7*7 is not actually prime).
+		 const unsigned int extents[] = {37, 49, 23, 19};
+		 const size_t rank = sizeof(extents)/sizeof(extents[0]);
+		 dims.assign(extents,extents+rank);
+		 Array =cuNDArray<T>(&dims);
+		 Array2 =cuNDArray<T>(&dims);
+	}
+	 std::vector<unsigned int> dims;
+	 cuNDArray<T> Array;
+	 cuNDArray<T> Array2;
+};
+
+typedef Types<float,double,float_complext,double_complext> Implementations; // element types the fixture is instantiated for
+// Registers cuNDArray_Test as a typed test case over the types listed in Implementations.
+TYPED_TEST_CASE(cuNDArray_Test, Implementations);
+
+TYPED_TEST(cuNDArray_Test,fillTest){ // fill() twice with different values, sampling one element each time
+	this->Array.fill(TypeParam(1));
+	const TypeParam afterFirstFill = this->Array.get_device_ptr()[5]; // NOTE(review): get_device_ptr() is indexed from host code - confirm this is valid
+	EXPECT_FLOAT_EQ(1,real(afterFirstFill));
+	this->Array.fill(TypeParam(27));
+	const TypeParam afterSecondFill = this->Array.get_device_ptr()[42];
+	EXPECT_FLOAT_EQ(27,real(afterSecondFill));
+}
+
+
+TYPED_TEST(cuNDArray_Test,clearTest){ // clear() must reset a previously filled element to zero
+	this->Array.fill(TypeParam(1));
+	const TypeParam beforeClear = this->Array.get_device_ptr()[5];
+	EXPECT_FLOAT_EQ(1,real(beforeClear));
+	this->Array.clear();
+	const TypeParam afterClear = this->Array.get_device_ptr()[5];
+	EXPECT_FLOAT_EQ(0,real(afterClear));
+}
+
+TYPED_TEST(cuNDArray_Test,equalsMultiplyTest){ // element-wise array *= array: 2 * 4 == 8
+	this->Array2.fill(TypeParam(4));
+	this->Array.fill(TypeParam(2));
+	this->Array *= this->Array2;
+	// One arbitrary element (index 105) is sampled, and only its real part is compared.
+	const TypeParam product = this->Array.get_device_ptr()[105];
+	EXPECT_FLOAT_EQ(8,real(product));
+}
+
+TYPED_TEST(cuNDArray_Test,absTest){ // abs() must leave +2.2 unchanged and turn -2.2 into +2.2
+	this->Array.fill(TypeParam(2.2));
+	this->Array.abs();
+	const TypeParam fromPositive = this->Array.get_device_ptr()[121];
+	EXPECT_FLOAT_EQ(real(fromPositive),2.2);
+	this->Array.fill(TypeParam(-2.2));
+	this->Array.abs();
+	const TypeParam fromNegative = this->Array.get_device_ptr()[121];
+	EXPECT_FLOAT_EQ(real(fromNegative),2.2);
+}
+
+
+TYPED_TEST(cuNDArray_Test,sqrtTest){ // element-wise sqrt() of a constant array
+	this->Array.fill(TypeParam(12.1));
+	this->Array.sqrt();
+	// 3.478505426 is the precomputed square root of 12.1.
+	const TypeParam root = this->Array.get_device_ptr()[121];
+	EXPECT_FLOAT_EQ(real(root),3.478505426);
+}
diff --git a/test/cuNDArray_utils_test.cpp b/test/cuNDArray_utils_test.cpp
new file mode 100644
index 0000000..e6c5e0e
--- /dev/null
+++ b/test/cuNDArray_utils_test.cpp
@@ -0,0 +1,241 @@
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+#include "vector_td_utilities.h"
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class cuNDArray_utils_TestReal : public ::testing::Test { // fixture used for the real-valued element types
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents on purpose (49 = 7*7 is not actually prime)
+    dims.assign(extents,extents+sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+template <typename T> class cuNDArray_utils_TestCplx : public ::testing::Test { // fixture used for the complex element types
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // odd, awkward extents on purpose (49 = 7*7 is not actually prime)
+    dims.assign(extents,extents+sizeof(extents)/sizeof(extents[0]));
+    Array = cuNDArray<T>(&dims);
+    Array2 = cuNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;
+  cuNDArray<T> Array;
+  cuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations; // real element types under test
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations; // complex element types; the std::complex variants are commented out
+// Instantiate the real-valued fixture for every type in realImplementations.
+TYPED_TEST_CASE(cuNDArray_utils_TestReal, realImplementations);
+
+TYPED_TEST(cuNDArray_utils_TestReal,permuteTest){ // permute() must relocate one marked element as expected
+
+  // Background of ones with a single marker value 2 written at linear index 37.
+  fill(&this->Array,TypeParam(1));
+  TypeParam marker(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &marker, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  // One order vector is reused and overwritten before each group of checks.
+  std::vector<size_t> order(4);
+
+  // Identity order: background and marker are expected to stay put.
+  order[0] = 0; order[1] = 1; order[2] = 2; order[3] = 3;
+  EXPECT_FLOAT_EQ(1, permute(&this->Array,&order)->at(0));
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(37));
+
+  // First two dimensions swapped: the marker is expected at index 1.
+  order[0] = 1; order[1] = 0; order[2] = 2; order[3] = 3;
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(1));
+
+  // First and last dimensions swapped: the marker is expected at index 19.
+  order[0] = 3; order[1] = 1; order[2] = 2; order[3] = 0;
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(19));
+
+  // Order (2,0,1,3): the marker is expected at index 851 (= 23*37).
+  order[0] = 2; order[1] = 0; order[2] = 1; order[3] = 3;
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(851));
+
+}
+
+TYPED_TEST(cuNDArray_utils_TestReal,shiftDimTest){ // shift_dim() must relocate one marked element as expected
+  // Background of ones with a single marker value 2 written at linear index 37.
+  fill(&this->Array,TypeParam(1));
+  TypeParam marker(2);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &marker, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  const size_t idx37x19 = 37*19; // expected marker position for shifts of -1 and of 3
+  EXPECT_FLOAT_EQ(1, shift_dim(&this->Array,0)->at(0));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,0)->at(37));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,1)->at(1));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,-1)->at(idx37x19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,2)->at(23*37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,3)->at(idx37x19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,4)->at(37)); // a shift by the full rank (4) is expected to act like no shift
+}
+
+TYPED_TEST(cuNDArray_utils_TestReal,sumTest){ // sum() along one dimension of a constant array == extent * value
+  const TypeParam value = TypeParam(12.34);
+  const unsigned int first = 0; // only the first element of each reduced array is checked
+
+  fill(&this->Array,value);
+  EXPECT_FLOAT_EQ(49*value,sum(&this->Array,1)->at(first)); // dimension 1 has extent 49
+
+  fill(&this->Array,value);
+  EXPECT_FLOAT_EQ(23*value,sum(&this->Array,2)->at(first)); // dimension 2 has extent 23
+
+  fill(&this->Array,value);
+  EXPECT_FLOAT_EQ(19*value,sum(&this->Array,3)->at(first)); // dimension 3 has extent 19
+}
+
+
+TYPED_TEST(cuNDArray_utils_TestReal,meanTest){
+  // The mean of an array filled with one constant must be that constant.
+  // No element index is needed here: mean() yields a single scalar.
+  const TypeParam v1 = TypeParam(12.34);
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(v1,mean(&this->Array));
+}
+TYPED_TEST_CASE(cuNDArray_utils_TestCplx, cplxImplementations); // instantiate the complex fixture for every type in cplxImplementations
+
+
+
+TYPED_TEST(cuNDArray_utils_TestCplx,meanTest){
+  // The mean of an array filled with one constant must be that constant
+  // (only the real part is compared; the constant has no imaginary part set).
+  const TypeParam v1 = TypeParam(12.34);
+
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(v1),real(mean(&this->Array)));
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,permuteTest){ // permute() must relocate one marked complex element as expected
+
+  // Background value (1,1) everywhere, with a single marker (2,3) written at linear index 37.
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam marker(2,3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &marker, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  // One order vector is reused and overwritten before each group of checks.
+  std::vector<size_t> order(4);
+
+  // Identity order: background and marker are expected to stay put.
+  order[0] = 0; order[1] = 1; order[2] = 2; order[3] = 3;
+  EXPECT_FLOAT_EQ(1, real(permute(&this->Array,&order)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(permute(&this->Array,&order)->at(0)));
+
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(37)));
+
+  // First two dimensions swapped: the marker is expected at index 1.
+  order[0] = 1; order[1] = 0; order[2] = 2; order[3] = 3;
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(1)));
+
+  // First and last dimensions swapped: the marker is expected at index 19.
+  order[0] = 3; order[1] = 1; order[2] = 2; order[3] = 0;
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(19)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(19)));
+
+  // Order (2,0,1,3): the marker is expected at index 851 (= 23*37).
+  order[0] = 2; order[1] = 0; order[2] = 1; order[3] = 3;
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(851)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(851)));
+
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,shiftDimTest){ // shift_dim() must relocate one marked complex element as expected
+  // Background value (1,1) everywhere, with a single marker (2,3) written at linear index 37.
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam marker(2,3);
+  CUDA_CALL(cudaMemcpy(&this->Array.get_data_ptr()[37], &marker, sizeof(TypeParam), cudaMemcpyHostToDevice));
+
+  const size_t idx37x19 = 37*19; // expected marker position for shifts of -1 and of 3
+  EXPECT_FLOAT_EQ(1, real(shift_dim(&this->Array,0)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(shift_dim(&this->Array,0)->at(0)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,0)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,0)->at(37)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,1)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,1)->at(1)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,-1)->at(idx37x19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,-1)->at(idx37x19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,2)->at(23*37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,2)->at(23*37*19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,3)->at(idx37x19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,3)->at(idx37x19)));
+
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,4)->at(37))); // a shift by the full rank (4) is expected to act like no shift
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,4)->at(37)));
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,sumTest){ // sum() along one dimension of a constant array == extent * value
+  const TypeParam value = TypeParam(12.34, 56.78);
+  const unsigned int first = 0; // only the first element of each reduced array is checked
+
+  fill(&this->Array,value); // dimension 1 has extent 49
+  EXPECT_FLOAT_EQ(real(TypeParam(49)*value),real(sum(&this->Array,1)->at(first)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(49)*value),imag(sum(&this->Array,1)->at(first)));
+
+  fill(&this->Array,value); // dimension 2 has extent 23
+  EXPECT_FLOAT_EQ(real(TypeParam(23)*value),real(sum(&this->Array,2)->at(first)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(23)*value),imag(sum(&this->Array,2)->at(first)));
+
+  fill(&this->Array,value); // dimension 3 has extent 19
+  EXPECT_FLOAT_EQ(real(TypeParam(19)*value),real(sum(&this->Array,3)->at(first)));
+  EXPECT_FLOAT_EQ(imag(TypeParam(19)*value),imag(sum(&this->Array,3)->at(first)));
+}
+
+TYPED_TEST(cuNDArray_utils_TestCplx,padTest){ // pad a constant array to twice its extent in all four dimensions
+  TypeParam v1 = TypeParam(12.34, 56.78);
+  const double scale = 16.0; // 2^4: each of the four extents doubles, so the element count grows 16x
+
+  fill(&this->Array,v1);
+
+  vector_td<size_t,4> size = from_std_vector<size_t,4>(this->dims);
+  size *= 2;
+
+  boost::shared_ptr<cuNDArray<TypeParam> > out = pad<TypeParam,4>(size,&this->Array);
+
+  // Element counts are compared as integers rather than through a double.
+  EXPECT_EQ(this->Array.get_number_of_elements()*static_cast<size_t>(scale),out->get_number_of_elements());
+  EXPECT_FLOAT_EQ(real(mean(out.get()))*scale,real(mean(&this->Array))); // NOTE(review): presumes the padding adds nothing to the sum - confirm against pad()
+  EXPECT_FLOAT_EQ(imag(mean(out.get()))*scale,imag(mean(&this->Array)));
+}
+
+
+TEST(padTest,largeSize){
+	// Kept as a guard because pad() apparently failed for large arrays; exercises a big 3D case.
+	std::vector<size_t> smallDims(2,192);
+	smallDims.push_back(50);               // source extents: 192 x 192 x 50
+	std::vector<size_t> largeDims(3,256);  // target extents: 256 x 256 x 256
+
+	cuNDArray<float_complext> in(&smallDims);
+	cuNDArray<float_complext> out(&largeDims);
+	fill(&in,float_complext(1));
+
+	pad<float_complext,3>(&in,&out);
+
+	// The 2-norm of the padded result is expected to equal that of the input.
+	EXPECT_FLOAT_EQ(nrm2(&in),nrm2(&out));
+
+}
diff --git a/test/cuVector_td_test_kernels.cu b/test/cuVector_td_test_kernels.cu
new file mode 100644
index 0000000..c10485d
--- /dev/null
+++ b/test/cuVector_td_test_kernels.cu
@@ -0,0 +1,237 @@
+#include "cuVector_td_test_kernels.h"
+#include "check_CUDA.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray.h"
+#include "cudaDeviceManager.h"
+#include "thrust/device_vector.h"
+
+
+using namespace Gadgetron;
+template<class T, unsigned int D> __global__ void abs_kernel(vector_td<T,D>* data, unsigned int size){ // data[i] = abs(data[i]) for every i < size
+	 const unsigned int idx = (blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x; // flattened thread index; unsigned to match `size`
+	 if (idx < size) data[idx] = abs(data[idx]); // threads past the end of the array do nothing
+}
+
+
+template<class T, unsigned int D> void Gadgetron::test_abs(cuNDArray< vector_td<T,D> >* data){ // replaces every element of *data by abs(element) using abs_kernel
+	const unsigned int n = (unsigned int)data->get_number_of_elements();
+	if (n == 0) return; // nothing to do; also avoids a zero-sized launch and a division by zero below
+	dim3 dimBlock(std::min((unsigned int)cudaDeviceManager::Instance()->max_blockdim(),n)); // threads per block, capped by the per-block limit rather than the grid limit
+	abs_kernel<<<dim3((n+dimBlock.x-1)/dimBlock.x),dimBlock>>>(data->get_data_ptr(),n); // ceil(n/threads) blocks; NOTE(review): assumes that count fits in gridDim.x
+	cudaDeviceSynchronize(); // non-deprecated equivalent of cudaThreadSynchronize()
+	CHECK_FOR_CUDA_ERROR();
+}
+
+
+template<typename T, unsigned int D>
+struct test_norm_functor : public thrust::unary_function<vector_td<T,D>,T> // template arguments are <argument type, result type>
+{
+ __host__ __device__ T operator()(const vector_td<T,D> &x) const {return norm(x);}
+};
+template<class T, unsigned int D> thrust::device_vector<T> Gadgetron::test_norm(cuNDArray< vector_td<T,D> >* data){
+	// Returns one norm(x) value per element x of *data, computed with thrust::transform.
+	thrust::device_vector<T> out(data->get_number_of_elements());
+	thrust::transform(data->begin(),data->end(),out.begin(),test_norm_functor<T,D>());
+	cudaDeviceSynchronize(); // non-deprecated equivalent of cudaThreadSynchronize()
+	CHECK_FOR_CUDA_ERROR();
+	return out;
+}
+
+
+
+template<typename T, unsigned int D>
+struct test_min_functor : public thrust::unary_function<vector_td<T,D>,T> // template arguments are <argument type, result type>
+{
+ __host__ __device__ T operator()(const vector_td<T,D> &x) const {return min(x);}
+};
+template<class T, unsigned int D> thrust::device_vector<T> Gadgetron::test_min(cuNDArray< vector_td<T,D> >* data){
+	// Returns one min(x) value per element x of *data, computed with thrust::transform.
+	thrust::device_vector<T> out(data->get_number_of_elements());
+	thrust::transform(data->begin(),data->end(),out.begin(),test_min_functor<T,D>());
+	cudaDeviceSynchronize(); // non-deprecated equivalent of cudaThreadSynchronize()
+	CHECK_FOR_CUDA_ERROR();
+	return out;
+}
+
+
+template<typename T, unsigned int D>
+struct test_max_functor : public thrust::unary_function<vector_td<T,D>,T> // template arguments are <argument type, result type>
+{
+ __host__ __device__ T operator()(const vector_td<T,D> &x) const {return max(x);}
+};
+template<class T, unsigned int D> thrust::device_vector<T> Gadgetron::test_max(cuNDArray< vector_td<T,D> >* data){
+	// Returns one max(x) value per element x of *data, computed with thrust::transform.
+	thrust::device_vector<T> out(data->get_number_of_elements());
+	thrust::transform(data->begin(),data->end(),out.begin(),test_max_functor<T,D>());
+	cudaDeviceSynchronize(); // non-deprecated equivalent of cudaThreadSynchronize()
+	CHECK_FOR_CUDA_ERROR();
+	return out;
+}
+
+template<typename T, unsigned int D>
+struct test_amin_functor : public thrust::binary_function<vector_td<T,D>, vector_td<T,D>, vector_td<T,D> >
+{
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &lhs, const vector_td<T,D> &rhs) const {return amin(lhs,rhs);}
+};
+// Pairs up the elements of *data1 and *data2 and stores amin(a,b) for each pair in a new array shaped like *data1.
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amin(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2){
+	typedef cuNDArray<vector_td<T,D> > array_type;
+	boost::shared_ptr<array_type> result(new array_type(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),data2->begin(),result->begin(),test_amin_functor<T,D>());
+	return result;
+}
+
+
+template<typename T, unsigned int D>
+struct test_amax_functor : public thrust::binary_function<vector_td<T,D>, vector_td<T,D>, vector_td<T,D> >
+{
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &lhs, const vector_td<T,D> &rhs) const {return amax(lhs,rhs);}
+};
+// Pairs up the elements of *data1 and *data2 and stores amax(a,b) for each pair in a new array shaped like *data1.
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amax(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2){
+	typedef cuNDArray<vector_td<T,D> > array_type;
+	boost::shared_ptr<array_type> result(new array_type(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),data2->begin(),result->begin(),test_amax_functor<T,D>());
+	return result;
+}
+
+template<typename T, unsigned int D>
+class test_amin2_functor : public thrust::unary_function<vector_td<T,D>, vector_td<T,D> > // applies amin(x,val) with a fixed scalar val
+{
+public:
+	test_amin2_functor(T scalar): val(scalar){}
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &element) const {return amin(element,val);}
+	T val;
+};
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amin2(cuNDArray< vector_td<T,D> >* data1, T val){
+	typedef cuNDArray<vector_td<T,D> > array_type; // result has the same dimensions as *data1
+	boost::shared_ptr<array_type> result(new array_type(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),result->begin(),test_amin2_functor<T,D>(val));
+	return result;
+}
+
+
+template<typename T, unsigned int D>
+class test_amax2_functor : public thrust::unary_function<vector_td<T,D>, vector_td<T,D> > // applies amax(x,val) with a fixed scalar val
+{
+public:
+	test_amax2_functor(T scalar): val(scalar){}
+	__host__ __device__ vector_td<T,D> operator()(const vector_td<T,D> &element) const {return amax(element,val);}
+	T val;
+};
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > Gadgetron::test_amax2(cuNDArray< vector_td<T,D> >* data1, T val){
+	typedef cuNDArray<vector_td<T,D> > array_type; // result has the same dimensions as *data1
+	boost::shared_ptr<array_type> result(new array_type(data1->get_dimensions()));
+	thrust::transform(data1->begin(),data1->end(),result->begin(),test_amax2_functor<T,D>(val));
+	return result;
+}
+
+
+
+template<class T, unsigned int D> void Gadgetron::vector_fill(cuNDArray< vector_td<T,D> >* data,  vector_td<T,D> val){ // sets every element of *data to val
+	thrust::fill(data->begin(),data->end(),val);
+}
+
+
+template void Gadgetron::test_abs<float,1>(cuNDArray< vector_td<float,1> > *); // explicit instantiations of test_abs: float/double, D = 1..4
+template void Gadgetron::test_abs<float,2>(cuNDArray< vector_td<float,2> > *);
+template  void Gadgetron::test_abs<float,3>(cuNDArray< vector_td<float,3> > *);
+template  void Gadgetron::test_abs<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  void Gadgetron::test_abs<double,1>(cuNDArray< vector_td<double,1> > *);
+template void Gadgetron::test_abs<double,2>(cuNDArray< vector_td<double,2> > *);
+template void Gadgetron::test_abs<double,3>(cuNDArray< vector_td<double,3> > *);
+template void Gadgetron::test_abs<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+template thrust::device_vector<float> Gadgetron::test_norm<float,1>(cuNDArray< vector_td<float,1> > *); // explicit instantiations of test_norm: float/double, D = 1..4
+template thrust::device_vector<float> Gadgetron::test_norm<float,2>(cuNDArray< vector_td<float,2> > *);
+template  thrust::device_vector<float> Gadgetron::test_norm<float,3>(cuNDArray< vector_td<float,3> > *);
+template  thrust::device_vector<float> Gadgetron::test_norm<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  thrust::device_vector<double> Gadgetron::test_norm<double,1>(cuNDArray< vector_td<double,1> > *);
+template thrust::device_vector<double> Gadgetron::test_norm<double,2>(cuNDArray< vector_td<double,2> > *);
+template thrust::device_vector<double> Gadgetron::test_norm<double,3>(cuNDArray< vector_td<double,3> > *);
+template thrust::device_vector<double> Gadgetron::test_norm<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+template thrust::device_vector<float> Gadgetron::test_min<float,1>(cuNDArray< vector_td<float,1> > *); // explicit instantiations of test_min: float/double, D = 1..4
+template thrust::device_vector<float> Gadgetron::test_min<float,2>(cuNDArray< vector_td<float,2> > *);
+template  thrust::device_vector<float> Gadgetron::test_min<float,3>(cuNDArray< vector_td<float,3> > *);
+template  thrust::device_vector<float> Gadgetron::test_min<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  thrust::device_vector<double> Gadgetron::test_min<double,1>(cuNDArray< vector_td<double,1> > *);
+template thrust::device_vector<double> Gadgetron::test_min<double,2>(cuNDArray< vector_td<double,2> > *);
+template thrust::device_vector<double> Gadgetron::test_min<double,3>(cuNDArray< vector_td<double,3> > *);
+template thrust::device_vector<double> Gadgetron::test_min<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+template thrust::device_vector<float> Gadgetron::test_max<float,1>(cuNDArray< vector_td<float,1> > *); // explicit instantiations of test_max: float/double, D = 1..4
+template thrust::device_vector<float> Gadgetron::test_max<float,2>(cuNDArray< vector_td<float,2> > *);
+template  thrust::device_vector<float> Gadgetron::test_max<float,3>(cuNDArray< vector_td<float,3> > *);
+template  thrust::device_vector<float> Gadgetron::test_max<float,4>(cuNDArray< vector_td<float,4> > *);
+
+template  thrust::device_vector<double> Gadgetron::test_max<double,1>(cuNDArray< vector_td<double,1> > *);
+template thrust::device_vector<double> Gadgetron::test_max<double,2>(cuNDArray< vector_td<double,2> > *);
+template thrust::device_vector<double> Gadgetron::test_max<double,3>(cuNDArray< vector_td<double,3> > *);
+template thrust::device_vector<double> Gadgetron::test_max<double,4>(cuNDArray< vector_td<double,4> > *);
+
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amin<float,1>(cuNDArray< vector_td<float,1> > *,cuNDArray< vector_td<float,1> > *); // explicit instantiations of test_amin: float/double, D = 1..4
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amin<float,2>(cuNDArray< vector_td<float,2> > *, cuNDArray< vector_td<float,2> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amin<float,3>(cuNDArray< vector_td<float,3> > *, cuNDArray< vector_td<float,3> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amin<float,4>(cuNDArray< vector_td<float,4> > *, cuNDArray< vector_td<float,4> > *);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amin<double,1>(cuNDArray< vector_td<double,1> > *, cuNDArray< vector_td<double,1> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amin<double,2>(cuNDArray< vector_td<double,2> > *, cuNDArray< vector_td<double,2> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amin<double,3>(cuNDArray< vector_td<double,3> > *, cuNDArray< vector_td<double,3> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amin<double,4>(cuNDArray< vector_td<double,4> > *, cuNDArray< vector_td<double,4> > *);
+
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amin2<float,1>(cuNDArray< vector_td<float,1> > *, float ); // explicit instantiations of test_amin2: float/double, D = 1..4
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amin2<float,2>(cuNDArray< vector_td<float,2> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amin2<float,3>(cuNDArray< vector_td<float,3> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amin2<float,4>(cuNDArray< vector_td<float,4> > *, float);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amin2<double,1>(cuNDArray< vector_td<double,1> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amin2<double,2>(cuNDArray< vector_td<double,2> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amin2<double,3>(cuNDArray< vector_td<double,3> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amin2<double,4>(cuNDArray< vector_td<double,4> > *, double);
+
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amax<float,1>(cuNDArray< vector_td<float,1> > *,cuNDArray< vector_td<float,1> > *); // explicit instantiations of test_amax: float/double, D = 1..4
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amax<float,2>(cuNDArray< vector_td<float,2> > *, cuNDArray< vector_td<float,2> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amax<float,3>(cuNDArray< vector_td<float,3> > *, cuNDArray< vector_td<float,3> > *);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amax<float,4>(cuNDArray< vector_td<float,4> > *, cuNDArray< vector_td<float,4> > *);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amax<double,1>(cuNDArray< vector_td<double,1> > *, cuNDArray< vector_td<double,1> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amax<double,2>(cuNDArray< vector_td<double,2> > *, cuNDArray< vector_td<double,2> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amax<double,3>(cuNDArray< vector_td<double,3> > *, cuNDArray< vector_td<double,3> > *);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amax<double,4>(cuNDArray< vector_td<double,4> > *, cuNDArray< vector_td<double,4> > *);
+
+
+template boost::shared_ptr<cuNDArray<vector_td<float,1> > > Gadgetron::test_amax2<float,1>(cuNDArray< vector_td<float,1> > *, float ); // explicit instantiations of test_amax2: float/double, D = 1..4
+template boost::shared_ptr<cuNDArray<vector_td<float,2> > > Gadgetron::test_amax2<float,2>(cuNDArray< vector_td<float,2> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,3> > > Gadgetron::test_amax2<float,3>(cuNDArray< vector_td<float,3> > *, float);
+template  boost::shared_ptr<cuNDArray<vector_td<float,4> > > Gadgetron::test_amax2<float,4>(cuNDArray< vector_td<float,4> > *, float);
+
+template  boost::shared_ptr<cuNDArray<vector_td<double,1> > > Gadgetron::test_amax2<double,1>(cuNDArray< vector_td<double,1> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,2> > > Gadgetron::test_amax2<double,2>(cuNDArray< vector_td<double,2> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,3> > > Gadgetron::test_amax2<double,3>(cuNDArray< vector_td<double,3> > *, double);
+template boost::shared_ptr<cuNDArray<vector_td<double,4> > > Gadgetron::test_amax2<double,4>(cuNDArray< vector_td<double,4> > *, double);
+
+
+
+template void Gadgetron::vector_fill<float,1>(cuNDArray< vector_td<float,1> > *, vector_td<float,1>); // explicit instantiations of vector_fill: float/double, D = 1..4
+template void Gadgetron::vector_fill<float,2>(cuNDArray< vector_td<float,2> > *, vector_td<float,2>);
+template void Gadgetron::vector_fill<float,3>(cuNDArray< vector_td<float,3> > *, vector_td<float,3>);
+template void Gadgetron::vector_fill<float,4>(cuNDArray< vector_td<float,4> > *, vector_td<float,4>);
+
+
+template void Gadgetron::vector_fill<double,1>(cuNDArray< vector_td<double,1> > *, vector_td<double,1>);
+template void Gadgetron::vector_fill<double,2>(cuNDArray< vector_td<double,2> > *, vector_td<double,2>);
+template void Gadgetron::vector_fill<double,3>(cuNDArray< vector_td<double,3> > *, vector_td<double,3>);
+template void Gadgetron::vector_fill<double,4>(cuNDArray< vector_td<double,4> > *, vector_td<double,4>);
diff --git a/test/cuVector_td_test_kernels.h b/test/cuVector_td_test_kernels.h
new file mode 100644
index 0000000..07280fb
--- /dev/null
+++ b/test/cuVector_td_test_kernels.h
@@ -0,0 +1,18 @@
+#pragma once
+#include "vector_td.h"
+#include "cuNDArray.h"
+#include "thrust/device_vector.h"
+namespace Gadgetron{
+// Host-callable helpers for the vector_td tests; the definitions live in cuVector_td_test_kernels.cu.
+template<class T, unsigned int D> void vector_fill(cuNDArray< vector_td<T,D> >* data,  vector_td<T,D> val); // sets every element of *data to val
+template<class T, unsigned int D> void test_abs(cuNDArray< vector_td<T,D> >* data); // in place: element = abs(element)
+template<class T, unsigned int D> thrust::device_vector<T> test_norm(cuNDArray< vector_td<T,D> >* data); // one norm(x) per element
+template<class T, unsigned int D> thrust::device_vector<T> test_min(cuNDArray< vector_td<T,D> >* data); // one min(x) per element
+
+template<class T, unsigned int D> thrust::device_vector<T> test_max(cuNDArray< vector_td<T,D> >* data); // one max(x) per element
+// The remaining helpers return a newly allocated array with the dimensions of their first argument.
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amax(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2); // amax(a,b) per element pair
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amin(cuNDArray< vector_td<T,D> >* data1, cuNDArray< vector_td<T,D> >* data2); // amin(a,b) per element pair
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amin2(cuNDArray< vector_td<T,D> >* data, T val); // amin(x,val) per element
+template<class T, unsigned int D> boost::shared_ptr<cuNDArray<vector_td<T,D> > > test_amax2(cuNDArray< vector_td<T,D> >* data, T val); // amax(x,val) per element
+}
diff --git a/test/hoCuGTBLAS_test.cpp b/test/hoCuGTBLAS_test.cpp
new file mode 100644
index 0000000..78cdce7
--- /dev/null
+++ b/test/hoCuGTBLAS_test.cpp
@@ -0,0 +1,80 @@
+/*
+ * hoCuGTBLAS_test.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+#include "gtest/gtest.h"
+#include "hoCuNDArray_blas.h"
+#include <vector>
+using namespace Gadgetron;
+using testing::Types;
+template <typename T> class hoCuGTBLAS_Test : public ::testing::Test {
+	// Typed fixture: SetUp() gives every test two freshly constructed arrays of the same shape.
+	protected:
+	 virtual void SetUp() {
+		 // Odd, awkward extents on purpose (note that 49 = 7*7 is not actually prime).
+		 const unsigned int extents[] = {37, 49, 23, 19};
+		 const size_t rank = sizeof(extents)/sizeof(extents[0]);
+		 dims.assign(extents,extents+rank);
+		 Array =hoCuNDArray<T>(&dims);
+		 Array2 =hoCuNDArray<T>(&dims);
+	}
+	 std::vector<unsigned int> dims;
+	 hoCuNDArray<T> Array;
+	 hoCuNDArray<T> Array2;
+};
+
+typedef Types<float,double,float_complext,double_complext> Implementations; // element types the fixture is instantiated for
+// Registers hoCuGTBLAS_Test as a typed test case over the types listed in Implementations.
+TYPED_TEST_CASE(hoCuGTBLAS_Test, Implementations);
+
+
+TYPED_TEST(hoCuGTBLAS_Test,dotTest){ // dot() of two constant arrays == product of the constants times the element count
+	this->Array.fill(TypeParam(1));
+	this->Array2.fill(TypeParam(2));
+
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array))); // 1*1 per element
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2))); // 1*2 per element
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,axpyTest){ // axpy(a,x,y) is expected to leave a*x + y in y
+	this->Array2.fill(TypeParam(97));  // y
+	this->Array.fill(TypeParam(71));   // x
+	axpy(TypeParam(11),&this->Array,&this->Array2);
+
+	// 11*71 + 97 = 878, sampled at one arbitrary element of y.
+	const TypeParam updated = this->Array2.get_data_ptr()[10];
+	EXPECT_FLOAT_EQ(878,real(updated));
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,nrm2Test){ // nrm2() of a constant array c is expected to be sqrt(c*c*N), N = element count
+	this->Array.fill(TypeParam(1));
+	EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array)); // c = 1
+	this->Array.fill(TypeParam(3));
+	EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array)); // c = 3
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,asumTest){ // asum() of a constant array c is expected to be |c|*N, N = element count
+	this->Array.fill(TypeParam(1));
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+	this->Array.fill(TypeParam(-3)); // the sign must not matter
+	EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(hoCuGTBLAS_Test,aminTest){ // amin() is expected to return the index of the smallest-magnitude element
+	this->Array.fill(TypeParam(100));
+	this->Array.get_data_ptr()[23]=TypeParam(-50); // magnitude 50 < 100
+	EXPECT_EQ(23,amin(&this->Array));
+	this->Array.get_data_ptr()[48]=TypeParam(2); // magnitude 2 < 50, so the answer moves to 48
+	EXPECT_EQ(48,amin(&this->Array));
+
+}
+TYPED_TEST(hoCuGTBLAS_Test,amaxTest){ // amax() is expected to return the index of the largest-magnitude element
+	this->Array.fill(TypeParam(1));
+	this->Array.get_data_ptr()[23]=TypeParam(2); // magnitude 2 > 1
+	EXPECT_EQ(23,amax(&this->Array));
+	this->Array.get_data_ptr()[48]=TypeParam(-50); // magnitude 50 > 2 despite the negative sign
+	EXPECT_EQ(48,amax(&this->Array));
+
+}
diff --git a/test/hoCuNDArray_elemwise_test.cpp b/test/hoCuNDArray_elemwise_test.cpp
new file mode 100644
index 0000000..d09ca8c
--- /dev/null
+++ b/test/hoCuNDArray_elemwise_test.cpp
@@ -0,0 +1,144 @@
+#include "hoCuNDArray_blas.h"
+#include "hoCuNDArray_elemwise.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class hoCuNDArray_blas_Real : public ::testing::Test { // fixture used for the real-valued element types
+protected:
+  virtual void SetUp() {
+    const unsigned int extents[] = {37, 49, 23, 19}; // odd, awkward extents on purpose (49 = 7*7 is not actually prime)
+    dims.assign(extents,extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoCuNDArray<T>(&dims);
+    Array2 = hoCuNDArray<T>(&dims);
+  }
+  std::vector<unsigned int> dims;
+  hoCuNDArray<T> Array;
+  hoCuNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations; // real element types under test
+// Instantiate the real-valued fixture for every type in realImplementations.
+TYPED_TEST_CASE(hoCuNDArray_blas_Real, realImplementations);
+
+TYPED_TEST(hoCuNDArray_blas_Real,dotTest){ // dot() of two constant arrays == product of the constants times the element count
+  fill(&this->Array,TypeParam(1));
+  fill(&this->Array2,TypeParam(2));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array))); // 1*1 per element
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2))); // 1*2 per element
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,axpyTest){ // axpy(a,x,y) is expected to leave a*x + y in y
+  fill(&this->Array2,TypeParam(97)); // y
+  fill(&this->Array,TypeParam(71));  // x
+  axpy(TypeParam(11),&this->Array,&this->Array2);
+  const TypeParam updated = this->Array2.get_data_ptr()[10]; // one arbitrary element of y
+  EXPECT_FLOAT_EQ(878,real(updated)); // 11*71 + 97 = 878
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,nrm2Test){ // nrm2() of a constant array c is expected to be sqrt(c*c*N), N = element count
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array)); // c = 1
+  fill(&this->Array,TypeParam(3));
+  EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array)); // c = 3
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,asumTest){ // asum() of a constant array c is expected to be |c|*N, N = element count
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+  fill(&this->Array,TypeParam(-3)); // the sign must not matter
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,aminTest){ // amin() is expected to return the index of the smallest-magnitude element
+  fill(&this->Array,TypeParam(100));
+  this->Array.get_data_ptr()[23]=TypeParam(-50); // magnitude 50 < 100
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2); // magnitude 2 < 50, so the answer moves to 48
+  EXPECT_EQ(48,amin(&this->Array));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Real,amaxTest){ // amax() is expected to return the index of the largest-magnitude element
+  fill(&this->Array,TypeParam(1));
+  this->Array.get_data_ptr()[23]=TypeParam(2); // magnitude 2 > 1
+  EXPECT_EQ(23,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(-50); // magnitude 50 > 2 despite the negative sign
+  EXPECT_EQ(48,amax(&this->Array));
+}
+
+
+template <typename T> class hoCuNDArray_blas_Cplx : public ::testing::Test { // fixture used for the complex element types
+protected:
+  virtual void SetUp() {
+    const unsigned int extents[] = {37, 49}; // two odd extents, 37*49 = 1813 elements (49 = 7*7 is not actually prime)
+    dims.assign(extents,extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoCuNDArray<T>(&dims);
+    Array2 = hoCuNDArray<T>(&dims);
+  }
+  std::vector<unsigned int> dims;
+  hoCuNDArray<T> Array;
+  hoCuNDArray<T> Array2;
+};
+
+typedef Types<float_complext, double_complext> cplxImplementations; // complex element types under test
+// Instantiate the complex fixture for every type in cplxImplementations.
+TYPED_TEST_CASE(hoCuNDArray_blas_Cplx, cplxImplementations);
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,dotTest){ // expected values conjugate the first operand: conj(a)*b summed over all elements
+  fill(&this->Array,TypeParam(1,1));
+  fill(&this->Array2,TypeParam(2,2));
+  const TypeParam selfDot1 = dot(&this->Array,&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements(),real(selfDot1));
+  EXPECT_FLOAT_EQ(0,imag(selfDot1)); // a vector dotted with itself must be purely real
+  const TypeParam selfDot2 = dot(&this->Array2,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(2,-2)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(selfDot2));
+  EXPECT_FLOAT_EQ(0,imag(selfDot2));
+  const TypeParam crossDot = dot(&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(crossDot));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),imag(crossDot));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,axpyTest){ // axpy(a,x,y) is expected to leave a*x + y in y
+  const TypeParam x(71.1,23.3);
+  const TypeParam y(97.9,654.2);
+  fill(&this->Array,x);
+  fill(&this->Array2,y);
+  axpy(TypeParam(11.4),&this->Array,&this->Array2);
+  EXPECT_FLOAT_EQ(real(x*TypeParam(11.4)+y),real(this->Array2.get_data_ptr()[546])); // one arbitrary element of y
+  EXPECT_FLOAT_EQ(imag(x*TypeParam(11.4)+y),imag(this->Array2.get_data_ptr()[546]));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,nrm2Test){ // nrm2() of a constant array c is expected to be sqrt(conj(c)*c*N), N = element count
+  fill(&this->Array,TypeParam(1,1));
+  EXPECT_FLOAT_EQ(std::sqrt(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3.24,7.4));
+  // There will be rounding errors from the sum, so loosen comparison
+  EXPECT_NEAR(std::sqrt(real(TypeParam(3.24,-7.4)*TypeParam(3.24,7.4))*this->Array.get_number_of_elements()),nrm2(&this->Array),0.001);
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,asumTest){ // each element (-3,1) is expected to contribute |-3| + |1| = 4
+  fill(&this->Array,TypeParam(-3,1));
+  EXPECT_NEAR(4*this->Array.get_number_of_elements(),asum(&this->Array),0.0001);
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,aminTest){ // the expected indices correspond to ranking elements by |re| + |im|
+  fill(&this->Array,TypeParam(100,101)); // |re|+|im| = 201
+  this->Array.get_data_ptr()[23]=TypeParam(-50,-51); // 101: new minimum
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2,100); // 102: still larger than 101
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[1000]=TypeParam(-2,-76); // 78: new minimum
+  EXPECT_EQ(1000,amin(&this->Array));
+}
+
+TYPED_TEST(hoCuNDArray_blas_Cplx,amaxTest){ // the expected indices correspond to ranking elements by |re| + |im|
+  fill(&this->Array,TypeParam(1,1)); // |re|+|im| = 2
+  this->Array.get_data_ptr()[768]=TypeParam(4,4); // 8: new maximum
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(6,1); // 7: still smaller than 8
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[999]=TypeParam(-3,-6); // 9: new maximum
+  EXPECT_EQ(999,amax(&this->Array));
+}
diff --git a/test/hoNDArray_blas_test.cpp b/test/hoNDArray_blas_test.cpp
new file mode 100644
index 0000000..ae99f14
--- /dev/null
+++ b/test/hoNDArray_blas_test.cpp
@@ -0,0 +1,144 @@
+#include "hoNDArray_blas.h"
+#include "hoNDArray_elemwise.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <class T> class hoNDArray_blas_Real : public ::testing::Test { // fixture: two equally shaped 4D arrays of T
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations; // element types the real-valued suite is instantiated for
+
+TYPED_TEST_CASE(hoNDArray_blas_Real, realImplementations); // binds the fixture to that type list
+
+TYPED_TEST(hoNDArray_blas_Real,dotTest){ // dot product of constant-filled arrays
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(dot(&this->Array,&this->Array))); // 1*1 summed over every element
+  fill(&this->Array2,TypeParam(2));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*2,real(dot(&this->Array,&this->Array2))); // 1*2 summed over every element
+}
+
+TYPED_TEST(hoNDArray_blas_Real,axpyTest){ // expects Array2 to hold 11*71 + 97 = 878 afterwards
+  fill(&this->Array2,TypeParam(97));
+  fill(&this->Array,TypeParam(71));
+  axpy(TypeParam(11),&this->Array,&this->Array2);
+  TypeParam val(this->Array2.get_data_ptr()[10]); // inputs are constant, so one sample is representative
+  EXPECT_FLOAT_EQ(878,real(val));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,nrm2Test){ // nrm2 of a constant array is expected to equal sqrt(N * c^2)
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(std::sqrt((double)this->Array.get_number_of_elements()),nrm2(&this->Array));
+  fill(&this->Array,TypeParam(3));
+  EXPECT_FLOAT_EQ(std::sqrt(3.0*3.0*this->Array.get_number_of_elements()),nrm2(&this->Array));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,asumTest){ // asum is expected to sum absolute values
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements(),real(asum(&this->Array)));
+  fill(&this->Array,TypeParam(-3)); // negative input: the expected total is still positive
+  EXPECT_FLOAT_EQ(this->Array.get_number_of_elements()*3,real(asum(&this->Array)));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,aminTest){ // amin should return the index of the smallest absolute value
+  fill(&this->Array,TypeParam(100));
+  this->Array.get_data_ptr()[23]=TypeParam(-50);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2); // |2| < |-50|, so index 48 takes over even though -50 is the smaller signed value
+  EXPECT_EQ(48,amin(&this->Array));
+}
+
+TYPED_TEST(hoNDArray_blas_Real,amaxTest){ // amax should return the index of the largest absolute value
+  fill(&this->Array,TypeParam(1));
+  this->Array.get_data_ptr()[23]=TypeParam(2);
+  EXPECT_EQ(23,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(-50); // |-50| > |2|, so the negative entry becomes the maximum
+  EXPECT_EQ(48,amax(&this->Array));
+}
+
+
+template <class T> class hoNDArray_blas_Cplx : public ::testing::Test { // fixture: two equally shaped 2D arrays of T
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations; // complex element types the suite is instantiated for
+
+TYPED_TEST_CASE(hoNDArray_blas_Cplx, cplxImplementations); // binds the fixture to that type list
+
+TYPED_TEST(hoNDArray_blas_Cplx,dotTest){ // complex dot product; expectations conjugate the first operand
+  fill(&this->Array,TypeParam(1,1));
+  TypeParam res = dot(&this->Array,&this->Array);
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements(),real(res)); // conj(z)*z per element
+  EXPECT_FLOAT_EQ(0,imag(res)); // self dot product is purely real
+  fill(&this->Array2,TypeParam(2,2));
+  res = dot(&this->Array2,&this->Array2);
+  EXPECT_FLOAT_EQ(real(TypeParam(2,-2)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(0,imag(res));
+  res = dot(&this->Array,&this->Array2); // mixed operands: conj(1+i)*(2+2i) per element
+  EXPECT_FLOAT_EQ(real(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),real(res));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,-1)*TypeParam(2,2))*this->Array.get_number_of_elements(),imag(res));
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,axpyTest){ // expects Array2 to become 11.4*Array + Array2
+  TypeParam wanted = TypeParam(71.1,23.3)*TypeParam(11.4)+TypeParam(97.9,654.2); // reference value, computed per element type
+  fill(&this->Array2,TypeParam(97.9,654.2));
+  fill(&this->Array,TypeParam(71.1,23.3));
+  axpy(TypeParam(11.4),&this->Array,&this->Array2);
+  TypeParam got(this->Array2.get_data_ptr()[546]); // inputs are constant, so one sample is representative
+  EXPECT_FLOAT_EQ(real(wanted),real(got));
+  EXPECT_FLOAT_EQ(imag(wanted),imag(got));
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,nrm2Test){ // nrm2 of a constant array is expected to equal sqrt(N * |z|^2)
+  fill(&this->Array,TypeParam(1,1));
+  EXPECT_FLOAT_EQ(std::sqrt(real(TypeParam(1,-1)*TypeParam(1,1))*this->Array.get_number_of_elements()),nrm2(&this->Array)); // conj(z)*z supplies |z|^2
+  fill(&this->Array,TypeParam(3.24,7.4));
+  // There will be rounding errors from the sum, so loosen comparison
+  EXPECT_NEAR(std::sqrt(real(TypeParam(3.24,-7.4)*TypeParam(3.24,7.4))*this->Array.get_number_of_elements()),nrm2(&this->Array),0.001);
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,asumTest){ // asum of a constant complex array
+  fill(&this->Array,TypeParam(-3,1));
+  EXPECT_NEAR(4*this->Array.get_number_of_elements(),asum(&this->Array),0.0001); // 4 per element matches |re|+|im| = 3+1, not the modulus sqrt(10)
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,aminTest){ // amin should return the index of the smallest-magnitude element
+  fill(&this->Array,TypeParam(100,101));
+  this->Array.get_data_ptr()[23]=TypeParam(-50,-51);
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(2,100); // |re|+|im| = 102, larger than 101 at index 23
+  EXPECT_EQ(23,amin(&this->Array));
+  this->Array.get_data_ptr()[1000]=TypeParam(-2,-76); // |re|+|im| = 78 < 101, but modulus 76.0 > 71.4
+  EXPECT_EQ(1000,amin(&this->Array)); // NOTE(review): this expectation only holds for an |re|+|im| ranking, not the modulus - confirm against amin
+}
+
+TYPED_TEST(hoNDArray_blas_Cplx,amaxTest){ // amax should return the index of the largest-magnitude element
+  fill(&this->Array,TypeParam(1,1));
+  this->Array.get_data_ptr()[768]=TypeParam(4,4);
+  EXPECT_EQ(768,amax(&this->Array));
+  this->Array.get_data_ptr()[48]=TypeParam(6,1); // |re|+|im| = 7 < 8, although its modulus 6.08 exceeds 5.66
+  EXPECT_EQ(768,amax(&this->Array)); // NOTE(review): this expectation only holds for an |re|+|im| ranking - confirm against amax
+  this->Array.get_data_ptr()[999]=TypeParam(-3,-6); // |re|+|im| = 9, the new maximum
+  EXPECT_EQ(999,amax(&this->Array));
+}
diff --git a/test/hoNDArray_elemwise_test.cpp b/test/hoNDArray_elemwise_test.cpp
new file mode 100644
index 0000000..e904e54
--- /dev/null
+++ b/test/hoNDArray_elemwise_test.cpp
@@ -0,0 +1,379 @@
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <class T> class hoNDArray_elemwise_TestReal : public ::testing::Test { // fixture: two equally shaped 4D arrays of T
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <class T> class hoNDArray_elemwise_TestCplx : public ::testing::Test { // fixture: two equally shaped 4D arrays of T
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <class T> class hoNDArray_elemwise_TestCplx2 : public ::testing::Test { // fixture: two equally shaped 4D arrays of T
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <class T> class hoNDArray_elemwise_TestCplx3 : public ::testing::Test { // fixture: two equally shaped 4D arrays of T
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <class T> class hoNDArray_elemwise_TestCplx4 : public ::testing::Test { // fixture: a complex array plus a same-shaped array of its real type
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<typename realType<T>::Type>(&dims);
+  }
+  std::vector<size_t> dims;   // shape shared by both arrays
+  hoNDArray<T> Array;
+  hoNDArray<typename realType<T>::Type> Array2; // element type is realType<T>::Type rather than T
+};
+
+typedef Types<float, double> realImplementations; // real element types
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations; // all four complex element types
+typedef Types<std::complex<float>, std::complex<double> > stdCplxImplementations; // std::complex only
+typedef Types<float_complext, double_complext> cplxtImplementations; // complext only
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestReal, realImplementations); // binds the real fixture to its type list
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,fillTest){ // fill() should store the given constant; spot-checked at one element per fill
+  fill(&this->Array,TypeParam(1.1));
+  EXPECT_FLOAT_EQ(1.1,TypeParam(this->Array.get_data_ptr()[5]));
+  fill(&this->Array,TypeParam(27.45)); // a second fill must overwrite the first
+  EXPECT_FLOAT_EQ(27.45,TypeParam(this->Array.get_data_ptr()[3242]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clearTest){ // clear() should zero a previously filled array
+  fill(&this->Array,TypeParam(1));
+  EXPECT_FLOAT_EQ(1,TypeParam(this->Array.get_data_ptr()[5324])); // precondition: the element is non-zero before clearing
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,TypeParam(this->Array.get_data_ptr()[5324]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,absTest){ // absolute value, out-of-place (abs) and in-place (abs_inplace)
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array.get_data_ptr()[13])); // precondition: the input is negative
+  EXPECT_FLOAT_EQ(TypeParam(5.5),TypeParam(abs(&this->Array)->get_data_ptr()[13])); // the result is read from the returned array
+  fill(&this->Array,TypeParam(-1.3));
+  EXPECT_FLOAT_EQ(TypeParam(-1.3),TypeParam(this->Array.get_data_ptr()[2454]));
+  abs_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(1.3),TypeParam(this->Array.get_data_ptr()[2454])); // the in-place variant modifies Array itself
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,absSquareTest){ // abs_square() of a negative constant should give its square
+  fill(&this->Array,TypeParam(-5.5));
+  EXPECT_FLOAT_EQ(TypeParam(-5.5),TypeParam(this->Array.get_data_ptr()[13])); // precondition check
+  EXPECT_FLOAT_EQ(TypeParam(-5.5*-5.5),TypeParam(abs_square(&this->Array)->get_data_ptr()[13]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,sqrtTest){ // element-wise square root, compared with std::sqrt of the fill value
+  fill(&this->Array,TypeParam(17.9));
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(17.9)),TypeParam(sqrt(&this->Array)->get_data_ptr()[23433])); // out-of-place variant
+  fill(&this->Array,TypeParam(3.14));
+  sqrt_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(std::sqrt(TypeParam(3.14)),TypeParam(this->Array.get_data_ptr()[32343]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,squareTest){ // element-wise square, compared with x*x computed in TypeParam
+  fill(&this->Array,TypeParam(1.7));
+  EXPECT_FLOAT_EQ(TypeParam(1.7)*TypeParam(1.7),TypeParam(square(&this->Array)->get_data_ptr()[22542])); // out-of-place variant
+  fill(&this->Array,TypeParam(31.4));
+  square_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(TypeParam(31.4)*TypeParam(31.4),TypeParam(this->Array.get_data_ptr()[652252]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,reciprocalTest){ // element-wise 1/x
+  fill(&this->Array,TypeParam(11.7));
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(11.7),TypeParam(reciprocal(&this->Array)->get_data_ptr()[45452])); // out-of-place variant
+  fill(&this->Array,TypeParam(314.114));
+  reciprocal_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(TypeParam(1)/TypeParam(314.114),TypeParam(this->Array.get_data_ptr()[43432]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,reciprocal_sqrtTest){ // element-wise 1/sqrt(x)
+  fill(&this->Array,TypeParam(1.9));
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.9)),TypeParam(reciprocal_sqrt(&this->Array)->get_data_ptr()[12345])); // out-of-place variant
+  fill(&this->Array,TypeParam(1.14));
+  reciprocal_sqrt_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(TypeParam(1)/std::sqrt(TypeParam(1.14)),TypeParam(this->Array.get_data_ptr()[0]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,sgnTest){ // sign function: expects -1, 1 and 0 for negative, positive and zero input
+  fill(&this->Array,TypeParam(-5.7));
+  this->Array.get_data_ptr()[91] = TypeParam(101.1); // one positive entry
+  this->Array.get_data_ptr()[19100] = TypeParam(0); // one zero entry
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(sgn(&this->Array)->get_data_ptr()[28])); // out-of-place variant; sgn() is re-evaluated for each check
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(sgn(&this->Array)->get_data_ptr()[91]));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(sgn(&this->Array)->get_data_ptr()[19100]));
+  fill(&this->Array,TypeParam(-5.7)); // reset before exercising the in-place variant
+  this->Array.get_data_ptr()[9100] = TypeParam(101.1);
+  this->Array.get_data_ptr()[19100] = TypeParam(0);
+  sgn_inplace(&this->Array);
+  EXPECT_FLOAT_EQ(TypeParam(-1),TypeParam(this->Array.get_data_ptr()[2800]));
+  EXPECT_FLOAT_EQ(TypeParam(1),TypeParam(this->Array.get_data_ptr()[9100]));
+  EXPECT_FLOAT_EQ(TypeParam(0),TypeParam(this->Array.get_data_ptr()[19100]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clampTest){ // clamp() to [4.9, 100.0]: values outside are pulled to the nearer bound
+  fill(&this->Array,TypeParam(-5.7)); // below the lower bound
+  this->Array.get_data_ptr()[354222] = TypeParam(101.3); // above the upper bound
+  clamp(&this->Array,TypeParam(4.9),TypeParam(100.0));
+  EXPECT_FLOAT_EQ(TypeParam(4.9),this->Array.get_data_ptr()[3435]);
+  EXPECT_FLOAT_EQ(TypeParam(100.0),this->Array.get_data_ptr()[354222]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clamp_minTest){ // clamp_min() raises only the values below the bound
+  fill(&this->Array,TypeParam(-5.7)); // already above -10.6: must be left alone
+  this->Array.get_data_ptr()[91] = TypeParam(-101.3); // below -10.6: must be raised
+  clamp_min(&this->Array,TypeParam(-10.6));
+  EXPECT_FLOAT_EQ(TypeParam(-5.7),this->Array.get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(TypeParam(-10.6),this->Array.get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,clamp_maxTest){ // clamp_max() lowers only the values above the bound
+  fill(&this->Array,TypeParam(5.7)); // already below 10.6: must be left alone
+  this->Array.get_data_ptr()[91] = TypeParam(101.3); // above 10.6: must be lowered
+  clamp_max(&this->Array,TypeParam(10.6));
+  EXPECT_FLOAT_EQ(TypeParam(5.7),this->Array.get_data_ptr()[28]);
+  EXPECT_FLOAT_EQ(TypeParam(10.6),this->Array.get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,normalizeTest){ // expectation: every element is scaled by 110 / (largest magnitude in the array)
+  fill(&this->Array,TypeParam(50));
+  this->Array.get_data_ptr()[23]=TypeParam(-200); // single outlier that supplies the largest magnitude
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(TypeParam(50)*TypeParam(110)/std::abs(TypeParam(-200)),this->Array.get_data_ptr()[12345]); // std::abs: never the C int abs(int) overload
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,shrink1Test){ // expectation: x/|x| * max(|x| - gamma, 0)
+  fill(&this->Array,TypeParam(1.2));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/std::abs(TypeParam(1.2))*std::max(std::abs(TypeParam(1.2))-0.75,0.0),this->Array.get_data_ptr()[125]); // std::abs keeps 1.2 from being truncated to 1 by int abs(int)
+  fill(&this->Array,TypeParam(1));
+  shrink1(&this->Array,2.0); // threshold larger than |x|
+  EXPECT_FLOAT_EQ(0.0,this->Array.get_data_ptr()[125]); // so the value shrinks all the way to zero
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,shrinkdTest){ // expectation: x/s * max(s - gamma, 0), with s taken from Array2
+  fill(&this->Array,TypeParam(1.2));
+  fill(&this->Array2,TypeParam(4.0));
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(TypeParam(1.2)/TypeParam(4.0)*std::max(4.0-1.0,0.0),this->Array.get_data_ptr()[125]);
+  shrinkd(&this->Array,&this->Array2,8.0); // gamma larger than s (4.0): max(4-8, 0) = 0
+  EXPECT_FLOAT_EQ(0.0,this->Array.get_data_ptr()[125]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,realTest){ // real() of a real-valued array should return the values unchanged
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,imagTest){ // imag() of a real-valued array should be zero
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(0.0),imag(&this->Array)->at(125));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestReal,conjTest){ // NOTE(review): despite the name, conj() is never called; this repeats the real()/imag() checks
+  fill(&this->Array,TypeParam(1.2));
+  EXPECT_FLOAT_EQ(TypeParam(1.2),real(&this->Array)->at(125));
+  EXPECT_FLOAT_EQ(TypeParam(0.0),imag(&this->Array)->at(125));
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx, cplxImplementations); // complex suite: runs for std::complex and complext, float and double
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,fillTest){ // fill() should store both the real and the imaginary part
+  fill(&this->Array,TypeParam(1.1,2.2));
+  EXPECT_FLOAT_EQ(1.1,real(TypeParam(this->Array.get_data_ptr()[52323])));
+  EXPECT_FLOAT_EQ(2.2,imag(TypeParam(this->Array.get_data_ptr()[52323])));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clearTest){ // clear() should zero both components
+  fill(&this->Array,TypeParam(1,1));
+  clear(&this->Array);
+  EXPECT_FLOAT_EQ(0,real(TypeParam(this->Array.get_data_ptr()[325])));
+  EXPECT_FLOAT_EQ(0,imag(TypeParam(this->Array.get_data_ptr()[325])));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,absTest){ // abs() of a complex array should give the modulus sqrt(re^2 + im^2)
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(std::sqrt(5.5*5.5+7.7*7.7),abs(&this->Array)->get_data_ptr()[32113]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,absSquareTest){ // abs_square() should give re^2 + im^2
+  fill(&this->Array,TypeParam(-5.5,7.7));
+  EXPECT_FLOAT_EQ(5.5*5.5+7.7*7.7,abs_square(&this->Array)->get_data_ptr()[32113]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,sqrtTest){ // complex square root, compared with the scalar sqrt of the fill value
+  fill(&this->Array,TypeParam(17.9,3.5));
+  EXPECT_NEAR(real(sqrt(TypeParam(17.9,3.5))),real(sqrt(&this->Array)->get_data_ptr()[2131]),0.00001); // out-of-place variant, absolute tolerance 1e-5
+  EXPECT_NEAR(imag(sqrt(TypeParam(17.9,3.5))),imag(sqrt(&this->Array)->get_data_ptr()[2131]),0.00001);
+  fill(&this->Array,TypeParam(3.14,4.13));
+  sqrt_inplace(&this->Array); // in-place variant
+  EXPECT_NEAR(real(sqrt(TypeParam(3.14,4.13))),real(this->Array.get_data_ptr()[120000]),0.00001);
+  EXPECT_NEAR(imag(sqrt(TypeParam(3.14,4.13))),imag(this->Array.get_data_ptr()[120000]),0.00001);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,squareTest){ // complex square z*z (not |z|^2), both components checked
+  fill(&this->Array,TypeParam(1.7,7.1));
+  EXPECT_FLOAT_EQ(real(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),real(square(&this->Array)->get_data_ptr()[22123])); // out-of-place variant
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.7,7.1)*TypeParam(1.7,7.1)),imag(square(&this->Array)->get_data_ptr()[22123]));
+  fill(&this->Array,TypeParam(31.4,4.31));
+  square_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(real(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),real(this->Array.get_data_ptr()[51234]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(31.4,4.31)*TypeParam(31.4,4.31)),imag(this->Array.get_data_ptr()[51234]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,reciprocalTest){ // complex reciprocal 1/z, both components checked
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.9,2.7)),real(reciprocal(&this->Array)->get_data_ptr()[11232])); // out-of-place variant
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.9,2.7)),imag(reciprocal(&this->Array)->get_data_ptr()[11232]));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/TypeParam(1.14,4.32)),real(this->Array.get_data_ptr()[10]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/TypeParam(1.14,4.32)),imag(this->Array.get_data_ptr()[10]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,reciprocal_sqrtTest){ // complex 1/sqrt(z), both components checked
+  fill(&this->Array,TypeParam(1.9,2.7));
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),real(reciprocal_sqrt(&this->Array)->get_data_ptr()[12543])); // out-of-place variant
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.9,2.7))),imag(reciprocal_sqrt(&this->Array)->get_data_ptr()[12543]));
+  fill(&this->Array,TypeParam(1.14,4.32));
+  reciprocal_sqrt_inplace(&this->Array); // in-place variant
+  EXPECT_FLOAT_EQ(real(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),real(this->Array.get_data_ptr()[10000]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(1,0)/sqrt(TypeParam(1.14,4.32))),imag(this->Array.get_data_ptr()[10000]));
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,realImagTest){ // real() and imag() should extract the respective components
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(&this->Array)->get_data_ptr()[33425]);
+  EXPECT_NEAR(4.2,imag(&this->Array)->get_data_ptr()[45], 0.000001); // absolute tolerance instead of a ULP-based comparison
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,conjTest){ // conj() should keep the real part and negate the imaginary part
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(conj(&this->Array)->at(33425)));
+  EXPECT_NEAR(-4.2,imag(conj(&this->Array)->at(45)), 0.000001);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,normalizeTest){ // expectation: every element is scaled by 110 / (largest modulus in the array)
+  fill(&this->Array,TypeParam(50,50));
+  this->Array.get_data_ptr()[23]=TypeParam(-200,-200); // single outlier that supplies the largest modulus
+  normalize(&this->Array,110);
+  EXPECT_FLOAT_EQ(real(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),real(&this->Array)->get_data_ptr()[12345]); // real(TypeParam(110,110)) is just the scalar 110 in the element's real type
+  EXPECT_FLOAT_EQ(imag(TypeParam(50,50)*real(TypeParam(110,110))/abs(TypeParam(-200,-200))),imag(&this->Array)->get_data_ptr()[12345]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clampTest){ // clamp() on complex data with real bounds [4.9, 100.0]
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  this->Array.get_data_ptr()[354222] = TypeParam(101.3,203.4);
+  clamp(&this->Array,real(TypeParam(4.9,0)),real(TypeParam(100.0,0))); // bounds are passed in the element's real type
+  EXPECT_FLOAT_EQ(real(TypeParam(4.9,0)),real(&this->Array)->get_data_ptr()[3435]);
+  EXPECT_FLOAT_EQ(real(TypeParam(100.0,0)),real(&this->Array)->get_data_ptr()[354222]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(4.9,0)),imag(&this->Array)->get_data_ptr()[3435]); // imaginary part is expected to be 0 afterwards
+  EXPECT_FLOAT_EQ(imag(TypeParam(100.0,0)),imag(&this->Array)->get_data_ptr()[354222]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clamp_minTest){ // clamp_min() on complex data with a real lower bound
+  fill(&this->Array,TypeParam(-5.7, -4.6));
+  this->Array.get_data_ptr()[91] = TypeParam(-101.3, -203.4);
+  clamp_min(&this->Array, real(TypeParam(-10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(-5.7,0)),real(&this->Array)->get_data_ptr()[28]); // real part above the bound is kept
+  EXPECT_FLOAT_EQ(real(TypeParam(-10.6,0)),real(&this->Array)->get_data_ptr()[91]); // real part below the bound is raised
+  EXPECT_FLOAT_EQ(imag(TypeParam(-5.7,0)),imag(&this->Array)->get_data_ptr()[28]); // imaginary part expected to be 0 even for the element that was not clamped
+  EXPECT_FLOAT_EQ(imag(TypeParam(-10.6,0)),imag(&this->Array)->get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,clamp_maxTest){ // clamp_max() on complex data with a real upper bound
+  fill(&this->Array,TypeParam(5.7, 4.6));
+  this->Array.get_data_ptr()[91] = TypeParam(101.3, 203.4);
+  clamp_max(&this->Array,real(TypeParam(10.6,0)));
+  EXPECT_FLOAT_EQ(real(TypeParam(5.7,0)),real(&this->Array)->get_data_ptr()[28]); // real part below the bound is kept
+  EXPECT_FLOAT_EQ(real(TypeParam(10.6,0)),real(&this->Array)->get_data_ptr()[91]); // real part above the bound is lowered
+  EXPECT_FLOAT_EQ(imag(TypeParam(5.7,0)),imag(&this->Array)->get_data_ptr()[28]); // imaginary part expected to be 0 even for the element that was not clamped
+  EXPECT_FLOAT_EQ(imag(TypeParam(10.6,0)),imag(&this->Array)->get_data_ptr()[91]);
+}
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx,shrink1Test){ // expectation: z/|z| * max(|z| - gamma, 0)
+  fill(&this->Array,TypeParam(1.2,1.4));
+  shrink1(&this->Array,0.75);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/abs(TypeParam(1.2,1.4)))*std::max(abs(TypeParam(1.2,1.4))-0.75,0.0),imag(&this->Array)->get_data_ptr()[125]);
+  fill(&this->Array,TypeParam(1,1));
+  shrink1(&this->Array,2.0); // gamma larger than |z| = sqrt(2): result should be zero
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->get_data_ptr()[23125]); // a different element is sampled for the imaginary part
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx4, cplxImplementations); // suite for the fixture whose Array2 holds the real type of T
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx4,shrinkdTest){ // expectation: z/s * max(s - gamma, 0), with real-valued s taken from Array2
+  fill(&this->Array,TypeParam(1.2,1.4));
+  fill(&this->Array2,real(TypeParam(4.0,4.0))); // Array2 is real-valued, so only the scalar 4.0 is stored
+  shrinkd(&this->Array,&this->Array2,1.0);
+  EXPECT_FLOAT_EQ(real(TypeParam(1.2,1.4)/real(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(imag(TypeParam(1.2,1.4)/imag(TypeParam(4.0,4.0)))*std::max(4.0-1.0,0.0),imag(&this->Array)->get_data_ptr()[125]); // divisor uses imag(...) here, which is also 4.0
+  shrinkd(&this->Array,&this->Array2,8.0); // gamma larger than s: max(4-8, 0) = 0
+  EXPECT_FLOAT_EQ(0.0,real(&this->Array)->get_data_ptr()[125]);
+  EXPECT_FLOAT_EQ(0.0,imag(&this->Array)->get_data_ptr()[23125]); // a different element is sampled for the imaginary part
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx2, stdCplxImplementations); // std::complex<float> and std::complex<double> only
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx2,realToCplxTest){ // real_to_complex() should keep the real value and set the imaginary part to 0
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425])); // input is the real part extracted from Array
+  EXPECT_FLOAT_EQ(0.0,imag(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425]));
+}
+
+TYPED_TEST_CASE(hoNDArray_elemwise_TestCplx3, cplxtImplementations); // float_complext and double_complext only
+
+TYPED_TEST(hoNDArray_elemwise_TestCplx3,realToCplxTest){ // real_to_complex() should keep the real value and set the imaginary part to 0
+  fill(&this->Array,TypeParam(3.4,4.2));
+  EXPECT_FLOAT_EQ(3.4,real(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425])); // input is the real part extracted from Array
+  EXPECT_FLOAT_EQ(0.0,imag(real_to_complex<TypeParam>(real(&this->Array).get())->get_data_ptr()[33425]));
+}
diff --git a/test/hoNDArray_operators_test.cpp b/test/hoNDArray_operators_test.cpp
new file mode 100644
index 0000000..02739b9
--- /dev/null
+++ b/test/hoNDArray_operators_test.cpp
@@ -0,0 +1,250 @@
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <class T> class hoNDArray_operators_TestReal : public ::testing::Test { // fixture: a 4D array plus a 2D array matching its two leading extents
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    const size_t extents2[] = {37, 49}; // fewer dimensions, to exercise batch mode
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    dims2.assign(extents2, extents2+sizeof(extents2)/sizeof(extents2[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims2);
+  }
+  std::vector<size_t> dims;    // shape of Array
+  std::vector<size_t> dims2;   // shape of Array2
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+template <class T> class hoNDArray_operators_TestCplx : public ::testing::Test { // fixture: a 4D array plus a 2D array matching its two leading extents
+protected:
+  virtual void SetUp() {
+    const size_t extents[] = {37, 49, 23, 19}; // deliberately irregular, non-power-of-two extents
+    const size_t extents2[] = {37, 49}; // fewer dimensions, to exercise batch mode
+    dims.assign(extents, extents+sizeof(extents)/sizeof(extents[0]));
+    dims2.assign(extents2, extents2+sizeof(extents2)/sizeof(extents2[0]));
+    Array = hoNDArray<T>(&dims);
+    Array2 = hoNDArray<T>(&dims2);
+  }
+  std::vector<size_t> dims;    // shape of Array
+  std::vector<size_t> dims2;   // shape of Array2
+  hoNDArray<T> Array;
+  hoNDArray<T> Array2;
+};
+
+typedef Types<float, double> realImplementations; // real element types
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations; // complex element types
+
+TYPED_TEST_CASE(hoNDArray_operators_TestReal, realImplementations); // binds the real fixture to its type list
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsAddTest1){ // array += lower-dimensional array
+  unsigned int idx = 73243; // element inspected after the update
+  TypeParam v2(13784.34);
+  TypeParam v1(46865.35435);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsAddTest2){ // array += scalar
+  unsigned int idx = 12295; // element inspected after the update
+  TypeParam v2(2.2);
+  TypeParam v1(98.4);
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(v1+v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsSubtractTest1){ // array -= lower-dimensional array
+  unsigned int idx = 124999; // element inspected after the update
+  TypeParam v2(2452.234);
+  TypeParam v1(98475334.34);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsSubtractTest2){ // array -= scalar
+  unsigned int idx = 122131; // element inspected after the update
+  TypeParam v2(9212.21);
+  TypeParam v1(4.4);
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(v1-v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsMultiplyTest1){ // array *= lower-dimensional array
+  unsigned int idx = 12344; // element inspected after the update
+  TypeParam v2(43545.43);
+  TypeParam v1(342.145);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsMultiplyTest2){ // array *= scalar
+  unsigned int idx = 96735; // element inspected after the update
+  TypeParam v2(92.842);
+  TypeParam v1(43534.443);
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(v1*v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsDivideTest1){ // array /= lower-dimensional array
+  unsigned int idx = 98322; // element inspected after the update
+  TypeParam v2(38564.64);
+  TypeParam v1(644.24);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST(hoNDArray_operators_TestReal,equalsDivideTest2){ // array /= scalar
+  unsigned int idx = 12591; // element inspected after the update
+  TypeParam v2(23434.34);
+  TypeParam v1(56342.24);
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(v1/v2,this->Array.get_data_ptr()[idx]);
+}
+
+TYPED_TEST_CASE(hoNDArray_operators_TestCplx, cplxImplementations); // complex suite: runs for std::complex and complext, float and double
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest1){ // complex array += lower-dimensional complex array
+  unsigned int idx = 73243; // element inspected after the update
+  TypeParam v2(13784.34, 54543543.1243);
+  TypeParam v1(46865.35435, 534544.534523);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array += this->Array2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest2){ // complex array += complex scalar
+  unsigned int idx = 12925; // element inspected after the update
+  TypeParam v2(2.2,3.23);
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array += v2;
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsAddTest3){ // complex array += real scalar
+  unsigned int idx = 12295; // element inspected after the update
+  TypeParam v2(2.2,0.0); // zero imaginary part, so v1+v2 is also the expected result of adding real(v2)
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array += real(v2);
+  EXPECT_FLOAT_EQ(real(v1+v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1+v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest1){ // complex array -= lower-dimensional complex array
+  unsigned int idx = 73243; // element inspected after the update
+  TypeParam v2(13784.34, 54543543.1243);
+  TypeParam v1(46865.35435, 534544.534523);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array -= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest2){ // complex array -= complex scalar
+  unsigned int idx = 12925; // element inspected after the update
+  TypeParam v2(2.2,3.23);
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array -= v2;
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsSubtractTest3){ // complex array -= real scalar
+  unsigned int idx = 12925; // element inspected after the update
+  TypeParam v2(2.2,0.0); // zero imaginary part, so v1-v2 is also the expected result of subtracting real(v2)
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array -= real(v2);
+  EXPECT_FLOAT_EQ(real(v1-v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1-v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest1){ // complex array *= lower-dimensional complex array
+  unsigned int idx = 73243; // element inspected after the update
+  TypeParam v2(13784.34, 54543543.1243);
+  TypeParam v1(46865.35435, 534544.534523);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array *= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest2){ // complex array *= complex scalar
+  unsigned int idx = 12925; // element inspected after the update
+  TypeParam v2(2.2,3.23);
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array *= v2;
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsMultiplyTest3){ // complex array *= real scalar
+  unsigned int idx = 12295; // element inspected after the update
+  TypeParam v2(2.2,0.0); // zero imaginary part, so v1*v2 is also the expected result of scaling by real(v2)
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array *= real(v2);
+  EXPECT_FLOAT_EQ(real(v1*v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1*v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest1){ // complex array /= lower-dimensional complex array
+  unsigned int idx = 73243; // element inspected after the update
+  TypeParam v2(13784.34, 54543543.1243);
+  TypeParam v1(46865.35435, 534544.534523);
+  fill(&this->Array2,v2);
+  fill(&this->Array,v1);
+  this->Array /= this->Array2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest2){ // complex array /= complex scalar
+  unsigned int idx = 12295; // element inspected after the update
+  TypeParam v2(2.2,3.23);
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array /= v2;
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
+TYPED_TEST(hoNDArray_operators_TestCplx,equalsDivideTest3){ // complex array /= real scalar
+  unsigned int idx = 12295; // element inspected after the update
+  TypeParam v2(2.2,0.0); // zero imaginary part, so v1/v2 is also the expected result of dividing by real(v2)
+  TypeParam v1(98.4, 45.34);
+  fill(&this->Array,v1);
+  this->Array /= real(v2);
+  EXPECT_FLOAT_EQ(real(v1/v2),real(this->Array.get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(v1/v2),imag(this->Array.get_data_ptr()[idx]));
+}
+
diff --git a/test/hoNDArray_utils_test.cpp b/test/hoNDArray_utils_test.cpp
new file mode 100644
index 0000000..0ad32dc
--- /dev/null
+++ b/test/hoNDArray_utils_test.cpp
@@ -0,0 +1,173 @@
+#include "hoNDArray_utils.h"
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <vector>
+
+using namespace Gadgetron;
+using testing::Types;
+
+template <typename T> class hoNDArray_utils_TestReal : public ::testing::Test { // fixture used with the real-valued element types
+protected:
+  virtual void SetUp() { // builds a four-dimensional array with the extents listed below
+    size_t vdims[] = {37, 49, 23, 19}; // Awkward, mutually distinct sizes (37, 23 and 19 are prime; 49 = 7*7 is not)
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims; // extents handed to the array constructor
+  hoNDArray<T> Array; // array under test
+};
+
+template <typename T> class hoNDArray_utils_TestCplx : public ::testing::Test { // fixture used with the complex element types
+protected:
+  virtual void SetUp() { // builds a four-dimensional array with the extents listed below
+    size_t vdims[] = {37, 49, 23, 19}; // Awkward, mutually distinct sizes (37, 23 and 19 are prime; 49 = 7*7 is not)
+    dims = std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+    Array = hoNDArray<T>(&dims);
+  }
+  std::vector<size_t> dims; // extents handed to the array constructor
+  hoNDArray<T> Array; // array under test
+};
+
+typedef Types<float, double> realImplementations;
+typedef Types</*std::complex<float>, std::complex<double>,*/ float_complext, double_complext> cplxImplementations;
+
+TYPED_TEST_CASE(hoNDArray_utils_TestReal, realImplementations);
+
+TYPED_TEST(hoNDArray_utils_TestReal,permuteTest){
+  // Marks one element and checks at which linear index it is found after permuting the dimensions.
+  fill(&this->Array,TypeParam(1));
+  // First order: the identity permutation.
+  std::vector<size_t> order;
+  order.push_back(0); order.push_back(1); order.push_back(2); order.push_back(3);
+  // With extents {37,49,23,19} linear index 37 is coordinate (0,1,0,0).
+  this->Array.get_data_ptr()[37] = TypeParam(2);
+  // Under the identity order nothing is expected to move.
+  EXPECT_FLOAT_EQ(1, permute(&this->Array,&order)->at(0));
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(37));
+
+  order.clear();
+  order.push_back(1); order.push_back(0); order.push_back(2); order.push_back(3);
+  // Order {1,0,2,3}: the marked element is expected at index 1.
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(1));
+
+  order.clear();
+  order.push_back(3); order.push_back(1); order.push_back(2); order.push_back(0);
+  // Order {3,1,2,0}: the marked element is expected at index 19.
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(19));
+
+  order.clear();
+  order.push_back(2); order.push_back(0); order.push_back(1); order.push_back(3);
+  // Order {2,0,1,3}: the marked element is expected at index 851 = 23*37.
+  EXPECT_FLOAT_EQ(2, permute(&this->Array,&order)->at(851));
+}
+
+TYPED_TEST(hoNDArray_utils_TestReal,shiftDimTest){
+  // Marks linear index 37, i.e. coordinate (0,1,0,0) for extents {37,49,23,19}, and tracks it across shifts.
+  fill(&this->Array,TypeParam(1));
+  this->Array.get_data_ptr()[37] = 2;
+  // A shift of 0 and a shift of 4 (the number of dimensions) are both expected to leave the element at index 37.
+  EXPECT_FLOAT_EQ(1, shift_dim(&this->Array,0)->at(0));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,0)->at(37));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,1)->at(1));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,-1)->at(37*19)); // -1 and 3 are expected to give the same result
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,2)->at(23*37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,3)->at(37*19));
+  EXPECT_FLOAT_EQ(2, shift_dim(&this->Array,4)->at(37));
+}
+
+TYPED_TEST(hoNDArray_utils_TestReal,sumTest){
+  TypeParam v1 = TypeParam(12.34);
+  unsigned int idx = 0; // only the first element of each result is inspected
+  // The fixture extents are {37,49,23,19}: summing a constant array along dimension 1 is expected to give 49*v1.
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(49*v1,sum(&this->Array,1)->get_data_ptr()[idx]);
+  // Dimension 2 has extent 23.
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(23*v1,sum(&this->Array,2)->get_data_ptr()[idx]);
+  // Dimension 3 has extent 19.
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(19*v1,sum(&this->Array,3)->get_data_ptr()[idx]);
+}
+
+TYPED_TEST_CASE(hoNDArray_utils_TestCplx, cplxImplementations);
+
+TYPED_TEST(hoNDArray_utils_TestCplx,permuteTest){
+  // Complex counterpart of the real-valued permute test: one marked element is tracked across permutations.
+  fill(&this->Array,TypeParam(1,1));
+  // First order: the identity permutation.
+  std::vector<size_t> order;
+  order.push_back(0); order.push_back(1); order.push_back(2); order.push_back(3);
+  // With extents {37,49,23,19} linear index 37 is coordinate (0,1,0,0).
+  this->Array.get_data_ptr()[37] = TypeParam(2,3);
+  // Unmarked elements keep the fill value (1,1).
+  EXPECT_FLOAT_EQ(1, real(permute(&this->Array,&order)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(permute(&this->Array,&order)->at(0)));
+  // Under the identity order the marked element is expected to stay at index 37.
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(37)));
+
+  order.clear();
+  order.push_back(1); order.push_back(0); order.push_back(2); order.push_back(3);
+  // Order {1,0,2,3}: the marked element is expected at index 1.
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(1)));
+
+  order.clear();
+  order.push_back(3); order.push_back(1); order.push_back(2); order.push_back(0);
+  // Order {3,1,2,0}: the marked element is expected at index 19.
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(19)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(19)));
+
+  order.clear();
+  order.push_back(2); order.push_back(0); order.push_back(1); order.push_back(3);
+  // Order {2,0,1,3}: the marked element is expected at index 851 = 23*37.
+  EXPECT_FLOAT_EQ(2, real(permute(&this->Array,&order)->at(851)));
+  EXPECT_FLOAT_EQ(3, imag(permute(&this->Array,&order)->at(851)));
+}
+
+TYPED_TEST(hoNDArray_utils_TestCplx,shiftDimTest){
+  // Marks linear index 37, i.e. coordinate (0,1,0,0) for extents {37,49,23,19}, with the value (2,3).
+  fill(&this->Array,TypeParam(1,1));
+  this->Array.get_data_ptr()[37]=TypeParam(2,3);
+  // Shift 0: unmarked elements keep the fill value.
+  EXPECT_FLOAT_EQ(1, real(shift_dim(&this->Array,0)->at(0)));
+  EXPECT_FLOAT_EQ(1, imag(shift_dim(&this->Array,0)->at(0)));
+  // Shift 0: the marked element is expected to stay at index 37.
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,0)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,0)->at(37)));
+  // Shift 1: expected at index 1.
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,1)->at(1)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,1)->at(1)));
+  // Shift -1: expected at index 37*19 (the same index is expected for shift 3 below).
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,-1)->at(37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,-1)->at(37*19)));
+  // Shift 2: expected at index 23*37*19.
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,2)->at(23*37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,2)->at(23*37*19)));
+  // Shift 3: expected at index 37*19.
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,3)->at(37*19)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,3)->at(37*19)));
+  // Shift 4 (the number of dimensions): expected back at index 37.
+  EXPECT_FLOAT_EQ(2, real(shift_dim(&this->Array,4)->at(37)));
+  EXPECT_FLOAT_EQ(3, imag(shift_dim(&this->Array,4)->at(37)));
+}
+
+TYPED_TEST(hoNDArray_utils_TestCplx,sumTest){
+  TypeParam v1 = TypeParam(12.34, 56.78);
+  unsigned int idx = 0; // only the first element of each result is inspected
+  // The fixture extents are {37,49,23,19}: summing a constant array along dimension 1 is expected to give 49*v1.
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(49)*v1),real(sum(&this->Array,1)->get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(49)*v1),imag(sum(&this->Array,1)->get_data_ptr()[idx]));
+  // Dimension 2 has extent 23.
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(23)*v1),real(sum(&this->Array,2)->get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(23)*v1),imag(sum(&this->Array,2)->get_data_ptr()[idx]));
+  // Dimension 3 has extent 19.
+  fill(&this->Array,v1);
+  EXPECT_FLOAT_EQ(real(TypeParam(19)*v1),real(sum(&this->Array,3)->get_data_ptr()[idx]));
+  EXPECT_FLOAT_EQ(imag(TypeParam(19)*v1),imag(sum(&this->Array,3)->get_data_ptr()[idx]));
+}
diff --git a/test/tests.cpp b/test/tests.cpp
new file mode 100644
index 0000000..326bea1
--- /dev/null
+++ b/test/tests.cpp
@@ -0,0 +1,13 @@
+/*
+ * tests.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv) { // entry point of the unit-test executable
+  ::testing::InitGoogleTest(&argc, argv); // lets Google Test read its own command-line flags
+  return RUN_ALL_TESTS(); // the result of the test run becomes the process exit code
+}
diff --git a/test/vector_td_test.cpp b/test/vector_td_test.cpp
new file mode 100644
index 0000000..8fc3ed7
--- /dev/null
+++ b/test/vector_td_test.cpp
@@ -0,0 +1,141 @@
+/*
+ * vector_td_test.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+#include "gtest/gtest.h"
+
+
+#include <vector>
+#include "complext.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "vector_td_io.h"
+#include "cuVector_td_test_kernels.h"
+#include <sstream>
+using namespace Gadgetron;
+using testing::Types;
+template <typename T> class vector_td_Test : public ::testing::Test { // fixture holding two device arrays of 3-component vectors
+	protected:
+	 virtual void SetUp() {
+		 size_t vdims[] = {37}; //Using prime numbers for setup because they are messy
+		 dims= std::vector<size_t>(vdims,vdims+sizeof(vdims)/sizeof(size_t));
+		 cuData = cuNDArray<vector_td<T,3> >(&dims);
+		 cuData2 = cuNDArray<vector_td<T,3> >(&dims);
+	}
+	 cuNDArray<vector_td<T,3> > cuData; // primary input of every test
+	 cuNDArray<vector_td<T,3> > cuData2; // second operand for the two-array tests
+	 std::vector<size_t> dims; // one dimension of 37 elements
+
+
+};
+
+//typedef Types<float,double,float_complext,double_complext> Implementations;
+typedef Types<float,double> Implementations;
+
+TYPED_TEST_CASE(vector_td_Test, Implementations);
+
+
+TYPED_TEST(vector_td_Test,absTest){
+	// Every component starts at -2; after test_abs element 2 of the same array is expected to hold +2.
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(-2));
+
+	test_abs(&this->cuData);
+	vector_td<TypeParam,3> expected(2);
+	vector_td<TypeParam,3> result = this->cuData.get_device_ptr()[2];
+	EXPECT_EQ(expected,result);
+}
+
+TYPED_TEST(vector_td_Test,normTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(12.1));
+	// The expected value 20.957814772 equals sqrt(3)*12.1, the Euclidean length of (12.1,12.1,12.1).
+	thrust::device_vector<TypeParam> out = test_norm(&this->cuData);
+
+	EXPECT_FLOAT_EQ(real(20.957814772),out[3]);
+}
+
+
+
+TYPED_TEST(vector_td_Test,minTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	// The expected value 1.1 is the smallest of the three components.
+	thrust::device_vector<TypeParam> out = test_min(&this->cuData);
+
+	EXPECT_FLOAT_EQ(TypeParam(1.1),out[5]);
+}
+
+TYPED_TEST(vector_td_Test,maxTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	// The expected value 5.3 is the largest of the three components.
+	thrust::device_vector<TypeParam> out = test_max(&this->cuData);
+
+	EXPECT_FLOAT_EQ(TypeParam(5.3),out[5]);
+}
+
+
+TYPED_TEST(vector_td_Test,aminTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	vector_fill(&this->cuData2,vector_td<TypeParam,3>(20.2,0.11,5.3));
+	// The expected vector holds, per component, the smaller of the two inputs.
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amin(&this->cuData,&this->cuData2);
+	vector_td<TypeParam,3> expected(2.2,0.11,5.3);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host(); // copy back before reading on the host
+	EXPECT_EQ(expected,host->begin()[35]);
+}
+
+TYPED_TEST(vector_td_Test,amin2Test){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	// The expected vector holds, per component, the smaller of the input and the scalar 4.
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amin2(&this->cuData,TypeParam(4));
+	vector_td<TypeParam,3> expected(2.2,1.1,4);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host(); // copy back before reading on the host
+	EXPECT_EQ(expected,host->begin()[35]);
+}
+
+TYPED_TEST(vector_td_Test,amaxTest){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	vector_fill(&this->cuData2,vector_td<TypeParam,3>(20.2,0.11,5.3));
+	// The expected vector holds, per component, the larger of the two inputs.
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amax(&this->cuData,&this->cuData2);
+	vector_td<TypeParam,3> expected(20.2,1.1,5.3);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host(); // copy back before reading on the host
+	EXPECT_EQ(expected,host->begin()[23]);
+}
+
+TYPED_TEST(vector_td_Test,amax2Test){
+	vector_fill(&this->cuData,vector_td<TypeParam,3>(2.2,1.1,5.3));
+	// The expected vector holds, per component, the larger of the input and the scalar 4.
+	boost::shared_ptr<cuNDArray<vector_td<TypeParam,3> > > out = test_amax2(&this->cuData,TypeParam(4));
+	vector_td<TypeParam,3> expected(4,4,5.3);
+	boost::shared_ptr<hoNDArray<vector_td<TypeParam,3> > > host = out->to_host(); // copy back before reading on the host
+	EXPECT_EQ(expected,host->begin()[26]);
+}
+
+TEST(vector_td,parseTest){
+std::string base ="[23,22,25]";
+std::stringstream ss(base);
+// Extract a vector from the bracketed, comma-separated text above.
+vector_td<float,3> vec;
+vector_td<float,3> res(23,22,25);
+ss >> vec;
+// The extraction must succeed and reproduce all three components.
+EXPECT_FALSE(ss.fail());
+EXPECT_EQ(res,vec);
+
+}
+
+
+TEST(vector_td,parseEqualTest){
+	vector_td<float,3> res(23,22,25);
+	std::stringstream ss;
+	ss << res;
+	// Round trip: whatever operator<< wrote must be readable by operator>>.
+	vector_td<float,3> vec;
+
+	ss >> vec;
+	// The extraction must succeed and give back the original vector.
+	EXPECT_FALSE(ss.fail());
+	EXPECT_EQ(res,vec);
+
+}
diff --git a/toolboxes/CMakeLists.txt b/toolboxes/CMakeLists.txt
new file mode 100644
index 0000000..c305e3d
--- /dev/null
+++ b/toolboxes/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_subdirectory(core)
+# operators/ and solvers/ are only configured when CUDA or Armadillo was found.
+if(CUDA_FOUND OR ARMADILLO_FOUND)
+  add_subdirectory(operators)
+  add_subdirectory(solvers)
+endif()
+# These three toolboxes are always configured.
+add_subdirectory(mri)
+add_subdirectory(nfft)
+add_subdirectory(registration)
+# gadgettools is only configured when both ACE and XSD were found.
+if(ACE_FOUND AND XSD_FOUND)
+  add_subdirectory(gadgettools)
+endif()
+# gtplus is only configured when MKL was found.
+if(MKL_FOUND)
+    add_subdirectory(gtplus)
+endif()
\ No newline at end of file
diff --git a/toolboxes/core/CMakeLists.txt b/toolboxes/core/CMakeLists.txt
new file mode 100644
index 0000000..67756a0
--- /dev/null
+++ b/toolboxes/core/CMakeLists.txt
@@ -0,0 +1,26 @@
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  )
+# NOTE(review): the generated core_defines.h is written into the source directory, not the build directory.
+configure_file(core_defines.h.in ${CMAKE_CURRENT_SOURCE_DIR}/core_defines.h)
+# Public headers of the core toolbox, installed into <prefix>/include.
+install(FILES 	
+  core_defines.h
+  NDArray.h
+  complext.h
+  vector_td.h
+  vector_td_operators.h
+  vector_td_utilities.h
+  vector_td_io.h
+  real_utilities.h
+  GadgetronCommon.h
+  GadgetronException.h
+  GadgetronTimer.h
+  SerializableObject.h
+  DESTINATION include)
+# The cpu subdirectory is always configured.
+add_subdirectory(cpu)
+# The gpu subdirectory is only configured when CUDA was found.
+if (CUDA_FOUND)
+  add_subdirectory(gpu)
+endif (CUDA_FOUND)
diff --git a/toolboxes/core/GadgetronCommon.h b/toolboxes/core/GadgetronCommon.h
new file mode 100644
index 0000000..fb9a8f1
--- /dev/null
+++ b/toolboxes/core/GadgetronCommon.h
@@ -0,0 +1,67 @@
+#ifndef GADGETRONCOMMON_H
+#define GADGETRONCOMMON_H
+
+#ifndef _WIN32
+// Compiler version encoded as major*10000 + minor*1000 + patchlevel, so that 4.2.0 becomes 42000.
+#define GCC_VERSION (__GNUC__ * 10000           \
+                     + __GNUC_MINOR__ * 1000    \
+                     + __GNUC_PATCHLEVEL__)
+// Compilers reporting a version below 4.2.0 get a build message and the GCC_OLD_FLAG marker.
+#if GCC_VERSION < 42000
+#pragma message ("GCC version is older than 4.2.0")
+#define GCC_OLD_FLAG 1
+#endif
+
+#else
+// Nothing is defined for Windows builds.
+#endif // _WIN32
+
+// MACROS FOR LOGGING: everything is written to std::cout; this header does not include <iostream> itself.
+#define GADGET_MSG(message) { std::cout << message << std::endl; }
+#define GADGET_ERROR_MSG(message) { std::cout << " (" << __FILE__ << ", " << __LINE__ << ") -> error happend: " << message << std::endl; }
+#define GADGET_WARN_MSG(message) { std::cout << " (" << __FILE__ << ", " << __LINE__ << ") -> warning released: " << message << std::endl; }
+// Conditional variants: the message is printed only when 'con' is true.
+#define GADGET_CONDITION_MSG(con, message) { if ( con ) GADGET_MSG(message) }
+#define GADGET_CONDITION_WARN_MSG(con, message) { if ( con ) GADGET_WARN_MSG(message) }
+// Print the error, then throw 'runtime_error' (name resolved at the point of use) through BOOST_THROW_EXCEPTION.
+#define GADGET_THROW(msg) { GADGET_ERROR_MSG(msg); BOOST_THROW_EXCEPTION( runtime_error(msg)); }
+#define GADGET_CHECK_THROW(con) { if ( !(con) ) { GADGET_ERROR_MSG(#con); BOOST_THROW_EXCEPTION( runtime_error(#con)); } }
+// Print a diagnostic and return from the enclosing function when 'con' is false; 'value' is evaluated twice.
+#define GADGET_CHECK_RETURN(con, value) { if ( ! (con) ) { GADGET_ERROR_MSG("Returning '" << value << "' due to failed check: '" << #con << "'"); return (value); } }
+#define GADGET_CHECK_RETURN_FALSE(con) { if ( ! (con) ) { GADGET_ERROR_MSG("Returning false due to failed check: '" << #con << "'"); return false; } }
+// Debug-only variants: active when GADGET_DEBUG_MODE is defined, otherwise they expand to nothing.
+#ifdef GADGET_DEBUG_MODE
+#define GADGET_DEBUG_CHECK_THROW(con) GADGET_CHECK_THROW(con)
+#define GADGET_DEBUG_CHECK_RETURN(con, value) GADGET_CHECK_RETURN(con, value)
+#define GADGET_DEBUG_CHECK_RETURN_FALSE(con) GADGET_CHECK_RETURN_FALSE(con)
+#else
+#define GADGET_DEBUG_CHECK_THROW(con)
+#define GADGET_DEBUG_CHECK_RETURN(con, value)
+#define GADGET_DEBUG_CHECK_RETURN_FALSE(con)
+#endif // GADGET_DEBUG_MODE
+
+// MACROS FOR TIMING: 'oper' is only stringified and passed to timer.start() as the name; it is not executed here.
+#define GADGET_START_TIMING(timer, oper) { timer.start(#oper); } 
+#define GADGET_STOP_TIMING(timer) { timer.stop(); }
+// Conditional variants: the timer is touched only when 'con' is true.
+#define GADGET_START_TIMING_CONDITION(timer, oper, con) { if ( con ) { timer.start(#oper); } } 
+#define GADGET_STOP_TIMING_CONDITION(timer, con) { if ( con ) { timer.stop(); } }
+
+// MACROS FOR PRINTING: writes "<expression text> is <value>" followed by a newline to 'os'.
+#define GADGET_OSTREAM_PRINT(os, content) { os << #content << " is " << content << std::endl; }
+// Runs 'action' only when 'con' is true.
+#define GADGET_CHECK_PERFORM(con, action) { if ( con ) { action; } }
+
+// MACROS for EXPORTING: the array is exported to debugFolder+filename only when debugFolder is not empty.
+#define GADGET_EXPORT_ARRAY(debugFolder, exporter, a, filename) { if ( !debugFolder.empty() ) { exporter.exportArray(a, debugFolder+filename); } }
+#define GADGET_EXPORT_ARRAY_COMPLEX(debugFolder, exporter, a, filename) { if ( !debugFolder.empty() ) { exporter.exportArrayComplex(a, debugFolder+filename); } }
+
+// MACROS FOR UTILITY: GT_MIN, GT_MAX and GT_ABS evaluate an argument more than once, so avoid side effects.
+#define GT_MIN(a,b)    (((a)<(b))?(a):(b))
+#define GT_MAX(a,b)    (((a)>(b))?(a):(b))
+#define GT_ABS(a)      (((a)>=0)?(a):(-(a)))
+#define GT_SGN(a)      (((a)>=0)?(1):(-1))
+#define GT_PI          3.141592653589793238462
+#define GT_IMAGING_GEOMETRY_DELTA 0.001
+
+#endif  //GADGETRONCOMMON_H
diff --git a/toolboxes/core/GadgetronException.h b/toolboxes/core/GadgetronException.h
new file mode 100644
index 0000000..5c9789d
--- /dev/null
+++ b/toolboxes/core/GadgetronException.h
@@ -0,0 +1,38 @@
+/** \file GadgetronException.h
+    \brief An interface to the exception handling used in the Gadgetron to indicate runtime errors.
+*/
+
+#pragma once
+
+#include <iostream>
+#include <exception>
+#include <stdexcept>
+
+#include <boost/throw_exception.hpp>
+#include <boost/exception/exception.hpp>
+#include <boost/exception/info.hpp>
+#include <boost/exception/diagnostic_information.hpp>
+
+namespace Gadgetron{
+
+  class runtime_error: virtual public boost::exception, virtual public std::exception 
+  { // Owns a copy of its message: the old code kept c_str() of a by-value argument, which dangled after construction.
+  public:
+    runtime_error() : boost::exception(), std::exception(), text(), msg(0){} // no message: what() falls back to std::exception::what()
+    runtime_error(std::string _msg) : boost::exception(), std::exception(), text(_msg), msg(text.c_str()){} // msg points into the owned copy
+    runtime_error(const runtime_error& other) : boost::exception(other), std::exception(other), text(other.text), msg(other.msg == other.text.c_str() ? text.c_str() : other.msg){} // re-point msg at our own storage
+    runtime_error& operator=(const runtime_error& other){ boost::exception::operator=(other); std::exception::operator=(other); text = other.text; msg = (other.msg == other.text.c_str() ? text.c_str() : other.msg); return *this; }
+    virtual ~runtime_error() throw(){} // explicit so the exception specification matches std::exception's destructor
+    virtual const  char * what() const throw(){ return msg ? msg : std::exception::what(); } // never returns a null pointer
+  protected:
+    std::string text; // owned message storage; declared before msg so that it is initialized first
+    const char * msg; // message returned by what(): points into text, or is 0 when no message was given
+  };
+  
+  class bad_alloc : public runtime_error 
+  { // Allocation failure reported through the Gadgetron runtime_error hierarchy.
+  public:
+    bad_alloc() : runtime_error(){}
+    bad_alloc(std::string message) : runtime_error(message){}
+  };
+}
diff --git a/toolboxes/core/GadgetronTimer.h b/toolboxes/core/GadgetronTimer.h
new file mode 100644
index 0000000..38942d9
--- /dev/null
+++ b/toolboxes/core/GadgetronTimer.h
@@ -0,0 +1,109 @@
+/** \file GadgetronTimer.h
+    \brief Generic timer class to measure runtime performance.
+*/
+
+#ifndef __GADGETRONTIMER_H
+#define __GADGETRONTIMER_H
+
+#pragma once
+
+#if defined(_WIN32) || defined(WIN32) // _WIN32 is predefined by Windows compilers; WIN32 is still honoured for existing builds
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+#include <iostream>
+#include <string>
+
+namespace Gadgetron{
+  /** Wall-clock stopwatch that prints "<name>: <elapsed> ms" to std::cout every time stop() runs. */
+  class GadgetronTimer
+  {
+  public:
+    // Starts timing immediately; the elapsed time is printed when the object is destroyed.
+    GadgetronTimer() : name_("GPUTimer"), timing_in_destruction_(true)
+    {
+        pre();
+        start();
+    }
+    // Times from construction to destruction only when 'timing' is true.
+    GadgetronTimer(bool timing) : name_("GPUTimer"), timing_in_destruction_(timing)
+    {
+        if ( timing_in_destruction_ )
+        {
+            pre();
+            start();
+        }
+    }
+    // Named timer; a null 'name' is stored as an empty string instead of being handed to std::string.
+    GadgetronTimer(const char* name, bool timing=false) : name_(name ? name : ""), timing_in_destruction_(timing)
+    {
+        if ( timing_in_destruction_ )
+        {
+            pre();
+            start();
+        }
+    }
+    // Virtual calls made from the constructors and from this destructor resolve to this class, never to overrides.
+    virtual ~GadgetronTimer()
+    {
+        if ( timing_in_destruction_ )
+        {
+            post();
+            stop();
+        }
+    }
+    // Hooks called before start() in the constructors and before stop() in the destructor; no-ops here.
+    virtual void pre() {}
+    virtual void post() {}
+    // Records the start time stamp (and, on Windows, the counter frequency).
+    virtual void start()
+    {
+#if defined(_WIN32) || defined(WIN32)
+        QueryPerformanceFrequency(&frequency_);
+        QueryPerformanceCounter(&start_);
+#else
+        gettimeofday(&start_, NULL);
+#endif
+    }
+    // Renames the timer and restarts it; a null 'name' is stored as an empty string.
+    void start(const char* name)
+    {
+        name_ = name ? name : "";
+        start();
+    }
+    // Takes the end time stamp and prints the time since the last start() in milliseconds.
+    virtual void stop()
+    {
+        double time_in_us = 0.0;
+#if defined(_WIN32) || defined(WIN32)
+        QueryPerformanceCounter(&end_);
+        time_in_us = (end_.QuadPart - start_.QuadPart) * (1.0e6 / frequency_.QuadPart); // subtract ticks first to keep precision
+#else
+        gettimeofday(&end_, NULL);
+        time_in_us = ((end_.tv_sec * 1e6) + end_.tv_usec) - ((start_.tv_sec * 1e6) + start_.tv_usec);
+#endif
+        std::cout << name_ << ": " << time_in_us/1000.0 << " ms" << std::endl; // std::endl already flushes the stream
+    }
+    // Switches the automatic post()/stop() in the destructor on or off.
+    void set_timing_in_destruction(bool timing) { timing_in_destruction_ = timing; }
+
+  protected:
+
+#if defined(_WIN32) || defined(WIN32)
+    LARGE_INTEGER frequency_;
+    LARGE_INTEGER start_;
+    LARGE_INTEGER end_;
+#else
+    timeval start_;
+    timeval end_;
+#endif
+
+    std::string name_;
+
+    bool timing_in_destruction_;
+  };
+}
+
+#endif //__GADGETRONTIMER_H
diff --git a/toolboxes/core/NDArray.h b/toolboxes/core/NDArray.h
new file mode 100644
index 0000000..38452e6
--- /dev/null
+++ b/toolboxes/core/NDArray.h
@@ -0,0 +1,657 @@
+/** \file NDArray.h
+\brief Abstract base class for all Gadgetron host and device arrays
+*/
+
+#ifndef NDARRAY_H
+#define NDARRAY_H
+#pragma once
+
+#include "GadgetronException.h"
+#include "GadgetronCommon.h"
+
+#include <new>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/cast.hpp>
+
+namespace Gadgetron{
+
+    template <typename T> class NDArray
+    {
+    public:
+
+        typedef T element_type;
+        typedef T value_type;
+        // Class-specific allocation: storage is obtained with ::new char[] and released with delete[] on a char*.
+        void* operator new (size_t bytes)
+        {
+            return ::new char[bytes];
+        }
+
+        void operator delete (void *ptr)
+        {
+            delete [] static_cast <char *> (ptr);
+        } 
+        // Placement form: returns the caller-provided address and allocates nothing.
+        void * operator new(size_t s, void * p)
+        {
+            return p;
+        }
+        // Creates an empty array: no data, zero elements, an empty dimension list, delete-on-destruct enabled.
+        NDArray () : data_(0), elements_(0), delete_data_on_destruct_(true)
+        {
+            dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        virtual ~NDArray() {}
+        // Set the dimensions and obtain storage through allocate_memory().
+        virtual void create(std::vector<size_t> &dimensions);
+        virtual void create(std::vector<size_t> *dimensions);
+        virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions);
+        // Set the dimensions and adopt an existing buffer; these overloads do not call allocate_memory().
+        virtual void create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+        virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+        virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+        // Removes every dimension whose extent is 1.
+        void squeeze();
+        // Replaces the dimensions; throws std::runtime_error when the number of elements would change.
+        void reshape(std::vector<size_t> *dims);
+        void reshape(boost::shared_ptr< std::vector<size_t> > dims);
+        // True when the number of dimensions and every extent match.
+        bool dimensions_equal(std::vector<size_t> *d) const;
+
+        template<class S> bool dimensions_equal(const NDArray<S> *a) const
+        {
+            std::vector<size_t>* dim;
+            a->get_dimensions(dim);
+
+            if ( this->dimensions_->size() != dim->size() ) return false;
+
+            size_t NDim = this->dimensions_->size();
+            for ( size_t d=0; d<NDim; d++ )
+            {
+                if ( (*this->dimensions_)[d] != (*dim)[d] ) return false;
+            }
+
+            return true;
+        }
+
+        size_t get_number_of_dimensions() const;
+        // Extent of one dimension; yields 1 when the index lies beyond the last stored dimension.
+        size_t get_size(size_t dimension) const;
+        // The shared_ptr overload returns a copy; the pointer overload exposes the internally stored vector.
+        boost::shared_ptr< std::vector<size_t> > get_dimensions() const;
+        void get_dimensions(std::vector<size_t>*& dim) const;
+
+        T* get_data_ptr() const;
+
+        size_t get_number_of_elements() const;
+
+        size_t get_number_of_bytes() const;
+        // Getter and setter for the flag stored in delete_data_on_destruct_.
+        bool delete_data_on_destruct() const;
+        void delete_data_on_destruct(bool d);
+        // Linear offset of a multi-index, computed from the stored offset factors.
+        size_t calculate_offset(const std::vector<size_t>& ind) const;
+
+        size_t calculate_offset(size_t x, size_t y) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const;
+        size_t calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const;
+
+        size_t get_offset_factor(size_t dim) const;
+        boost::shared_ptr< std::vector<size_t> > get_offset_factor() const;
+
+        size_t get_offset_factor_lastdim() const;
+
+        void calculate_offset_factors(const std::vector<size_t>& dimensions);
+
+        std::vector<size_t> calculate_index( size_t offset ) const;
+        void calculate_index( size_t offset, std::vector<size_t>& index ) const;
+
+        void clear();
+        // Element access by index vector or by one to nine coordinates, each in a mutable and a const flavour.
+        T& operator()( const std::vector<size_t>& ind );
+        const T& operator()( const std::vector<size_t>& ind ) const;
+
+        T& operator()( size_t x );
+        const T& operator()( size_t x ) const;
+
+        T& operator()( size_t x, size_t y );
+        const T& operator()( size_t x, size_t y ) const;
+
+        T& operator()( size_t x, size_t y, size_t z );
+        const T& operator()( size_t x, size_t y, size_t z ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const;
+
+        T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u );
+        const T& operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const;
+
+    protected:
+        // Storage management is left to the concrete subclasses.
+        virtual void allocate_memory() = 0;
+        virtual void deallocate_memory() = 0;
+
+    protected:
+
+        boost::shared_ptr< std::vector<size_t> > dimensions_;
+        boost::shared_ptr< std::vector<size_t> > offsetFactors_;
+        T* data_;
+        size_t elements_;
+        bool delete_data_on_destruct_;
+    };
+
+    template <typename T> 
+    inline void NDArray<T>::create(std::vector<size_t> *dimensions) 
+    {
+        // Take a private copy of the dimensions, obtain storage, then derive the offset factors.
+        if (dimensions == 0) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");
+        boost::shared_ptr< std::vector<size_t> > copied(new std::vector<size_t>(*dimensions));
+        dimensions_ = copied;
+        allocate_memory();
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::create(std::vector<size_t>& dimensions) 
+    {
+        // Take a private copy of the dimensions so later changes by the caller do not affect this array,
+        // obtain storage, then derive the offset factors.
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(dimensions));
+        allocate_memory();
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::create(boost::shared_ptr< std::vector<size_t> > dimensions)
+    {
+        this->create(dimensions.get()); // forwards the raw pointer to the pointer overload, which rejects null
+    }
+
+    template <typename T> 
+    void NDArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) 
+    {
+        // Wrap an existing buffer: copy the dimensions, adopt the pointer, and count the elements.
+        if (dimensions == 0) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");
+        if (data == 0) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*dimensions));
+        data_ = data;
+        delete_data_on_destruct_ = delete_data_on_destruct;
+        size_t count = 1;
+        for (std::vector<size_t>::const_iterator it = dimensions_->begin(); it != dimensions_->end(); ++it){
+            count *= *it;
+        }
+        elements_ = count;
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    void NDArray<T>::create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) 
+    {
+        // Wrap an existing buffer: copy the dimensions, adopt the pointer, and count the elements.
+        if (data == 0) throw std::runtime_error("NDArray<T>::create(): 0x0 pointer provided");
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(dimensions));
+        data_ = data;
+        delete_data_on_destruct_ = delete_data_on_destruct;
+        size_t count = 1;
+        for (std::vector<size_t>::const_iterator it = dimensions_->begin(); it != dimensions_->end(); ++it){
+            count *= *it;
+        }
+        elements_ = count;
+        calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::create(boost::shared_ptr<std::vector<size_t>  > dimensions, 
+        T* data, bool delete_data_on_destruct)
+    {
+        this->create(dimensions.get(), data, delete_data_on_destruct); // forwards to the raw-pointer overload, which rejects null arguments
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::squeeze()
+    {
+        // Keep only the extents that differ from 1, then rebuild the offset factors.
+        boost::shared_ptr< std::vector<size_t> > kept( new std::vector<size_t> );
+        for (size_t d = 0; d < dimensions_->size(); ++d){
+            const size_t extent = (*dimensions_)[d];
+            if (extent != 1) kept->push_back(extent);
+        }
+        dimensions_ = kept;
+        this->calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::reshape(std::vector<size_t> *dims)
+    {
+        // Replaces the dimensions without touching the data.
+        // Throws std::runtime_error when dims is null or when the number of elements would change.
+        if (!dims) throw std::runtime_error("NDArray<T>::reshape(): 0x0 pointer provided");
+        size_t new_elements = 1;
+        for (size_t i = 0; i < dims->size(); i++){
+            new_elements *= (*dims)[i];
+        }
+        if (new_elements != elements_)
+            throw std::runtime_error("NDArray<T>::reshape : Number of elements cannot change during reshape");
+
+        // Copy the input dimensions array (copy-constructed, so nothing leaks if the copy throws)
+        dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*dims));
+        this->calculate_offset_factors(*dimensions_);
+    }
+
+    template <typename T> 
+    inline void NDArray<T>::reshape( boost::shared_ptr< std::vector<size_t> > dims )
+    {
+        this->reshape(dims.get()); // forwards the raw pointer to the pointer overload
+    }
+
+    template <typename T> 
+    inline bool NDArray<T>::dimensions_equal(std::vector<size_t> *d) const
+    {
+        // Returns true when d lists exactly the same extents, in the same order, as this array.
+        //
+        // A null d cannot describe any array, so it compares unequal
+        // (the previous implementation dereferenced it unconditionally).
+        if ( !d ) return false;
+
+        // std::vector's operator== compares the sizes first and then the elements pairwise,
+        // which is the same comparison the hand-written loop performed.
+        return *this->dimensions_ == *d;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_number_of_dimensions() const
+    {
+        return (size_t)dimensions_->size(); // number of entries in the stored dimension list
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::get_size(size_t dimension) const
+    {
+        // Extent of the requested dimension.
+        // Dimensions beyond the stored ones are reported with extent 1
+        // instead of raising an error.
+        return (dimension < dimensions_->size())
+            ? (*dimensions_)[dimension]
+            : size_t(1);
+    }
+
+    template <typename T> 
+    inline boost::shared_ptr< std::vector<size_t> > NDArray<T>::get_dimensions() const
+    {
+        // Hand out a private copy so that the receiver cannot alter the dimensions
+        // stored in this array.
+        boost::shared_ptr< std::vector<size_t> > snapshot(new std::vector<size_t>(*dimensions_));
+        return snapshot;
+    }
+
+     template <typename T> 
+    inline void NDArray<T>::get_dimensions(std::vector<size_t>*& dim) const
+    {
+        dim = dimensions_.get(); // no copy: dim points at the vector owned by this array
+    }
+
+    template <typename T> 
+    inline T* NDArray<T>::get_data_ptr() const
+    { 
+        return data_; // raw pointer to the stored elements; 0 after default construction
+    }
+    // Number of elements as recorded in elements_.
+    template <typename T> 
+    inline size_t NDArray<T>::get_number_of_elements() const
+    {
+        return elements_;
+    }
+    // Size of the element storage in bytes: element count times sizeof(T).
+    template <typename T> 
+    inline size_t NDArray<T>::get_number_of_bytes() const
+    {
+        return elements_*sizeof(T);
+    }
+    // Reads the delete-on-destruct flag.
+    template <typename T> 
+    inline bool NDArray<T>::delete_data_on_destruct() const
+    {
+        return delete_data_on_destruct_;
+    }
+    // Sets the delete-on-destruct flag.
+    template <typename T> 
+    inline void NDArray<T>::delete_data_on_destruct(bool d)
+    {
+        delete_data_on_destruct_ = d;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(const std::vector<size_t>& ind) const
+    {
+        // Dimension 0 contributes its index directly; every further dimension is weighted by its offset factor.
+        size_t linear = ind[0];
+        for( size_t d = 1; d < dimensions_->size(); ++d ) linear += ind[d] * (*offsetFactors_)[d];
+        return linear;
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==2); // checked only when GADGET_DEBUG_MODE is defined
+        return x + y * (*offsetFactors_)[1];
+    }
+    // Three coordinates; debug builds require exactly three dimensions.
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==3);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2];
+    }
+    // Four coordinates; debug builds require exactly four dimensions.
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==4);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3];
+    }
+    // Five coordinates; debug builds require exactly five dimensions.
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==5);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4];
+    }
+    // Six coordinates; debug builds require exactly six dimensions.
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==6);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5];
+    }
+    // Seven coordinates; debug builds require exactly seven dimensions.
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==7);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5] + a * (*offsetFactors_)[6];
+    }
+    // Eight coordinates; debug builds require exactly eight dimensions.
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==8);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5] + a * (*offsetFactors_)[6] + q * (*offsetFactors_)[7];
+    }
+
+    template <typename T> 
+    inline size_t NDArray<T>::calculate_offset(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u) const
+    {
+        GADGET_DEBUG_CHECK_THROW(dimensions_->size()==9);
+        return x + y * (*offsetFactors_)[1] + z * (*offsetFactors_)[2] + s * (*offsetFactors_)[3] + p * (*offsetFactors_)[4] + r * (*offsetFactors_)[5] + a * (*offsetFactors_)[6] + q * (*offsetFactors_)[7]+ u * (*offsetFactors_)[8];
+    }
+
+    /**
+     * Returns the offset factor of dimension 'dim'.
+     * Throws std::runtime_error when 'dim' is not smaller than the number of dimensions.
+     */
+    template <typename T> 
+    inline size_t NDArray<T>::get_offset_factor(size_t dim) const
+    {
+        if ( dim < dimensions_->size() ){
+            return (*offsetFactors_)[dim];
+        }
+        throw std::runtime_error("NDArray<T>::get_offset_factor : index out of range");
+    }
+
+    /**
+     * Returns the offset factor of the last dimension.
+     * Throws std::runtime_error when the array has no dimensions.
+     */
+    template <typename T> 
+    inline size_t NDArray<T>::get_offset_factor_lastdim() const
+    {
+        const size_t ndim = dimensions_->size();
+        if( ndim == 0 )
+            throw std::runtime_error("NDArray<T>::get_offset_factor_lastdim : array is empty");
+
+        const size_t last = ndim-1;
+        return get_offset_factor(last);
+    }
+
+    /**
+     * Returns a newly allocated copy of the offset factor vector,
+     * independent of the one held by this array.
+     */
+    template <typename T> 
+    inline boost::shared_ptr< std::vector<size_t> > NDArray<T>::get_offset_factor() const
+    {
+        boost::shared_ptr< std::vector<size_t> > factors_copy( new std::vector<size_t>(*offsetFactors_) );
+        return factors_copy;
+    }
+
+    /**
+     * Recomputes the offset factors for the given dimensions.
+     * Factor i is the product of dimensions[0..i-1]; factor 0 is 1.
+     * The offsetFactors_ vector is allocated first if it does not exist yet.
+     */
+    template <typename T> 
+    inline void NDArray<T>::calculate_offset_factors(const std::vector<size_t>& dimensions)
+    {
+        if ( offsetFactors_.get() == NULL ){
+            offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+        offsetFactors_->resize(dimensions.size());
+
+        // A running product yields the same factors as recomputing each product
+        // from scratch, in a single pass over the dimensions.
+        size_t stride = 1;
+        for( size_t i = 0; i < dimensions.size(); i++ ){
+            (*offsetFactors_)[i] = stride;
+            stride *= dimensions[i];
+        }
+    }
+
+    /**
+     * Converts a linear element offset into a multi-dimensional index.
+     * Works from the last dimension down: divide by the offset factor, keep the remainder.
+     * Throws std::runtime_error when the array has no dimensions.
+     */
+    template <typename T> 
+    inline std::vector<size_t> NDArray<T>::calculate_index( size_t offset ) const
+    {
+        const size_t ndim = dimensions_->size();
+        if( ndim == 0 )
+            throw std::runtime_error("NDArray<T>::calculate_index : array is empty");
+
+        std::vector<size_t> index(ndim);
+        for( size_t d = ndim; d-- > 0; ){
+            const size_t factor = (*offsetFactors_)[d];
+            index[d] = offset / factor;
+            offset %= factor;
+        }
+        return index;
+    }
+
+    /**
+     * Converts a linear element offset into a multi-dimensional index, written to 'index'.
+     * 'index' is resized to the number of dimensions and every entry is overwritten.
+     * Throws std::runtime_error when the array has no dimensions.
+     */
+    template <typename T> 
+    inline void NDArray<T>::calculate_index( size_t offset, std::vector<size_t>& index ) const
+    {
+        const size_t ndim = dimensions_->size();
+        if( ndim == 0 )
+            throw std::runtime_error("NDArray<T>::calculate_index : array is empty");
+
+        index.resize(ndim, 0);
+        for( size_t d = ndim; d-- > 0; ){
+            const size_t factor = (*offsetFactors_)[d];
+            index[d] = offset / factor;
+            offset %= factor;
+        }
+    }
+
+    /**
+     * Resets the array to an empty state.
+     * The data is deallocated if this object is flagged to delete it, then the data pointer,
+     * element count and delete flag are reset. The dimension and offset factor vectors are
+     * emptied if they exist and allocated otherwise, so both are valid and empty afterwards.
+     */
+    template <typename T> 
+    void NDArray<T>::clear()
+    {
+        if ( this->delete_data_on_destruct_ ){
+            this->deallocate_memory();
+        }
+        this->data_ = 0;
+        this->elements_ = 0;
+        this->delete_data_on_destruct_ = true;
+
+        // The original test was inverted: it dereferenced the pointers exactly when they were null.
+        // Each vector is handled on its own so that a missing one cannot cause a null dereference.
+        if ( this->dimensions_ ){
+            this->dimensions_->clear();
+        }
+        else{
+            this->dimensions_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+
+        if ( this->offsetFactors_ ){
+            this->offsetFactors_->clear();
+        }
+        else{
+            this->offsetFactors_ = boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t> );
+        }
+    } 
+
+    // Element access by multi-dimensional index vector.
+    // The index is converted to a linear offset with calculate_offset(ind); the offset is passed to
+    // GADGET_DEBUG_CHECK_THROW (macro defined elsewhere, presumably a debug-only range check -- confirm)
+    // and then used to index the data pointer.
+    template <typename T> 
+    inline T& NDArray<T>::operator()( const std::vector<size_t>& ind )
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // Const variant of the index-vector access above.
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( const std::vector<size_t>& ind ) const
+    {
+        size_t idx = this->calculate_offset(ind);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // Element access by linear offset: x indexes the data pointer directly,
+    // regardless of the number of dimensions of the array.
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x )
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->get_number_of_elements());
+        return this->get_data_ptr()[x];
+    }
+
+    // Const variant of the linear-offset access above.
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x ) const
+    {
+        GADGET_DEBUG_CHECK_THROW(x < this->get_number_of_elements());
+        return this->get_data_ptr()[x];
+    }
+
+    // Element access by 2 to 9 separate indices, each in a mutable and a const variant.
+    // Every overload forwards its indices to the calculate_offset overload of the same arity,
+    // passes the resulting offset to GADGET_DEBUG_CHECK_THROW (macro defined elsewhere) and
+    // returns a reference to that element of the data pointer.
+
+    // 2 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y )
+    {
+        size_t idx = this->calculate_offset(x, y);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y ) const
+    {
+        size_t idx = this->calculate_offset(x, y);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 3 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z )
+    {
+        size_t idx = this->calculate_offset(x, y, z);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 4 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 5 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 6 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 7 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 8 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    // 9 indices
+    template <typename T> 
+    inline T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u )
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q, u);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+
+    template <typename T> 
+    inline const T& NDArray<T>::operator()( size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a, size_t q, size_t u ) const
+    {
+        size_t idx = this->calculate_offset(x, y, z, s, p, r, a, q, u);
+        GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+        return this->get_data_ptr()[idx];
+    }
+}
+
+#endif //NDARRAY_H
diff --git a/toolboxes/core/SerializableObject.h b/toolboxes/core/SerializableObject.h
new file mode 100644
index 0000000..3e28e5e
--- /dev/null
+++ b/toolboxes/core/SerializableObject.h
@@ -0,0 +1,27 @@
+/** 
+    SerializableObject is the base class for serializable objects
+*/
+
+#pragma once
+
+#include "GadgetronCommon.h"
+#include "GadgetronException.h"
+#include "cpucore_export.h"
+
+#include <complex>
+#include <iostream>
+
+namespace Gadgetron
+{
+  /**
+   * Abstract interface for objects that can be serialized to and deserialized from a char buffer.
+   * Both operations are pure virtual and must be implemented by derived classes.
+   */
+  class SerializableObject
+  {
+  public:
+    
+    SerializableObject() {}
+    virtual ~SerializableObject() {}
+    
+    // serialize and deserialize to/from the buffer
+    // NOTE(review): who owns 'buf', the exact meaning of 'len' on input/output and the meaning
+    // of the returned bool are not visible in this header -- confirm against the implementations.
+    virtual bool serialize(char*& buf, size_t& len) const = 0; // Should be a void function
+    virtual bool deserialize(char* buf, size_t& len) = 0; // Should be a void function
+  };  
+}
diff --git a/toolboxes/core/complext.h b/toolboxes/core/complext.h
new file mode 100644
index 0000000..943e116
--- /dev/null
+++ b/toolboxes/core/complext.h
@@ -0,0 +1,310 @@
+/** \file complext.h
+    \brief An implementation of complex numbers that works for both the cpu and gpu.
+
+    complext.h provides an implementation of complex numbers that, unlike std::complex,
+    works on both the cpu and gpu. 
+    It follows the interface defined for std::complex.
+*/
+
+#pragma once
+
+#include "core_defines.h"
+
+#include <complex>
+#include <cmath>
+#include <iostream>
+
+namespace Gadgetron{
+
+  using std::abs; // workaround for nvcc
+
+  /** 
+   * \class complext
+   * \brief An implementation of complex numbers that works for both the cpu and gpu.
+   */
+  template< class T > class complext
+  {
+  public:
+
+    /** Storage: vec[0] is the real part, vec[1] is the imaginary part. */
+    T vec[2];
+
+    /** Returns the real part. */
+    __inline__ __host__ __device__  T real() const 
+    {
+      return vec[0];
+    }
+
+    /** Returns the imaginary part. */
+    __inline__ __host__ __device__  T imag() const 
+    {
+      return vec[1];
+    }
+
+    /** Default constructor; both components are left uninitialized. */
+    __inline__ __host__ __device__  complext() {}
+
+    /** Constructs from a real and an imaginary part. */
+    __inline__ __host__ __device__  complext(T real, T imag){
+      vec[0]=real;
+      vec[1]=imag;
+    }
+
+    /** Copy constructor. */
+    __inline__ __host__ __device__  complext(const complext<T>& tmp){
+      vec[0] = tmp.vec[0];
+      vec[1] = tmp.vec[1];
+    }
+
+    /** Converts from std::complex. */
+    __inline__ __host__ __device__  complext(const std::complex<T>& tmp){
+      vec[0] = tmp.real();
+      vec[1] = tmp.imag();
+    }
+
+    /** Converts from a real number; the imaginary part is set to zero. */
+    __inline__ __host__ __device__  complext(const T r){
+      vec[0] = r;
+      vec[1] = 0;
+    }
+
+    /** Conjugates this number in place (negates the imaginary part). */
+    __inline__ __host__ __device__ void conj(){
+      vec[1] = -vec[1];
+    }
+
+    // The operators below that do not modify *this are const-qualified,
+    // so that they can also be applied to const objects and temporaries bound to const references.
+
+    __inline__ __host__ __device__  complext<T> operator+(const complext<T>& other) const {
+      return complext<T>(vec[0]+other.vec[0],vec[1]+other.vec[1]);
+    }
+
+    __inline__ __host__ __device__  complext<T> operator-(const complext<T>& other) const {
+      return complext<T>(vec[0]-other.vec[0],vec[1]-other.vec[1]);
+    }
+
+    /** Unary minus: negates both components. */
+    __inline__ __host__ __device__  complext<T> operator-() const {
+      return complext<T>(-vec[0],-vec[1]);
+    }
+
+    __inline__ __host__ __device__  void operator-=(const complext<T>& other){
+      vec[0] -= other.vec[0];
+      vec[1] -= other.vec[1];
+    }
+
+    __inline__ __host__ __device__  void operator+=(const complext<T>& other){
+      vec[0] += other.vec[0];
+      vec[1] += other.vec[1];
+    }
+
+    /** Scales both components by a real number. */
+    __inline__ __host__ __device__  complext<T> operator*(const T& other) const {
+      return complext<T>(vec[0]*other,vec[1]*other);
+    }
+
+    /** Complex multiplication. */
+    __inline__ __host__ __device__  complext<T> operator*(const complext<T>& other) const {
+      return complext<T>(vec[0]*other.vec[0]-vec[1]*other.vec[1],vec[0]*other.vec[1]+vec[1]*other.vec[0]);
+    }
+
+    /** Divides both components by a real number. */
+    __inline__ __host__ __device__  complext<T> operator/(const T& other) const {
+      return complext<T>(vec[0]/other,vec[1]/other);
+    }
+
+    /** Complex division; cd is the squared magnitude of the divisor. */
+    __inline__ __host__ __device__  complext<T> operator/(const complext<T>& other) const {
+      T cd = other.vec[0]*other.vec[0]+other.vec[1]*other.vec[1];
+      return complext<T>((vec[0]*other.vec[0]+vec[1]*other.vec[1])/cd ,(vec[1]*other.vec[0]-vec[0]*other.vec[1])/cd);
+    }
+
+    __inline__ __host__ __device__  void operator*=(const T& other){
+      vec[0] *= other;
+      vec[1] *= other;
+    }
+
+    __inline__ __host__ __device__  void operator*=(const complext<T>& other){
+      // Work on a copy: both new components depend on both old components
+      complext<T> tmp = *this;
+      vec[0] = tmp.vec[0]*other.vec[0]-tmp.vec[1]*other.vec[1];
+      vec[1] = tmp.vec[0]*other.vec[1]+tmp.vec[1]*other.vec[0];
+    }
+
+    __inline__ __host__ __device__  void operator/=(const T& other){
+      vec[0] /= other;
+      vec[1] /= other;
+    }
+
+    __inline__ __host__ __device__  void operator/=(const complext<T>& other){
+      complext<T> tmp = (*this)/other;
+      vec[0]=tmp.vec[0];
+      vec[1]=tmp.vec[1];
+    }
+
+    /** Exact component-wise equality. */
+    __inline__ __host__ __device__  bool operator==(const complext<T>& comp2) const {
+
+      return vec[0]==comp2.vec[0] && vec[1]==comp2.vec[1];
+    }
+
+    /** Negation of operator==. */
+    __inline__ __host__ __device__  bool operator!=(const complext<T>& comp2) const {
+
+      return !(*this==comp2);
+    }
+  };
+
+  /**
+   * Writes a complext to a stream as <real><sign><imag>i, e.g. 1+2i or 1-2i.
+   * The imaginary part is written with an explicit sign so that it is separated from the real part;
+   * the format flags of the stream are restored afterwards.
+   */
+  template <typename T> 
+  inline std::ostream & operator<< (std::ostream & os, const complext<T>& a )
+  {
+    const std::ios_base::fmtflags saved_flags = os.flags();
+    os << a.real() << std::showpos << a.imag() << "i";
+    os.flags(saved_flags);
+    return os;
+  }
+
+  /** Explicit specialization for float; same output format as the generic version. */
+  template <> 
+  inline std::ostream & operator<< (std::ostream & os, const complext<float>& a )
+  {
+    const std::ios_base::fmtflags saved_flags = os.flags();
+    os << a.real() << std::showpos << a.imag() << "i";
+    os.flags(saved_flags);
+    return os;
+  }
+
+  /** Explicit specialization for double; same output format as the generic version. */
+  template <> 
+  inline std::ostream & operator<< (std::ostream & os, const complext<double>& a )
+  {
+    const std::ios_base::fmtflags saved_flags = os.flags();
+    os << a.real() << std::showpos << a.imag() << "i";
+    os.flags(saved_flags);
+    return os;
+  }
+
+  // Shorthands for the single and double precision complex types
+  typedef complext<float> float_complext;
+  typedef complext<double> double_complext;
+
+  // realType<T>::Type is the real scalar type underlying T (float or double).
+  // The primary template is empty, so Type only exists for the types listed below.
+  template <class T> struct realType {};
+  template<> struct realType<float_complext> {typedef float Type; };
+  template<> struct realType<double_complext> {typedef double Type; };
+  template<> struct realType<float> {typedef float Type; };
+  template<> struct realType<double> {typedef double Type; };
+  template<> struct realType<std::complex<float> > {typedef float Type; };
+  template<> struct realType<std::complex<double> > {typedef double Type; };
+
+  // stdType<T>::Type maps the complext types to the corresponding std::complex type.
+  // All other types map to themselves (primary template).
+  template<class T> struct stdType {typedef T Type;};
+  template<> struct stdType<double_complext> {typedef std::complex<double> Type;};
+  template<> struct stdType<float_complext> {typedef std::complex<float> Type;};
+  template<> struct stdType<std::complex<double> > {typedef std::complex<double> Type;};
+  template<> struct stdType<std::complex<float> > {typedef std::complex<float> Type;};
+  template<> struct stdType<double> {typedef double Type;};
+  template<> struct stdType<float> {typedef float Type;};
+
+  /** Sign of x: 1 for positive, -1 for negative and 0 otherwise. */
+  __inline__ __host__ __device__ double sgn(double x){
+    if (double(0) < x) return double(1);
+    if (x < double(0)) return double(-1);
+    return double(0);
+  }
+  /** Sign of x: 1 for positive, -1 for negative and 0 otherwise. */
+  __inline__ __host__ __device__ float sgn(float x){
+    if (float(0) < x) return float(1);
+    if (x < float(0)) return float(-1);
+    return float(0);
+  }
+
+  /** Complex sign: x divided by its magnitude, or zero when the squared magnitude is not positive. */
+  template<class T> __inline__ __host__ __device__ complext<T> sgn(complext<T> x){
+    const bool has_no_magnitude = norm(x) <= T(0);
+    if (has_no_magnitude){
+      return complext<T>(0);
+    }
+    else{
+      return (x/abs(x));
+    }
+  }
+  /** Builds a complex number from magnitude rho and angle theta (theta defaults to 0). */
+  template<class T>  __inline__ __host__ __device__ complext<T> polar(const T& rho, const T& theta = 0){
+    const T re = rho*std::cos(theta);
+    const T im = rho*std::sin(theta);
+    return complext<T>(re,im);
+  }
+
+  /**
+   * Square root of a complex number:
+   * sqrt(x) = sqrt((|x|+re)/2) + i * s * sqrt((|x|-re)/2), where s is the sign of the imaginary part.
+   */
+  template<class T>  __inline__ __host__ __device__ complext<T> sqrt(complext<T> x){
+    T r = abs(x);
+    // A zero imaginary part must count as positive: with sgn(0)==0 the imaginary component
+    // would be wiped out and the square root of a negative real number would come out as 0.
+    const T imag_sign = (x.imag() < T(0)) ? T(-1) : T(1);
+    return complext<T>(::sqrt((r+x.real())/2),imag_sign*::sqrt((r-x.real())/2));
+  }
+
+  /** Magnitude of a complex number: square root of the sum of the squared components. */
+  template<class T> __inline__ __host__ __device__ T abs(complext<T> comp){
+    const T re = comp.vec[0];
+    const T im = comp.vec[1];
+    return ::sqrt(re*re+im*im);
+  }
+
+  /**
+   * Complex sine: sin(a+ib) = sin(a)cosh(b) + i cos(a)sinh(b).
+   * The scalar functions are qualified with std:: -- an unqualified sin() in this namespace
+   * would only find this complext overload and could not be resolved for a real argument.
+   */
+  template<class T> __inline__ __host__ __device__ complext<T> sin(complext<T> comp){
+    return complext<T>(std::sin(comp.vec[0])*std::cosh(comp.vec[1]),std::cos(comp.vec[0])*std::sinh(comp.vec[1]));
+  }
+
+  /**
+   * Complex cosine: cos(a+ib) = cos(a)cosh(b) - i sin(a)sinh(b).
+   * The scalar functions are qualified with std:: -- unqualified cos()/sin() in this namespace
+   * would only find the complext overloads and could not be resolved for a real argument.
+   */
+  template<class T> __inline__ __host__ __device__ complext<T> cos(complext<T> comp){
+    return complext<T>(std::cos(comp.vec[0])*std::cosh(comp.vec[1]),-std::sin(comp.vec[0])*std::sinh(comp.vec[1]));
+  }
+
+  /** Imaginary part of a complex number. */
+  template<class T> __inline__ __host__ __device__ T imag(complext<T> comp){
+    return comp.imag();
+  }
+
+  /** Real part of a real number: the number itself. */
+  __inline__ __host__ __device__ double real(double r){
+    return r;
+  }
+
+  /** Imaginary part of a real number: always zero. */
+  __inline__ __host__ __device__ double imag(double /*r*/){
+    return double(0);
+  }
+
+  /** Real part of a real number: the number itself. */
+  __inline__ __host__ __device__ float real(float r){
+    return r;
+  }
+
+  /** Imaginary part of a real number: always zero. */
+  __inline__ __host__ __device__ float imag(float /*r*/){
+    return float(0);
+  }
+
+  /** Real part of a complex number. */
+  template<class T> __inline__ __host__ __device__ T real(complext<T> comp){
+    return comp.real();
+  }
+
+  /** Phase angle of a complex number, computed with atan2(imaginary, real). */
+  template<class T> __inline__ __host__ __device__ T arg(complext<T> comp){
+    const T im = comp.imag();
+    const T re = comp.real();
+    return std::atan2(im,re);
+  }
+
+  /** real * complex: scales both components. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator*(const T& r,const complext<T>& z){
+    return complext<T>(z.real()*r,z.imag()*r);
+  }
+
+  /** complex * real: scales both components. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator*(const complext<T>& z,const T& r){
+    return complext<T>(z.real()*r,z.imag()*r);
+  }
+
+  /** complex + complex: component-wise sum. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator+(const complext<T>& z1,const complext<T>& z2){
+    return complext<T>(z1.real()+z2.real(),z1.imag()+z2.imag());
+  }
+
+  /** complex + real: only the real part changes. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator+(const complext<T>& z1,const T& r){
+    return complext<T>(z1.real()+r, z1.imag());
+  }
+
+  /** real + complex: only the real part changes. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator+(const T& r,const complext<T>& z1){
+    return complext<T>(z1.real()+r, z1.imag());
+  }
+
+  /** complex - complex: component-wise difference. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator-(const complext<T>& z1,const complext<T>& z2){
+    return complext<T>(z1.real()-z2.real(),z1.imag()-z2.imag());
+  }
+
+  /** real - complex: the imaginary part is negated. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator-(const T& r,const complext<T>& z2){
+    return complext<T>(r-z2.real(),-z2.imag());
+  }
+
+  /** complex - real: only the real part changes. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator-(const complext<T>& z2,const T& r){
+    return complext<T>(z2.real()-r,z2.imag());
+  }
+
+  /** complex * complex. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator*(const complext<T>& z1,const complext<T>& z2){
+    const T a = z1.real();
+    const T b = z1.imag();
+    const T c = z2.real();
+    const T d = z2.imag();
+    return complext<T>(a*c-b*d,a*d+b*c);
+  }
+
+  /** complex / complex; cd is the squared magnitude of the divisor. */
+  template<class T> __inline__ __host__ __device__  complext<T> operator/(const complext<T>& z1,const complext<T>& z2){
+    const T a = z1.real();
+    const T b = z1.imag();
+    const T c = z2.real();
+    const T d = z2.imag();
+    const T cd = c*c+d*d;
+    return complext<T>((a*c+b*d)/cd ,(b*c-a*d)/cd);
+  }
+
+  /** real / complex; cd is the squared magnitude of the divisor. */
+  template<class REAL, class T> __inline__ __host__ __device__  complext<T> operator/(const REAL& real, const complext<T>& comp){
+    const T re = comp.real();
+    const T im = comp.imag();
+    const T cd = re*re+im*im;
+    return complext<T>(re*real/cd,-real*im/cd);
+  }
+
+  /** complex / real: divides both components. */
+  template<class REAL, class T> __inline__ __host__ __device__  complext<T> operator/(const complext<T>& comp,const REAL& real){
+    return complext<T>(comp.real()/real,comp.imag()/real);
+  }
+
+  /** Squared value of a real number. */
+  __inline__ __host__ __device__ float norm(const float& r){
+    return r*r;
+  }
+
+  /** Squared value of a real number. */
+  __inline__ __host__ __device__ double norm(const double& r){
+    return r*r;
+  }
+
+  /** Squared magnitude of a complex number (no square root is taken). */
+  template<class T> __inline__ __host__ __device__ T norm(const complext<T>& z){
+    const T re = z.real();
+    const T im = z.imag();
+    return re*re+im*im;
+  }
+
+  /** Conjugate of a real number: the number itself. */
+  __inline__ __host__ __device__ double conj(const double& r){
+    return r;
+  }
+
+  /** Conjugate of a real number: the number itself. */
+  __inline__ __host__ __device__ float conj(const float& r){
+    return r;
+  }
+  
+  /** Complex conjugate: same real part, negated imaginary part. The argument is not modified. */
+  template<class T> __inline__ __host__ __device__ complext<T> conj( const complext<T>& z ){
+    return complext<T>(z.real(),-z.imag());
+  }
+}
diff --git a/toolboxes/core/core_defines.h.in b/toolboxes/core/core_defines.h.in
new file mode 100644
index 0000000..fc924c0
--- /dev/null
+++ b/toolboxes/core/core_defines.h.in
@@ -0,0 +1,28 @@
+/** \file core_defines.h
+    \brief Autogenerated header providing definitions of __host__, __device__, and __inline__ for systems on which Cuda is not installed.
+*/
+
+#pragma once
+
+// Notice:
+// -------
+//
+// The header core_defines.h is autogenerated 
+// by cmake from core_defines.h.in
+//
+
+// Definition of Cuda availability passed to C++.
+// The @GADGETRON_CUDA_FOUND_BOOL@ placeholder is replaced when this header is generated.
+
+#define GADGETRON_CUDA_IS_AVAILABLE @GADGETRON_CUDA_FOUND_BOOL@
+
+// Use the Cuda host definitions if available.
+// Otherwise __host__ and __device__ are defined empty (as no device code is compiled anyway)
+// and __inline__ is mapped to the plain inline keyword.
+
+#if GADGETRON_CUDA_IS_AVAILABLE
+#include "host_defines.h"
+#else
+#define __host__
+#define __device__
+#define __inline__ inline
+#endif
diff --git a/toolboxes/core/cpu/CMakeLists.txt b/toolboxes/core/cpu/CMakeLists.txt
new file mode 100644
index 0000000..cfc2f19
--- /dev/null
+++ b/toolboxes/core/cpu/CMakeLists.txt
@@ -0,0 +1,84 @@
+# Build script for the cpucore library (host-side array classes and FFT).
+
+# Windows only: define the build flag for this library
+# (presumably read by cpucore_export.h to select dllexport -- confirm).
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUCORE__)
+endif (WIN32)
+
+# Header search paths: FFTW3 plus the core toolbox directories
+include_directories(
+  ${FFTW3_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  )
+
+# Windows only: add the Boost library directories to the linker search path
+if(WIN32)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+# Optional MKL support
+if (MKL_FOUND)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+
+# The library target; hoNDFFT.cpp is the only .cpp file in this list, the rest are headers
+add_library(cpucore ${LIBTYPE} 
+  ../NDArray.h
+  ../complext.h
+  ../GadgetronException.h
+  ../GadgetronCommon.h
+  ../GadgetronTimer.h
+  ../SerializableObject.h
+  cpucore_export.h 
+  hoNDArray.h
+  hoNDArray.hxx
+  hoNDArray_utils.h
+  hoNDArray_fileio.h
+  hoNDFFT.h
+  ho2DArray.h
+  ho2DArray.hxx
+  ho3DArray.h
+  ho3DArray.hxx
+  ho4DArray.h
+  ho4DArray.hxx
+  ho5DArray.h
+  ho5DArray.hxx
+  ho6DArray.h
+  ho6DArray.hxx
+  ho7DArray.h
+  ho7DArray.hxx 
+  hoMatrix.h
+  hoNDFFT.cpp
+  )
+
+# MKL_LIBRARIES is listed unconditionally here (no MKL_FOUND guard around it)
+target_link_libraries(cpucore 
+  ${FFTW3_LIBRARIES} 
+  ${Boost_LIBRARIES} 
+  ${MKL_LIBRARIES} 
+  )
+
+install(TARGETS cpucore DESTINATION lib)
+
+# Files installed to the include directory.
+# NOTE(review): hoMatrix.cpp is installed with the headers but is not part of the add_library
+# list above -- presumably it is included by hoMatrix.h as template code; confirm.
+install(FILES
+  cpucore_export.h 
+  hoNDArray.h
+  hoNDArray.hxx
+  hoNDArray_utils.h
+  hoNDArray_fileio.h
+  hoNDFFT.h
+  ho2DArray.h
+  ho2DArray.hxx
+  ho3DArray.h
+  ho3DArray.hxx
+  ho4DArray.h
+  ho4DArray.hxx
+  ho5DArray.h
+  ho5DArray.hxx
+  ho6DArray.h
+  ho6DArray.hxx
+  ho7DArray.h
+  ho7DArray.hxx
+  hoMatrix.h
+  hoMatrix.cpp
+  DESTINATION include)
+
+# The Armadillo based math library is only built when Armadillo was found
+if (ARMADILLO_FOUND)
+    add_subdirectory(arma_math)
+endif (ARMADILLO_FOUND)
+
+add_subdirectory(hostutils)
diff --git a/toolboxes/core/cpu/arma_math/CMakeLists.txt b/toolboxes/core/cpu/arma_math/CMakeLists.txt
new file mode 100644
index 0000000..e5b2873
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Build script for the cpucore_math library (Armadillo based math on hoNDArray).
+
+# Windows only: define the build flag that makes cpucore_math_export.h export the dll symbols
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUCORE_MATH__)
+endif (WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${ARMADILLO_INCLUDE_DIRS}
+  )
+
+# Optional MKL support
+if (MKL_FOUND)
+  INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+  LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+
+add_library(cpucore_math ${LIBTYPE} 
+  hoNDArray_operators.h
+  hoNDArray_operators.cpp
+  hoNDArray_elemwise.h
+  hoNDArray_elemwise.cpp
+  hoNDArray_blas.h
+  hoNDArray_blas.cpp
+  hoNDArray_reductions.cpp
+  )
+
+# Link against cpucore and Armadillo; the MKL libraries are added only when MKL was found
+if (MKL_FOUND)
+  target_link_libraries(cpucore_math 
+    cpucore
+    ${ARMADILLO_LIBRARIES}
+    ${MKL_LIBRARIES}
+    )
+else (MKL_FOUND)
+  target_link_libraries(cpucore_math 
+    cpucore
+    ${ARMADILLO_LIBRARIES}
+    )
+endif (MKL_FOUND)
+
+install(TARGETS cpucore_math DESTINATION lib)
+
+# Headers installed to the include directory
+install(FILES 	
+  cpucore_math_export.h
+  hoArmadillo.h
+  hoNDArray_operators.h
+  hoNDArray_elemwise.h
+  hoNDArray_blas.h
+  hoNDArray_reductions.h
+  hoNDArray_math.h
+  DESTINATION include)
diff --git a/toolboxes/core/cpu/arma_math/cpucore_math_export.h b/toolboxes/core/cpu/arma_math/cpucore_math_export.h
new file mode 100644
index 0000000..c7e8dd4
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/cpucore_math_export.h
@@ -0,0 +1,22 @@
+/** \file cpucore_math_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef CPUCORE_MATH_EXPORT_H_
+#define CPUCORE_MATH_EXPORT_H_
+
+// EXPORTCPUCOREMATH expands to:
+//  - nothing on non-Windows systems and for static toolbox builds (BUILD_TOOLBOX_STATIC),
+//  - __declspec(dllexport) while the cpucore_math library itself is being built,
+//  - __declspec(dllimport) otherwise.
+#if defined (WIN32)
+    #ifdef BUILD_TOOLBOX_STATIC
+        #define EXPORTCPUCOREMATH
+    #else
+        #if defined (__BUILD_GADGETRON_CPUCORE_MATH__) || defined (cpucore_math_EXPORTS)
+            #define EXPORTCPUCOREMATH __declspec(dllexport)
+        #else
+            #define EXPORTCPUCOREMATH __declspec(dllimport)
+        #endif
+    #endif
+#else
+#define EXPORTCPUCOREMATH
+#endif
+
+#endif /* CPUCORE_MATH_EXPORT_H_ */
diff --git a/toolboxes/core/cpu/arma_math/hoArmadillo.h b/toolboxes/core/cpu/arma_math/hoArmadillo.h
new file mode 100644
index 0000000..abb481c
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoArmadillo.h
@@ -0,0 +1,84 @@
+#pragma once
+#define ARMA_64BIT_WORD
+#include "hoNDArray.h"
+#include <armadillo>
+
+/** \file hoArmadillo.h
+\brief Utilities to create an Armadillo matrix or column vector from an hoNDArray.
+
+Utilities to create an Armadillo matrix or column vector from an hoNDArray.
+A helper function that creates an hoNDArray from an Armadillo matrix or vector is deliberately omitted:
+The recommended approach to using Armadillo's functionality and providing an hoNDArray of the result is 
+1) create an hoNDArray to hold the result, 
+2) convert this array to an Armadillo matrix or vector using the utilities provided in this header,
+3) assign the desired Armadillo computation to this array.
+This approach ensures that the Gadgetron -- and not Armadillo -- is responsible for subsequent memory handling.
+We refer to hoNDArray_math.h for some specific examples on how to use this Armadillo interface.
+*/
+
+namespace Gadgetron{
+
+  /**
+   * @brief Creates an Armadillo matrix from a two-dimensional hoNDArray.
+   * @param[in] x Input array; it must have exactly two dimensions.
+   * @return An Armadillo matrix of size get_size(0) x get_size(1) mapped to the data pointer of the hoNDArray.
+   * @throws std::runtime_error if the array is not two-dimensional.
+   */
+  template<class T> arma::Mat<typename stdType<T>::Type> as_arma_matrix( hoNDArray<T> *x )
+  {
+    typedef typename stdType<T>::Type elem_type;
+
+    if( x->get_number_of_dimensions() != 2 )
+      throw std::runtime_error("Wrong number of dimensions. Cannot convert hoNDArray to matrix");
+
+    elem_type *data = (elem_type*) x->get_data_ptr();
+    return arma::Mat<elem_type>( data, x->get_size(0), x->get_size(1), false, true );
+  }
+
+  /**
+   * @brief Creates an Armadillo matrix from a two-dimensional const hoNDArray.
+   * @param[in] x Input array; it must have exactly two dimensions.
+   * @return A const Armadillo matrix of size get_size(0) x get_size(1) mapped to the data pointer of the hoNDArray.
+   * @throws std::runtime_error if the array is not two-dimensional.
+   */
+  template<class T> const arma::Mat<typename stdType<T>::Type> as_arma_matrix( const hoNDArray<T> *x )
+  {
+    typedef typename stdType<T>::Type elem_type;
+
+    if( x->get_number_of_dimensions() != 2 )
+      throw std::runtime_error("Wrong number of dimensions. Cannot convert hoNDArray to matrix");
+
+    elem_type *data = (elem_type*) x->get_data_ptr();
+    return arma::Mat<elem_type>( data, x->get_size(0), x->get_size(1), false, true );
+  }
+  
+  /**
+   * @brief Creates an Armadillo column vector from an arbitrary-dimensional hoNDArray.
+   * @param[in] x Input array.
+   * @return An Armadillo column vector with get_number_of_elements() entries mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> arma::Col<typename stdType<T>::Type > as_arma_col( hoNDArray<T> *x )
+  {
+    typedef typename stdType<T>::Type elem_type;
+
+    elem_type *data = (elem_type*) x->get_data_ptr();
+    return arma::Col<elem_type>( data, x->get_number_of_elements(), false, true );
+  }
+
+  /**
+   * @brief Creates an Armadillo column vector from an arbitrary-dimensional const hoNDArray.
+   * @param[in] x Input array.
+   * @return A const Armadillo column vector with get_number_of_elements() entries mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> const arma::Col<typename stdType<T>::Type > as_arma_col( const hoNDArray<T> *x )
+  {
+    typedef typename stdType<T>::Type elem_type;
+
+    elem_type *data = (elem_type*) x->get_data_ptr();
+    return arma::Col<elem_type>( data, x->get_number_of_elements(), false, true );
+  }
+
+  /**
+   * @brief Creates an Armadillo row vector from an arbitrary-dimensional hoNDArray.
+   * @param[in] x Input array.
+   * @return An Armadillo row vector with get_number_of_elements() entries mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> arma::Row<typename stdType<T>::Type > as_arma_row( hoNDArray<T> *x )
+  {
+    typedef typename stdType<T>::Type elem_type;
+
+    elem_type *data = (elem_type*) x->get_data_ptr();
+    return arma::Row<elem_type>( data, x->get_number_of_elements(), false, true );
+  }
+
+  /**
+   * @brief Creates an Armadillo row vector from an arbitrary-dimensional const hoNDArray.
+   * @param[in] x Input array.
+   * @return A const Armadillo row vector with get_number_of_elements() entries mapped to the data pointer of the hoNDArray.
+   */
+  template<class T> const arma::Row<typename stdType<T>::Type > as_arma_row( const hoNDArray<T> *x )
+  {
+    typedef typename stdType<T>::Type elem_type;
+
+    elem_type *data = (elem_type*) x->get_data_ptr();
+    return arma::Row<elem_type>( data, x->get_number_of_elements(), false, true );
+  }
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_blas.cpp b/toolboxes/core/cpu/arma_math/hoNDArray_blas.cpp
new file mode 100644
index 0000000..47c793b
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_blas.cpp
@@ -0,0 +1,648 @@
+#include "hoNDArray_blas.h"
+
+namespace Gadgetron{
+
+    /**
+     * Dot product of two arrays viewed as column vectors.
+     *
+     * Uses arma::cdot when cc is true and arma::dot otherwise.
+     * Throws std::runtime_error if either pointer is null or if the two
+     * arrays hold a different number of elements.
+     */
+    template<class T> T dot( hoNDArray<T> *x, hoNDArray<T> *y, bool cc )
+    {
+        typedef typename stdType<T>::Type stdT;
+
+        if( !x || !y )
+            throw std::runtime_error("Gadgetron::dot(): Invalid input array");
+
+        if( x->get_number_of_elements() != y->get_number_of_elements() )
+            throw std::runtime_error("Gadgetron::dot(): Array sizes mismatch");
+
+        arma::Col<stdT> lhs = as_arma_col(x);
+        arma::Col<stdT> rhs = as_arma_col(y);
+
+        stdT product;
+        if( cc )
+            product = arma::cdot(lhs,rhs);
+        else
+            product = arma::dot(lhs,rhs);
+
+        // Return the Armadillo-side value reinterpreted as the caller's type T
+        return *((T*)(&product));
+    }
+
+    /**
+     * 1-norm of the array viewed as a column vector (arma::norm(.,1)).
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> typename realType<T>::Type asum( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type realT;
+        typedef typename stdType<T>::Type stdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+        arma::Col<stdT> view = as_arma_col(x);
+        return realT(arma::norm(view,1));
+    }
+
+    /**
+     * asum for std::complex arrays: 1-norm of the vector |re| + |im|
+     * formed element by element.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> T asum( hoNDArray< std::complex<T> > *x )
+    {
+        if( !x )
+            throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+        arma::Col<typename stdType< std::complex<T> >::Type> view = as_arma_col(x);
+        return arma::norm(arma::abs(real(view))+arma::abs(imag(view)),1);
+    }
+
+    /**
+     * asum for complext arrays: 1-norm of the vector |re| + |im|
+     * formed element by element.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> T asum( hoNDArray< complext<T> > *x )
+    {
+        if( !x )
+            throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+        arma::Col<typename stdType< complext<T> >::Type> view = as_arma_col(x);
+        return arma::norm(arma::abs(real(view))+arma::abs(imag(view)),1);
+    }
+
+    /**
+     * 2-norm of the array viewed as a column vector (arma::norm(.,2)).
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> typename realType<T>::Type nrm2( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type realT;
+        typedef typename stdType<T>::Type stdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::nrm2(): Invalid input array");
+
+        arma::Col<stdT> view = as_arma_col(x);
+        return realT(arma::norm(view,2));
+    }
+
+    /**
+     * 1-norm of the array viewed as a column vector (arma::norm(.,1)).
+     *
+     * Throws std::runtime_error if x is null, using the same plain throw
+     * as the other functions in this file.
+     */
+    template<class T> typename realType<T>::Type nrm1( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::nrm1(): Invalid input array");
+
+        typedef typename realType<T>::Type realT;
+        arma::Col<typename stdType<T>::Type> xM = as_arma_col(x);
+        return realT(arma::norm(xM,1));
+    }
+
+    /**
+     * Index (0-based) of the element with the smallest absolute value.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> size_t amin( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+        typedef typename realType<T>::Type realT;
+        arma::Col<realT> xM = arma::abs(as_arma_col(x));
+        arma::uword idx;
+        (void) xM.min(idx); // only the position is needed; the minimum value is discarded
+        return idx;
+    }
+
+    /**
+     * Index (0-based) of the std::complex element with the smallest
+     * |re| + |im|.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> size_t amin( hoNDArray< std::complex<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        (void) xM.min(idx); // only the position is needed; the minimum value is discarded
+        return idx;
+    }
+
+    /**
+     * Index (0-based) of the complext element with the smallest
+     * |re| + |im|.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> size_t amin( hoNDArray< complext<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        (void) xM.min(idx); // only the position is needed; the minimum value is discarded
+        return idx;
+    }
+
+    /**
+     * Index (0-based) of the element with the largest absolute value.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> size_t amax( hoNDArray<T> *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+        typedef typename realType<T>::Type realT;
+        arma::Col<realT> xM = arma::abs(as_arma_col(x));
+        arma::uword idx;
+        (void) xM.max(idx); // only the position is needed; the maximum value is discarded
+        return idx;
+    }
+
+    /**
+     * Index (0-based) of the std::complex element with the largest
+     * |re| + |im|.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> size_t amax( hoNDArray< std::complex<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        (void) xM.max(idx); // only the position is needed; the maximum value is discarded
+        return idx;
+    }
+
+    /**
+     * Index (0-based) of the complext element with the largest
+     * |re| + |im|.
+     * Throws std::runtime_error if x is null.
+     */
+    template<class T> size_t amax( hoNDArray< complext<T> > *x )
+    {
+        if( x == 0x0 )
+            throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+        arma::Col<T> xM = arma::abs(real(as_arma_col(x)))+arma::abs(imag(as_arma_col(x)));
+        arma::uword idx;
+        (void) xM.max(idx); // only the position is needed; the maximum value is discarded
+        return idx;
+    }
+
+    /**
+     * y += a*x, with both arrays viewed as column vectors.
+     *
+     * Throws std::runtime_error if either pointer is null or if the two
+     * arrays hold a different number of elements.
+     */
+    template<class T> void axpy( T a, hoNDArray<T> *x, hoNDArray<T> *y )
+    {
+        typedef typename stdType<T>::Type stdT;
+
+        if( !x || !y )
+            throw std::runtime_error("Gadgetron::axpy(): Invalid input array");
+
+        if( x->get_number_of_elements() != y->get_number_of_elements() )
+            throw std::runtime_error("Gadgetron::axpy(): Array sizes mismatch");
+
+        // Reinterpret the scalar as the element type Armadillo works with
+        stdT scale = *((stdT*)(&a));
+
+        arma::Col<stdT> src = as_arma_col(x);
+        arma::Col<stdT> dst = as_arma_col(y);
+
+        // dst is meant to be a view onto y's buffer, so this updates y in place
+        dst += (scale*src);
+    }
+
+    #ifdef USE_MKL
+
+    /**
+     * MKL specialization of nrm1 for float arrays, computed with sasum.
+     * Unlike the generic template, a null pointer yields 0 instead of an exception.
+     */
+    template<> float nrm1( hoNDArray<float> *x )
+    {
+        if ( !x )
+        {
+            return 0;
+        }
+
+        MKL_INT count = x->get_number_of_elements();
+        MKL_INT stride = 1;
+        return sasum(&count, x->begin(), &stride);
+    }
+
+    /**
+     * MKL specialization of nrm1 for double arrays, computed with dasum.
+     * Unlike the generic template, a null pointer yields 0 instead of an exception.
+     */
+    template<> double nrm1( hoNDArray<double> *x )
+    {
+        if ( !x )
+        {
+            return 0;
+        }
+
+        MKL_INT count = x->get_number_of_elements();
+        MKL_INT stride = 1;
+        return dasum(&count, x->begin(), &stride);
+    }
+
+    // BLAS dotc and dotu
+    /**
+     * res = conj(x) dot y for single-precision complex arrays (cdotc).
+     * If the element counts differ, an error is reported and zero is returned.
+     */
+    GT_Complex8 dotc(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y)
+    {
+        const bool sameLength = ( x.get_number_of_elements() == y.get_number_of_elements() );
+        if ( !sameLength )
+        {
+            GADGET_ERROR_MSG("dotc(x, y), inputs have differnet length ...");
+            return 0.0;
+        }
+
+        MKL_INT count = x.get_number_of_elements();
+        MKL_INT strideX(1), strideY(1);
+
+        GT_Complex8 result;
+        cdotc(reinterpret_cast<MKL_Complex8*>(&result),
+              &count,
+              reinterpret_cast<const MKL_Complex8*>(x.begin()), &strideX,
+              reinterpret_cast<const MKL_Complex8*>(y.begin()), &strideY);
+        return result;
+    }
+
+    /**
+     * res = conj(x) dot y for double-precision complex arrays (zdotc).
+     * If the element counts differ, an error is reported and zero is returned.
+     */
+    GT_Complex16 dotc(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y)
+    {
+        const bool sameLength = ( x.get_number_of_elements() == y.get_number_of_elements() );
+        if ( !sameLength )
+        {
+            GADGET_ERROR_MSG("dotc(x, y), inputs have differnet length ...");
+            return 0;
+        }
+
+        MKL_INT count = x.get_number_of_elements();
+        MKL_INT strideX(1), strideY(1);
+
+        GT_Complex16 result;
+        zdotc(reinterpret_cast<MKL_Complex16*>(&result),
+              &count,
+              reinterpret_cast<const MKL_Complex16*>(x.begin()), &strideX,
+              reinterpret_cast<const MKL_Complex16*>(y.begin()), &strideY);
+        return result;
+    }
+
+    /**
+     * res = x dot y (no conjugation) for single-precision complex arrays (cdotu).
+     * If the element counts differ, an error is reported and zero is returned.
+     */
+    GT_Complex8 dotu(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y)
+    {
+        const bool sameLength = ( x.get_number_of_elements() == y.get_number_of_elements() );
+        if ( !sameLength )
+        {
+            GADGET_ERROR_MSG("dotu(x, y), inputs have differnet length ...");
+            return 0;
+        }
+
+        MKL_INT count = x.get_number_of_elements();
+        MKL_INT strideX(1), strideY(1);
+
+        GT_Complex8 result;
+        cdotu(reinterpret_cast<MKL_Complex8*>(&result),
+              &count,
+              reinterpret_cast<const MKL_Complex8*>(x.begin()), &strideX,
+              reinterpret_cast<const MKL_Complex8*>(y.begin()), &strideY);
+        return result;
+    }
+
+    /**
+     * res = x dot y (no conjugation) for double-precision complex arrays (zdotu).
+     * If the element counts differ, an error is reported and zero is returned.
+     */
+    GT_Complex16 dotu(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y)
+    {
+        const bool sameLength = ( x.get_number_of_elements() == y.get_number_of_elements() );
+        if ( !sameLength )
+        {
+            GADGET_ERROR_MSG("dotu(x, y), inputs have differnet length ...");
+            return 0;
+        }
+
+        MKL_INT count = x.get_number_of_elements();
+        MKL_INT strideX(1), strideY(1);
+
+        GT_Complex16 result;
+        zdotu(reinterpret_cast<MKL_Complex16*>(&result),
+              &count,
+              reinterpret_cast<const MKL_Complex16*>(x.begin()), &strideX,
+              reinterpret_cast<const MKL_Complex16*>(y.begin()), &strideY);
+        return result;
+    }
+
+    // other variants for axpy
+    /**
+     * r = a*x + y for float arrays, computed with cblas_saxpy.
+     *
+     * r is first made equal to y: it is assigned from y when its element
+     * count differs from x, otherwise y is copied in with memcpy unless r
+     * and y are the same object. a*x is then accumulated into r.
+     *
+     * If r is the same object as x (and not y), copying y into r would
+     * overwrite x before it is used. In that case r is scaled by a in place
+     * and y is added afterwards.
+     *
+     * Returns false when the element counts of x and y differ (through
+     * GADGET_CHECK_RETURN_FALSE) or when an exception is caught; true otherwise.
+     */
+    bool axpy(float a, const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+            MKL_INT N = (MKL_INT)(x.get_number_of_elements());
+            const MKL_INT incX(1), incY(1);
+
+            if ( &r == &x && &r != &y )
+            {
+                // r aliases x: compute r = a*r, then r += y
+                cblas_sscal (N, a, r.begin(), incY);
+                cblas_saxpy (N, 1.0f, y.begin(), incX, r.begin(), incY);
+                return true;
+            }
+
+            if ( r.get_number_of_elements() != x.get_number_of_elements() )
+            {
+                r = y;
+            }
+            else
+            {
+                if ( &r != &y )
+                {
+                    memcpy(r.begin(), y.begin(), r.get_number_of_bytes());
+                }
+            }
+
+            cblas_saxpy (N, a, x.begin(), incX, r.begin(), incY);
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in axpy(float a, const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * r = a*x + y for double arrays, computed with cblas_daxpy.
+     *
+     * r is first made equal to y: it is assigned from y when its element
+     * count differs from x, otherwise y is copied in with memcpy unless r
+     * and y are the same object. a*x is then accumulated into r.
+     *
+     * If r is the same object as x (and not y), copying y into r would
+     * overwrite x before it is used. In that case r is scaled by a in place
+     * and y is added afterwards.
+     *
+     * Returns false when the element counts of x and y differ (through
+     * GADGET_CHECK_RETURN_FALSE) or when an exception is caught; true otherwise.
+     */
+    bool axpy(double a, const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+            MKL_INT N = (MKL_INT)(x.get_number_of_elements());
+            const MKL_INT incX(1), incY(1);
+
+            if ( &r == &x && &r != &y )
+            {
+                // r aliases x: compute r = a*r, then r += y
+                cblas_dscal (N, a, r.begin(), incY);
+                cblas_daxpy (N, 1.0, y.begin(), incX, r.begin(), incY);
+                return true;
+            }
+
+            if ( r.get_number_of_elements() != x.get_number_of_elements() )
+            {
+                r = y;
+            }
+            else
+            {
+                if ( &r != &y )
+                {
+                    memcpy(r.begin(), y.begin(), r.get_number_of_bytes());
+                }
+            }
+
+            cblas_daxpy (N, a, x.begin(), incX, r.begin(), incY);
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in axpy(double a, const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * r = a*x + y for single-precision complex arrays, computed with cblas_caxpy.
+     *
+     * r is first made equal to y: it is assigned from y when its element
+     * count differs from x, otherwise y is copied in with memcpy unless r
+     * and y are the same object. a*x is then accumulated into r.
+     *
+     * If r is the same object as x (and not y), copying y into r would
+     * overwrite x before it is used. In that case r is scaled by a in place
+     * and y is added afterwards.
+     *
+     * Returns false when the element counts of x and y differ (through
+     * GADGET_CHECK_RETURN_FALSE) or when an exception is caught; true otherwise.
+     */
+    bool axpy(const GT_Complex8& a, const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+            MKL_INT N = (MKL_INT)(x.get_number_of_elements());
+            const MKL_INT incX(1), incY(1);
+
+            if ( &r == &x && &r != &y )
+            {
+                // r aliases x: compute r = a*r, then r += 1*y
+                const GT_Complex8 one(1.0f, 0.0f);
+                cblas_cscal (N, &a, r.begin(), incY);
+                cblas_caxpy (N, &one, y.begin(), incX, r.begin(), incY);
+                return true;
+            }
+
+            if ( r.get_number_of_elements() != x.get_number_of_elements() )
+            {
+                r = y;
+            }
+            else
+            {
+                if ( &r != &y )
+                {
+                    memcpy(r.begin(), y.begin(), r.get_number_of_bytes());
+                }
+            }
+
+            cblas_caxpy (N, &a, x.begin(), incX, r.begin(), incY);
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in axpy(const GT_Complex8& a, const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * r = a*x + y for double-precision complex arrays, computed with cblas_zaxpy.
+     *
+     * r is first made equal to y: it is assigned from y when its element
+     * count differs from x, otherwise y is copied in with memcpy unless r
+     * and y are the same object. a*x is then accumulated into r.
+     *
+     * If r is the same object as x (and not y), copying y into r would
+     * overwrite x before it is used. In that case r is scaled by a in place
+     * and y is added afterwards.
+     *
+     * Returns false when the element counts of x and y differ (through
+     * GADGET_CHECK_RETURN_FALSE) or when an exception is caught; true otherwise.
+     */
+    bool axpy(const GT_Complex16& a, const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r)
+    {
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+            MKL_INT N = (MKL_INT)(x.get_number_of_elements());
+            const MKL_INT incX(1), incY(1);
+
+            if ( &r == &x && &r != &y )
+            {
+                // r aliases x: compute r = a*r, then r += 1*y
+                const GT_Complex16 one(1.0, 0.0);
+                cblas_zscal (N, &a, r.begin(), incY);
+                cblas_zaxpy (N, &one, y.begin(), incX, r.begin(), incY);
+                return true;
+            }
+
+            if ( r.get_number_of_elements() != x.get_number_of_elements() )
+            {
+                r = y;
+            }
+            else
+            {
+                if ( &r != &y )
+                {
+                    memcpy(r.begin(), y.begin(), r.get_number_of_bytes());
+                }
+            }
+
+            cblas_zaxpy (N, &a, x.begin(), incX, r.begin(), incY);
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in axpy(const GT_Complex16& a, const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // vector-scalar product, performed in place: x = a*x
+    /**
+     * Scales a float array by a with cblas_sscal.
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(float a, hoNDArray<float>& x)
+    {
+        try
+        {
+            const MKL_INT count = (MKL_INT)(x.get_number_of_elements());
+            cblas_sscal (count, a, x.begin(), 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(float a, hoNDArray<float>& x) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales a double array in place by a with cblas_dscal.
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(double a, hoNDArray<double>& x)
+    {
+        try
+        {
+            const MKL_INT count = (MKL_INT)(x.get_number_of_elements());
+            cblas_dscal (count, a, x.begin(), 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(double a, hoNDArray<double>& x) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales a single-precision complex array in place by the real factor a.
+     * The factor is promoted to GT_Complex8 and passed to cblas_cscal.
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(float a, hoNDArray<GT_Complex8>& x)
+    {
+        try
+        {
+            GT_Complex8 factor = GT_Complex8(a);
+            const MKL_INT count = x.get_number_of_elements();
+            cblas_cscal (count, &factor, x.begin(), 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(float a, hoNDArray<GT_Complex8>& x) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales a double-precision complex array in place by the real factor a.
+     * The factor is promoted to GT_Complex16 and passed to cblas_zscal.
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(double a, hoNDArray<GT_Complex16>& x)
+    {
+        try
+        {
+            GT_Complex16 factor = GT_Complex16(a);
+            const MKL_INT count = x.get_number_of_elements();
+            cblas_zscal (count, &factor, x.begin(), 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(double a, hoNDArray<GT_Complex16>& x) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales a single-precision complex array in place by the complex factor a (cblas_cscal).
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(GT_Complex8 a, hoNDArray<GT_Complex8>& x)
+    {
+        try
+        {
+            const MKL_INT count = x.get_number_of_elements();
+            cblas_cscal (count, &a, x.begin(), 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(GT_Complex8 a, hoNDArray<GT_Complex8>& x) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales a double-precision complex array in place by the complex factor a (cblas_zscal).
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(GT_Complex16 a, hoNDArray<GT_Complex16>& x)
+    {
+        try
+        {
+            const MKL_INT count = x.get_number_of_elements();
+            cblas_zscal (count, &a, x.begin(), 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(GT_Complex16 a, hoNDArray<GT_Complex16>& x) ... ");
+        }
+
+        return false;
+    }
+
+    // -----------------------
+    // scal overloads operating on a raw pointer and an element count
+
+    /**
+     * Scales the N floats starting at x in place by a (cblas_sscal).
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(float a, float*x, long long N)
+    {
+        try
+        {
+            const MKL_INT count = (MKL_INT)(N);
+            cblas_sscal (count, a, x, 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(float a, float*x, long long N) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales the N doubles starting at x in place by a (cblas_dscal).
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(double a, double*x, long long N)
+    {
+        try
+        {
+            const MKL_INT count = (MKL_INT)(N);
+            cblas_dscal (count, a, x, 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(double a, double*x, long long N) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales the N single-precision complex values starting at x in place by the real factor a.
+     * The factor is promoted to GT_Complex8 and passed to cblas_cscal.
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(float a, GT_Complex8*x, long long N)
+    {
+        try
+        {
+            GT_Complex8 factor = GT_Complex8(a);
+            cblas_cscal (N, &factor, x, 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(float a, GT_Complex8*x, long long N) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales the N double-precision complex values starting at x in place by the real factor a.
+     * The factor is promoted to GT_Complex16 and passed to cblas_zscal.
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(double a, GT_Complex16*x, long long N)
+    {
+        try
+        {
+            GT_Complex16 factor = GT_Complex16(a);
+            cblas_zscal (N, &factor, x, 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(double a, GT_Complex16*x, long long N) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales the N single-precision complex values starting at x in place by the complex factor a (cblas_cscal).
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(GT_Complex8 a, GT_Complex8*x, long long N)
+    {
+        try
+        {
+            cblas_cscal (N, &a, x, 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(GT_Complex8 a, GT_Complex8*x, long long N) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Scales the N double-precision complex values starting at x in place by the complex factor a (cblas_zscal).
+     * Returns true on success; if an exception is caught it is reported and false is returned.
+     */
+    bool scal(GT_Complex16 a, GT_Complex16*x, long long N)
+    {
+        try
+        {
+            cblas_zscal (N, &a, x, 1);
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in scal(GT_Complex16 a, GT_Complex16*x, long long N) ... ");
+        }
+
+        return false;
+    }
+
+    /**
+     * Sorts the float array x into r with LAPACKE_slasrt.
+     *
+     * Unless r and x are the same object, r first receives a copy of x:
+     * by assignment when the element counts differ, by memcpy otherwise.
+     * isascending selects order 'I' (true) or 'D' (false).
+     * The LAPACKE status is checked through GADGET_CHECK_RETURN_FALSE.
+     */
+    bool sort(const hoNDArray<float>& x, hoNDArray<float>& r, bool isascending)
+    {
+        const bool sameObject = ( &r == &x );
+        if ( !sameObject )
+        {
+            const bool sameSize = ( r.get_number_of_elements()==x.get_number_of_elements() );
+            if ( sameSize )
+            {
+                memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+            }
+            else
+            {
+                r = x;
+            }
+        }
+
+        if ( !isascending )
+        {
+            GADGET_CHECK_RETURN_FALSE(LAPACKE_slasrt('D', r.get_number_of_elements(), r.begin())==0);
+            return true;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(LAPACKE_slasrt('I', r.get_number_of_elements(), r.begin())==0);
+        return true;
+    }
+
+    /**
+     * Sorts the double array x into r with LAPACKE_dlasrt.
+     *
+     * Unless r and x are the same object, r first receives a copy of x:
+     * by assignment when the element counts differ, by memcpy otherwise.
+     * isascending selects order 'I' (true) or 'D' (false).
+     * The LAPACKE status is checked through GADGET_CHECK_RETURN_FALSE.
+     */
+    bool sort(const hoNDArray<double>& x, hoNDArray<double>& r, bool isascending)
+    {
+        const bool sameObject = ( &r == &x );
+        if ( !sameObject )
+        {
+            const bool sameSize = ( r.get_number_of_elements()==x.get_number_of_elements() );
+            if ( sameSize )
+            {
+                memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+            }
+            else
+            {
+                r = x;
+            }
+        }
+
+        if ( !isascending )
+        {
+            GADGET_CHECK_RETURN_FALSE(LAPACKE_dlasrt('D', r.get_number_of_elements(), r.begin())==0);
+            return true;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(LAPACKE_dlasrt('I', r.get_number_of_elements(), r.begin())==0);
+        return true;
+    }
+
+    #endif // USE_MKL
+
+    //
+    // Instantiation
+    //
+    // Explicit instantiations of the templates defined above, one group per
+    // element type. Each group covers dot, asum, nrm2, amin, amax and axpy.
+    // NOTE(review): nrm1 has no explicit instantiation in this list; only the
+    // float/double specializations inside the USE_MKL section define it.
+    //
+
+    // float
+    template EXPORTCPUCOREMATH float dot<float>( hoNDArray<float>*, hoNDArray<float>*, bool );
+    template EXPORTCPUCOREMATH float asum<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH float nrm2<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH size_t amin<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH size_t amax<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void axpy<float>( float, hoNDArray<float>*, hoNDArray<float>* );
+
+    // double
+    template EXPORTCPUCOREMATH double dot<double>( hoNDArray<double>*, hoNDArray<double>*, bool );
+    template EXPORTCPUCOREMATH double asum<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH double nrm2<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH size_t amin<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH size_t amax<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void axpy<double>( double, hoNDArray<double>*, hoNDArray<double>* );
+
+    // std::complex<float> (asum/amin/amax use the std::complex overloads, hence <float>)
+    template EXPORTCPUCOREMATH std::complex<float> dot< std::complex<float> >( hoNDArray< std::complex<float> >*, hoNDArray< std::complex<float> >*, bool );
+    template EXPORTCPUCOREMATH float asum<float>( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH float nrm2< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH size_t amin<float>( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH size_t amax<float>( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void axpy< std::complex<float> >( std::complex<float> , hoNDArray< std::complex<float> >*, hoNDArray< std::complex<float> >* );
+
+    // std::complex<double>
+    template EXPORTCPUCOREMATH std::complex<double> dot< std::complex<double> >( hoNDArray< std::complex<double> >*, hoNDArray< std::complex<double> >*, bool );
+    template EXPORTCPUCOREMATH double asum<double>( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH double nrm2< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH size_t amin<double>( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH size_t amax<double>( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void axpy< std::complex<double> >( std::complex<double> , hoNDArray< std::complex<double> >*, hoNDArray< std::complex<double> >* );
+
+    // complext<float> (asum/amin/amax use the complext overloads, hence <float>)
+    template EXPORTCPUCOREMATH complext<float> dot< complext<float> >( hoNDArray< complext<float> >*, hoNDArray< complext<float> >*, bool );
+    template EXPORTCPUCOREMATH float asum<float>( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH float nrm2< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH size_t amin<float>( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH size_t amax<float>( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void axpy< complext<float> >( complext<float> , hoNDArray< complext<float> >*, hoNDArray< complext<float> >* );
+
+    // complext<double>
+    template EXPORTCPUCOREMATH complext<double> dot< complext<double> >( hoNDArray< complext<double> >*, hoNDArray< complext<double> >*, bool );
+    template EXPORTCPUCOREMATH double asum<double>( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH double nrm2< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH size_t amin<double>( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH size_t amax<double>( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void axpy< complext<double> >( complext<double> , hoNDArray< complext<double> >*, hoNDArray< complext<double> >* );
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_blas.h b/toolboxes/core/cpu/arma_math/hoNDArray_blas.h
new file mode 100644
index 0000000..d82a12c
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_blas.h
@@ -0,0 +1,181 @@
+/** \file hoNDArray_blas.h
+    \brief BLAS level-1 functions on the hoNDArray class.
+
+    hoNDArray_blas.h provides BLAS level-1 functions on the hoNDArray class.
+    The hoNDArray is temporarily viewed as a column vector for the respective operations.
+    The implementation is based on Armadillo.
+    This code is purposely split into a header and an underlying implementation (.cpp),
+    as this allows explicit instantiation of the supported template types.
+    The supported types are float, double, std::complex<float>, std::complex<double>,
+    Gadgetron::complext<float>, and Gadgetron::complext<double>.
+    For complex types, asum, amin and amax have dedicated overloads that use the
+    element-wise l1-norm |real| + |imag| of each entry.
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoArmadillo.h"
+#include "complext.h"
+#include "cpucore_math_export.h"
+#include "GadgetronCommon.h"
+#include <complex>
+
+#ifdef USE_MKL
+#include "mkl.h"
+#endif // USE_MKL
+
+// Single-precision complex alias used by the MKL-based functions below.
+// Any macro of the same name is removed first so the typedef can be declared.
+#ifdef GT_Complex8
+#undef GT_Complex8
+#endif // GT_Complex8
+typedef std::complex<float> GT_Complex8;
+
+// Double-precision complex alias; handled the same way as GT_Complex8.
+#ifdef GT_Complex16
+#undef GT_Complex16
+#endif // GT_Complex16
+typedef std::complex<double> GT_Complex16;
+
+namespace Gadgetron{
+
+  /**
+   * @brief Calculates the dot product of two arrays (as vectors).
+   * @param[in] x Array 1. For complex arrays the complex conjugate of x is used when cc is true.
+   * @param[in] y Array 2. Must have the same number of elements as x.
+   * @param[in] cc Specifies whether to use the complex conjugate of x (when applicable). Defaults to true.
+   * @return The dot product of x and y
+   * @throws std::runtime_error if x or y is null, or if their element counts differ.
+   */
+  template<class T> EXPORTCPUCOREMATH T dot( hoNDArray<T> *x, hoNDArray<T> *y, bool cc = true );
+
+  /**
+   * @brief Calculates the sum of the l1-norms of the array entries
+   * @param[in] x Input array
+   * @return The l1-norm of the array
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH typename realType<T>::Type asum( hoNDArray<T> *x );
+
+  /**
+   * @brief Calculates the sum of the l1-norms (|real| + |imag|) of the std::complex array entries
+   * @param[in] x Input array
+   * @return The sum of |real| + |imag| over all entries
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH T asum( hoNDArray< std::complex<T> > *x );
+
+  /**
+   * @brief Calculates the sum of the l1-norms (|real| + |imag|) of the complext array entries
+   * @param[in] x Input array
+   * @return The sum of |real| + |imag| over all entries
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH T asum( hoNDArray< complext<T> > *x );
+
+  /**
+   * @brief Calculates the l2-norm of the array (as a vector)
+   * @param[in] x Input array
+   * @return The l2-norm of the array
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH typename realType<T>::Type nrm2( hoNDArray<T> *x );
+
+  /**
+   * @brief Calculates the l1-norm of the array (as a vector)
+   * @param[in] x Input array
+   * @return The l1-norm of the array
+   * @note When USE_MKL is defined, float and double specializations are declared further down.
+   */
+  template<class T> EXPORTCPUCOREMATH typename realType<T>::Type nrm1( hoNDArray<T> *x );
+
+  /**
+   * @brief Returns the index of the array element with the smallest absolute value (l1 norm)
+   * @param[in] x Input data
+   * @return The array index corresponding to the smallest element in the array (0-indexing)
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH size_t amin( hoNDArray<T> *x );
+ 
+  /**
+   * @brief Returns the index of the std::complex array element with the smallest l1 norm (|real| + |imag|)
+   * @param[in] x Input data
+   * @return The array index corresponding to the smallest element in the array (0-indexing)
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH size_t amin( hoNDArray< std::complex<T> > *x );
+
+  /**
+   * @brief Returns the index of the complext array element with the smallest l1 norm (|real| + |imag|)
+   * @param[in] x Input data
+   * @return The array index corresponding to the smallest element in the array (0-indexing)
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH size_t amin( hoNDArray< complext<T> > *x );
+
+  /**
+   * @brief Returns the index of the array element with the largest absolute value (l1-norm)
+   * @param[in] x Input data
+   * @return The array index corresponding to the largest element in the array (0-indexing)
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH size_t amax( hoNDArray<T> *x );
+
+  /**
+   * @brief Returns the index of the std::complex array element with the largest l1-norm (|real| + |imag|)
+   * @param[in] x Input data
+   * @return The array index corresponding to the largest element in the array (0-indexing)
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH size_t amax( hoNDArray< std::complex<T> > *x );
+
+  /**
+   * @brief Returns the index of the complext array element with the largest l1-norm (|real| + |imag|)
+   * @param[in] x Input data
+   * @return The array index corresponding to the largest element in the array (0-indexing)
+   * @throws std::runtime_error if x is null.
+   */
+  template<class T> EXPORTCPUCOREMATH size_t amax( hoNDArray< complext<T> > *x );
+
+  /**
+   * @brief Calculates y = a*x+y in which x and y are considered as vectors
+   * @param[in] a Scalar value
+   * @param[in] x Array
+   * @param[in,out] y Array; must have the same number of elements as x
+   * @throws std::runtime_error if x or y is null, or if their element counts differ.
+   */
+  template<class T> EXPORTCPUCOREMATH void axpy( T a, hoNDArray<T> *x, hoNDArray<T> *y );
+
+  /**
+   * In addition to the Armadillo-based functions above, the following functions call MKL routines directly.
+   */
+
+#ifdef USE_MKL
+
+  // MKL specializations of nrm1 for float and double arrays.
+  // Their definitions return 0 for a null pointer.
+  template<> EXPORTCPUCOREMATH float nrm1( hoNDArray<float> *x );
+  template<> EXPORTCPUCOREMATH double nrm1( hoNDArray<double> *x );
+
+  // BLAS dotc and dotu
+  // Both variants report an error and return zero when x and y differ in length.
+  // res = conj(x) dot y
+  EXPORTCPUCOREMATH GT_Complex8 dotc(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y);
+  EXPORTCPUCOREMATH GT_Complex16 dotc(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y);
+
+  // res = x dot y
+  EXPORTCPUCOREMATH GT_Complex8 dotu(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y);
+  EXPORTCPUCOREMATH GT_Complex16 dotu(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y);
+
+  // other variants for axpy
+  // r = a*x+y
+  // x and y must have the same number of elements; r receives the result.
+  // Returns true on success and false otherwise.
+  EXPORTCPUCOREMATH bool axpy(float a, const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+  EXPORTCPUCOREMATH bool axpy(double a, const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+  EXPORTCPUCOREMATH bool axpy(const GT_Complex8& a, const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+  EXPORTCPUCOREMATH bool axpy(const GT_Complex16& a, const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+
+  // vector-scalar product
+  // x = a*x; the array is scaled in place.
+  // Returns true on success and false if an exception was caught.
+  EXPORTCPUCOREMATH bool scal(float a, hoNDArray<float>& x);
+  EXPORTCPUCOREMATH bool scal(double a, hoNDArray<double>& x);
+  EXPORTCPUCOREMATH bool scal(float a, hoNDArray<GT_Complex8>& x);
+  EXPORTCPUCOREMATH bool scal(double a, hoNDArray<GT_Complex16>& x);
+  EXPORTCPUCOREMATH bool scal(GT_Complex8 a, hoNDArray<GT_Complex8>& x);
+  EXPORTCPUCOREMATH bool scal(GT_Complex16 a, hoNDArray<GT_Complex16>& x);
+
+  // Same operation on a raw buffer: x points to the first of N elements.
+  EXPORTCPUCOREMATH bool scal(float a, float*x, long long N);
+  EXPORTCPUCOREMATH bool scal(double a, double*x, long long N);
+  EXPORTCPUCOREMATH bool scal(float a, GT_Complex8*x, long long N);
+  EXPORTCPUCOREMATH bool scal(double a, GT_Complex16*x, long long N);
+  EXPORTCPUCOREMATH bool scal(GT_Complex8 a, GT_Complex8*x, long long N);
+  EXPORTCPUCOREMATH bool scal(GT_Complex16 a, GT_Complex16*x, long long N);
+
+  // sort the vector
+  // x: input array; r: receives the sorted values (r may be the same object as x)
+  // isascending: true for ascending and false for descending
+  EXPORTCPUCOREMATH bool sort(const hoNDArray<float>& x, hoNDArray<float>& r, bool isascending);
+  EXPORTCPUCOREMATH bool sort(const hoNDArray<double>& x, hoNDArray<double>& r, bool isascending);
+
+#endif // USE_MKL
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_elemwise.cpp b/toolboxes/core/cpu/arma_math/hoNDArray_elemwise.cpp
new file mode 100644
index 0000000..6084f76
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_elemwise.cpp
@@ -0,0 +1,4810 @@
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_operators.h"
+#include "hoNDArray_blas.h"
+#include "complext.h"
+#include "hoArmadillo.h"
+
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+    // Element-wise magnitude of *x, returned as a newly allocated array of the
+    // corresponding real type with the same dimensions as the input.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory, so that
+    // assigning to it fills the result - TODO confirm in hoArmadillo.h.
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::abs(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<RealT> > out(new hoNDArray<RealT>());
+        out->create(x->get_dimensions());
+
+        arma::Col<RealT> outCol = as_arma_col(out.get());
+        outCol = arma::abs(as_arma_col(x));
+        return out;
+    }
+
+    // Replaces every element of *x by its magnitude.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> void abs_inplace( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::abs_inplace(): Invalid input array");
+
+        arma::Col<RealT> values = as_arma_col(x);
+        values = arma::abs(values);
+    }
+
+    // Element-wise squared magnitude of *x, returned as a newly allocated array of
+    // the corresponding real type with the same dimensions as the input.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs_square( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::abs_square(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<RealT> > out(new hoNDArray<RealT>());
+        out->create(x->get_dimensions());
+
+        arma::Col<RealT> outCol = as_arma_col(out.get());
+        outCol = arma::square(abs(as_arma_col(x)));
+        return out;
+    }
+
+    // Element-wise square root of *x, returned as a newly allocated array with the
+    // dimensions of the input. Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<T> > sqrt( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::sqrt(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(x->get_dimensions());
+
+        arma::Col<StdT> outCol = as_arma_col(out.get());
+        outCol = arma::sqrt(as_arma_col(x));
+        return out;
+    }
+
+    // Replaces every element of *x by its square root.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> void sqrt_inplace( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::sqrt_inplace(): Invalid input array");
+
+        arma::Col<StdT> values = as_arma_col(x);
+        values = arma::sqrt(values);
+    }
+
+    // Element-wise square of *x, returned as a newly allocated array with the
+    // dimensions of the input. Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<T> > square( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::square(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(x->get_dimensions());
+
+        arma::Col<StdT> outCol = as_arma_col(out.get());
+        outCol = arma::square(as_arma_col(x));
+        return out;
+    }
+
+    // Replaces every element of *x by its square.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> void square_inplace( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::square_inplace(): Invalid input array");
+
+        arma::Col<StdT> values = as_arma_col(x);
+        values = arma::square(values);
+    }
+
+    // Element-wise reciprocal (1/x) of *x, returned as a newly allocated array with
+    // the dimensions of the input. Zero elements are not treated specially.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<T> > reciprocal( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::reciprocal(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(x->get_dimensions());
+
+        // numerator: a column of ones as long as the input
+        arma::Col<StdT> unit(x->get_number_of_elements());
+        unit.ones();
+
+        arma::Col<StdT> outCol = as_arma_col(out.get());
+        outCol = unit/as_arma_col(x);
+        return out;
+    }
+
+    // Replaces every element of *x by its reciprocal (1/x). Zero elements are not
+    // treated specially. Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> void reciprocal_inplace( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::reciprocal_inplace(): Invalid input array");
+
+        // numerator: a column of ones as long as the input
+        arma::Col<StdT> unit(x->get_number_of_elements());
+        unit.ones();
+
+        arma::Col<StdT> values = as_arma_col(x);
+        values = unit/values;
+    }
+
+    // Element-wise reciprocal square root (1/sqrt(x)) of *x, returned as a newly
+    // allocated array with the dimensions of the input.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<T> > reciprocal_sqrt( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::reciprocal_sqrt(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(x->get_dimensions());
+
+        // numerator: a column of ones as long as the input
+        arma::Col<StdT> unit(x->get_number_of_elements());
+        unit.ones();
+
+        arma::Col<StdT> outCol = as_arma_col(out.get());
+        outCol = unit/arma::sqrt(as_arma_col(x));
+        return out;
+    }
+
+    // Replaces every element of *x by its reciprocal square root (1/sqrt(x)).
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> void reciprocal_sqrt_inplace( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::reciprocal_sqrt_inplace(): Invalid input array");
+
+        arma::Col<StdT> values = as_arma_col(x);
+
+        // numerator: a column of ones as long as the input
+        arma::Col<StdT> unit(x->get_number_of_elements());
+        unit.ones();
+
+        values = unit/arma::sqrt(values);
+    }
+
+    // Applies the scalar sgn() overload to every element of *x and returns the
+    // results in a newly allocated array with the dimensions of the input.
+    // The loop is parallelised when USE_OMP is defined.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> boost::shared_ptr< hoNDArray<T> > sgn( hoNDArray<T> *x )
+    {
+        if( !x )
+            throw std::runtime_error("Gadgetron::sgn(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>() );
+        out->create(x->get_dimensions());
+
+        T *src = x->get_data_ptr();
+        T *dst = out->get_data_ptr();
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long k = 0; k < out->get_number_of_elements(); k++ ){
+            dst[k] = sgn(src[k]);
+        }
+        return out;
+    }
+
+    // Overwrites every element of *x with the result of the scalar sgn() overload.
+    // The loop is parallelised when USE_OMP is defined.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> void sgn_inplace( hoNDArray<T> *x )
+    {
+        if( !x )
+            throw std::runtime_error("Gadgetron::sgn_inplace(): Invalid input array");
+
+        T *data = x->get_data_ptr();
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long k = 0; k < x->get_number_of_elements(); k++ )
+        {
+            data[k] = sgn(data[k]);
+        }
+    }
+
+    // Extracts the real part of every element of *x into a newly allocated array
+    // of the corresponding real type with the dimensions of the input.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > real( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::real(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<RealT> > out(new hoNDArray<RealT>());
+        out->create(x->get_dimensions());
+
+        arma::Col<RealT> outCol = as_arma_col(out.get());
+        outCol = arma::real(as_arma_col(x));
+        return out;
+    }
+
+    // Extracts the imaginary part of every element of *x into a newly allocated
+    // array of the corresponding real type with the dimensions of the input.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<typename realType<T>::Type> > imag( hoNDArray<T> *x )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::imag(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<RealT> > out(new hoNDArray<RealT>());
+        out->create(x->get_dimensions());
+
+        arma::Col<RealT> outCol = as_arma_col(out.get());
+        outCol = arma::imag(as_arma_col(x));
+        return out;
+    }
+
+    // Element-wise complex conjugate of *x, returned as a newly allocated array
+    // with the dimensions of the input.
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<T> > conj( hoNDArray<T> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::conj(): Invalid input array");
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(x->get_dimensions());
+
+        arma::Col<StdT> outCol = as_arma_col(out.get());
+        outCol = arma::conj(as_arma_col(x));
+        return out;
+    }
+
+    // Builds a complex array of type T from the real array *x: the real parts are
+    // taken from x and the imaginary parts come from a zeroed column of equal length.
+    // The result is newly allocated with the dimensions of x.
+    // Raises runtime_error (via BOOST_THROW_EXCEPTION) when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> boost::shared_ptr< hoNDArray<T> > real_to_complex( hoNDArray<typename realType<T>::Type> *x )
+    {
+        typedef typename stdType<T>::Type StdT;
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            BOOST_THROW_EXCEPTION(runtime_error("Gadgetron::real_to_complex(): Invalid input array"));
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(x->get_dimensions());
+
+        arma::Col<StdT> outCol = as_arma_col(out.get());
+        outCol = arma::Col<StdT>(as_arma_col(x), arma::Col<RealT>(x->get_number_of_elements()).zeros());
+        return out;
+    }
+
+    // Combines separate real and imaginary arrays into a newly allocated complex
+    // array of type T that takes its dimensions from *real.
+    // Raises runtime_error (via BOOST_THROW_EXCEPTION) when either pointer is null
+    // or when the two inputs hold a different number of elements; only the element
+    // counts are compared, not the individual dimensions.
+    template<class T> boost::shared_ptr< hoNDArray<T> > real_imag_to_complex( hoNDArray<typename realType<T>::Type>* real, hoNDArray<typename realType<T>::Type>* imag )
+    {
+        if( !real || !imag )
+            BOOST_THROW_EXCEPTION(runtime_error("Gadgetron::real_imag_to_complex(): Invalid input array"));
+
+        if( real->get_number_of_elements() != imag->get_number_of_elements() )
+            BOOST_THROW_EXCEPTION(runtime_error("Gadgetron::real_imag_to_complex(): Invalid input array"));
+
+        boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+        out->create(real->get_dimensions());
+
+        T* dst = out->begin();
+        const size_t count = real->get_number_of_elements();
+
+        for ( size_t k=0; k<count; ++k )
+            dst[k] = T(real->at(k), imag->at(k));
+
+        return out;
+    }
+
+    // Combines the arrays real and imag element by element into the complex array cplx.
+    // Fails (returns false) when real and imag do not have equal dimensions.
+    // cplx is re-created with the dimensions of real whenever its current
+    // dimensions differ. Any exception is logged and reported as false.
+    template<class T> 
+    bool real_imag_to_complex(const hoNDArray<typename realType<T>::Type>& real, const hoNDArray<typename realType<T>::Type>& imag, hoNDArray<T>& cplx)
+    {
+        typedef typename realType<T>::Type RealT;
+
+        try
+        {
+            GADGET_CHECK_RETURN_FALSE(real.dimensions_equal(&imag));
+
+            if ( !cplx.dimensions_equal(&real) )
+                cplx.create(real.get_dimensions());
+
+            const RealT* re = real.begin();
+            const RealT* im = imag.begin();
+            T* dst = cplx.begin();
+
+            size_t count = real.get_number_of_elements();
+
+            long long k;
+            #pragma omp parallel for private(k) shared(count, dst, re, im)
+            for ( k=0; k<count; k++ )
+            {
+                dst[k] = T(re[k], im[k]);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in real_imag_to_complex(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Splits the complex array cplx into its real and imaginary parts.
+    // real and imag are each re-created with the dimensions of cplx whenever
+    // their current dimensions differ. Any exception is logged and reported by
+    // returning false; otherwise the function returns true.
+    template<class T> 
+    bool complex_to_real_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real, hoNDArray<typename realType<T>::Type>& imag)
+    {
+        typedef typename realType<T>::Type RealT;
+
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+                real.create(cplx.get_dimensions());
+
+            if ( !imag.dimensions_equal(&cplx) )
+                imag.create(cplx.get_dimensions());
+
+            const T* src = cplx.begin();
+            RealT* re = real.begin();
+            RealT* im = imag.begin();
+
+            size_t count = real.get_number_of_elements();
+
+            long long k;
+            #pragma omp parallel for default(none) private(k) shared(count, src, re, im)
+            for ( k=0; k<count; k++ )
+            {
+                re[k] = src[k].real();
+                im[k] = src[k].imag();
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in complex_to_real_imag(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Specialisation for real-valued float input: the input is copied into real
+    // and imag is filled with zeros. Both outputs are re-created with the
+    // dimensions of cplx whenever their current dimensions differ.
+    // Any exception is logged and reported by returning false.
+    template<> 
+    bool complex_to_real_imag(const hoNDArray<float>& cplx, hoNDArray<float>& real, hoNDArray<float>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+                real.create(cplx.get_dimensions());
+
+            if ( !imag.dimensions_equal(&cplx) )
+                imag.create(cplx.get_dimensions());
+
+            const float* src = cplx.begin();
+            float* re = real.begin();
+            float* im = imag.begin();
+
+            size_t count = real.get_number_of_elements();
+
+            long long k;
+            #pragma omp parallel for default(none) private(k) shared(count, src, re, im)
+            for ( k=0; k<count; k++ )
+            {
+                re[k] = src[k];
+                im[k] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in complex_to_real_imag(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Specialisation for real-valued double input: the input is copied into real
+    // and imag is filled with zeros. Both outputs are re-created with the
+    // dimensions of cplx whenever their current dimensions differ.
+    // Any exception is logged and reported by returning false.
+    template<> 
+    bool complex_to_real_imag(const hoNDArray<double>& cplx, hoNDArray<double>& real, hoNDArray<double>& imag)
+    {
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+                real.create(cplx.get_dimensions());
+
+            if ( !imag.dimensions_equal(&cplx) )
+                imag.create(cplx.get_dimensions());
+
+            const double* src = cplx.begin();
+            double* re = real.begin();
+            double* im = imag.begin();
+
+            size_t count = real.get_number_of_elements();
+
+            long long k;
+            #pragma omp parallel for default(none) private(k) shared(count, src, re, im)
+            for ( k=0; k<count; k++ )
+            {
+                re[k] = src[k];
+                im[k] = 0;
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in complex_to_real_imag(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Writes the real part of every element of cplx into real.
+    // real is re-created with the dimensions of cplx whenever its current
+    // dimensions differ. Any exception is logged and reported by returning false.
+    template<class T> 
+    bool complex_to_real(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real)
+    {
+        typedef typename realType<T>::Type RealT;
+
+        try
+        {
+            if ( !real.dimensions_equal(&cplx) )
+                real.create(cplx.get_dimensions());
+
+            const T* src = cplx.begin();
+            RealT* re = real.begin();
+
+            size_t count = real.get_number_of_elements();
+
+            long long k;
+            #pragma omp parallel for default(none) private(k) shared(count, src, re)
+            for ( k=0; k<count; k++ )
+            {
+                re[k] = src[k].real();
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in complex_to_real(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Writes the imaginary part of every element of cplx into imag.
+    // imag is re-created with the dimensions of cplx whenever its current
+    // dimensions differ. Any exception is logged and reported by returning false.
+    template<class T> 
+    bool complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& imag)
+    {
+        typedef typename realType<T>::Type RealT;
+
+        try
+        {
+            if ( !imag.dimensions_equal(&cplx) )
+                imag.create(cplx.get_dimensions());
+
+            const T* src = cplx.begin();
+            RealT* im = imag.begin();
+
+            size_t count = imag.get_number_of_elements();
+
+            long long k;
+            #pragma omp parallel for default(none) private(k) shared(count, src, im)
+            for ( k=0; k<(long long)count; k++ )
+            {
+                im[k] = src[k].imag();
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in complex_to_imag(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Sets every byte of the array's data buffer to zero (memset); an array
+    // without elements is left untouched.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> inline void clear( hoNDArray<T> *x )
+    {
+        if( !x )
+            throw std::runtime_error("Gadgetron::clear(): Invalid input array");
+
+        const size_t count = x->get_number_of_elements();
+        if ( count == 0 )
+            return;
+
+        memset( x->get_data_ptr(), 0, count*sizeof(T));
+    }
+
+    // Sets every byte of the array's data buffer to zero (memset); an array
+    // without elements is left untouched.
+    template<class T> inline void clear( hoNDArray<T>& x )
+    {
+        const size_t count = x.get_number_of_elements();
+        if ( count == 0 )
+            return;
+
+        memset( x.get_data_ptr(), 0, count*sizeof(T));
+    }
+
+    // Assigns val to every element of *x.
+    // val is reinterpreted in place as the stdType<T> counterpart of T before it
+    // is handed to Armadillo's fill().
+    // Throws std::runtime_error when x is a null pointer.
+    // Assumes as_arma_col() yields a column aliasing the array's memory - TODO confirm.
+    template<class T> void fill( hoNDArray<T> *x, T val )
+    {
+        typedef typename stdType<T>::Type StdT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::fill(): Invalid input array");
+
+        arma::Col<StdT> values = as_arma_col(x);
+        values.fill(*((StdT*)&val));
+    }
+
+    //
+    // TODO:
+    // The clamp functions could (probably) be implemented much like we use Thrust for the device versions
+    // - i.e. using Armadillo's transform on the array.
+    // However, this requires a newer version of Armadillo than current Linux distributions provide...
+    //
+
+    // Unary functor used by clamp(): values below min map to min_val, values
+    // greater than or equal to max map to max_val, everything else passes through.
+    template<typename T> struct hoNDA_clamp //: public thrust::unary_function<T,T>
+    {
+        hoNDA_clamp( T _min, T _max, T _min_val, T _max_val )
+            : min(_min), max(_max), min_val(_min_val), max_val(_max_val) {}
+
+        T operator()(const T &x) const 
+        {
+            return ( x < min ) ? min_val : ( ( x >= max ) ? max_val : x );
+        }
+
+        T min, max;          // thresholds
+        T min_val, max_val;  // replacement values
+    };
+
+    // hoNDA_clamp for std::complex values: only the real part is compared against
+    // the thresholds. Below min the result is min_val, at or above max it is
+    // max_val; otherwise the result keeps the real part and drops the imaginary part.
+    template<typename T> struct hoNDA_clamp< std::complex<T> > //: public thrust::unary_function< std::complex<T>, std::complex<T> >
+    {
+        hoNDA_clamp( T _min, T _max, std::complex<T> _min_val, std::complex<T> _max_val )
+            : min(_min), max(_max), min_val(_min_val), max_val(_max_val) {}
+
+        std::complex<T> operator()(const std::complex<T> &x) const 
+        {
+            const T re = real(x);
+            if( re < min ) return min_val;
+            if( re >= max ) return max_val;
+            return std::complex<T>(re);
+        }
+
+        T min, max;                        // thresholds on the real part
+        std::complex<T> min_val, max_val;  // replacement values
+    };
+
+    // hoNDA_clamp for complext values: only the real part is compared against the
+    // thresholds. Below min the result is min_val, at or above max it is max_val;
+    // otherwise the result is rebuilt from the real part alone.
+    template<typename T> struct hoNDA_clamp< complext<T> > //: public thrust::unary_function< complext<T>, complext<T> >
+    {
+        hoNDA_clamp( T _min, T _max, complext<T> _min_val, complext<T> _max_val )
+            : min(_min), max(_max), min_val(_min_val), max_val(_max_val) {}
+
+        complext<T> operator()(const complext<T> &x) const 
+        {
+            if( real(x) < min ) return min_val;
+            if( real(x) >= max ) return max_val;
+            return complext<T>(real(x));
+        }
+
+        T min, max;                    // thresholds on the real part
+        complext<T> min_val, max_val;  // replacement values
+    };
+
+    // Clamps *x in place with hoNDA_clamp: elements below min become min_val and
+    // elements at or above max become max_val.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> void clamp( hoNDArray<T> *x, 
+                                  typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val )
+    { 
+        if( !x )
+            throw std::runtime_error("Gadgetron::clamp(): Invalid input array");
+
+        std::transform(x->begin(), x->end(), x->begin(),
+                       hoNDA_clamp<T>(min, max, min_val, max_val));
+    }  
+
+    // Convenience overload: clamps *x in place using the thresholds themselves
+    // (converted to T) as the replacement values.
+    template<class T> void clamp( hoNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max )
+    {
+        T lowValue = T(min);
+        T highValue = T(max);
+        clamp(x, min, max, lowValue, highValue);
+    }
+
+    // Unary functor used by clamp_min(): values below min are replaced by min,
+    // everything else passes through unchanged.
+    template<typename T> struct hoNDA_clamp_min //: public thrust::unary_function<T,T>
+    {
+        hoNDA_clamp_min( T _min ) : min(_min) {}
+
+        T operator()(const T &x) const 
+        {
+            return ( x < min ) ? min : x;
+        }
+
+        T min;  // lower bound
+    };
+
+    // hoNDA_clamp_min for std::complex values: the real part is raised to min when
+    // it is smaller; the imaginary part is dropped in either case.
+    template<typename T> struct hoNDA_clamp_min< std::complex<T> > //: public thrust::unary_function< std::complex<T>, std::complex<T> >
+    {
+        hoNDA_clamp_min( T _min ) : min(_min) {}
+
+        std::complex<T> operator()(const std::complex<T> &x) const 
+        {
+            const T re = real(x);
+            return std::complex<T>( ( re < min ) ? min : re );
+        }
+
+        T min;  // lower bound on the real part
+    };
+
+    // hoNDA_clamp_min for complext values: the result is rebuilt from min when the
+    // real part is smaller than min, and from the real part alone otherwise.
+    template<typename T> struct hoNDA_clamp_min< complext<T> > //: public thrust::unary_function< complext<T>, complext<T> >
+    {
+        hoNDA_clamp_min( T _min ) : min(_min) {}
+
+        complext<T> operator()(const complext<T> &x) const 
+        {
+            if( real(x) < min )
+                return complext<T>(min);
+            return complext<T>(real(x));
+        }
+
+        T min;  // lower bound on the real part
+    };
+
+    // Applies hoNDA_clamp_min to every element of *x in place, raising values
+    // below min up to min.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> void clamp_min( hoNDArray<T> *x, typename realType<T>::Type min )
+    { 
+        if( !x )
+            throw std::runtime_error("Gadgetron::clamp_min(): Invalid input array");
+
+        std::transform(x->begin(), x->end(), x->begin(), hoNDA_clamp_min<T>(min));
+    }  
+
+    // Unary functor used by clamp_max(): values above max are replaced by max,
+    // everything else passes through unchanged.
+    template<typename T> struct hoNDA_clamp_max //: public thrust::unary_function<T,T>
+    {
+        hoNDA_clamp_max( T _max ) : max(_max) {}
+
+        T operator()(const T &x) const 
+        {
+            return ( x > max ) ? max : x;
+        }
+
+        T max;  // upper bound
+    };
+
+    // hoNDA_clamp_max for std::complex values: the real part is lowered to max when
+    // it is larger; the imaginary part is dropped in either case.
+    template<typename T> struct hoNDA_clamp_max< std::complex<T> > //: public thrust::unary_function< std::complex<T>, std::complex<T> >
+    {
+        hoNDA_clamp_max( T _max ) : max(_max) {}
+
+        std::complex<T> operator()(const std::complex<T> &x) const 
+        {
+            const T re = real(x);
+            return std::complex<T>( ( re > max ) ? max : re );
+        }
+
+        T max;  // upper bound on the real part
+    };
+
+    // hoNDA_clamp_max for complext values: the result is rebuilt from max when the
+    // real part is larger than max, and from the real part alone otherwise.
+    template<typename T> struct hoNDA_clamp_max< complext<T> > //: public thrust::unary_function< complext<T>, complext<T> >
+    {
+        hoNDA_clamp_max( T _max ) : max(_max) {}
+
+        complext<T> operator()(const complext<T> &x) const 
+        {
+            if( real(x) > max )
+                return complext<T>(max);
+            return complext<T>(real(x));
+        }
+
+        T max;  // upper bound on the real part
+    };
+
+    // Applies hoNDA_clamp_max to every element of *x in place, lowering values
+    // above max down to max.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> void clamp_max( hoNDArray<T> *x, typename realType<T>::Type max )
+    { 
+        if( !x )
+            throw std::runtime_error("Gadgetron::clamp_max(): Invalid input array");
+
+        std::transform(x->begin(), x->end(), x->begin(), hoNDA_clamp_max<T>(max));
+    }
+
+    // Rescales *x in place by val/abs(x[k]), where k is the index returned by amax(x).
+    // Presumably amax() locates the element of largest magnitude, so that this
+    // element ends up with magnitude val - TODO confirm against hoNDArray_blas.h.
+    // No guard is applied when the selected element has zero magnitude.
+    // Throws std::runtime_error when x is a null pointer.
+    template<class T> void normalize( hoNDArray<T> *x, typename realType<T>::Type val )
+    {
+        if( !x )
+            throw std::runtime_error("Gadgetron::normalize(): Invalid input array");
+
+        size_t peakIndex = amax(x);
+        T peak = x->get_data_ptr()[peakIndex];
+
+        typename realType<T>::Type factor = val/abs(peak);
+        *x *= factor;
+    }
+
+    // Soft-thresholding: each element v of *x is mapped to
+    //     v/|v| * max(|v| - gamma, 0)
+    // with elements of zero magnitude mapped to zero.
+    // Results are written to *out, or back into *x when out is null.
+    // The size of *out is not checked here. The loop is parallelised when
+    // USE_OMP is defined. Throws std::runtime_error when x is a null pointer.
+    template<class T> void shrink1( hoNDArray<T> *x, typename realType<T>::Type gamma, hoNDArray<T> *out )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::shrink1(): Invalid input array");
+
+        T *src = x->get_data_ptr();
+        T *dst = out ? out->get_data_ptr() : x->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long k = 0; k < x->get_number_of_elements(); k++ ) {
+            T cur = src[k];
+            RealT mag = abs(cur);
+            // unit-magnitude direction of the element, zero for a zero element
+            T dir = (mag <= RealT(0)) ? T(0) : cur/mag;
+            dst[k] = dir*std::max(mag-gamma, RealT(0));
+        } 
+    }
+
+    // Generalised shrinkage: each element v of *x is mapped to
+    //     v/|v| * max(|v| - gamma*|v|^(p-1), 0)
+    // with elements of zero magnitude mapped to zero.
+    // Results are written to *out, or back into *x when out is null.
+    // The size of *out is not checked here. The loop is parallelised when
+    // USE_OMP is defined. Throws std::runtime_error when x is a null pointer.
+    template<class T> void pshrink( hoNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !x )
+            throw std::runtime_error("Gadgetron::pshrink(): Invalid input array");
+
+        T *src = x->get_data_ptr();
+        T *dst = out ? out->get_data_ptr() : x->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long k = 0; k < x->get_number_of_elements(); k++ ) {
+            T cur = src[k];
+            RealT mag = abs(cur);
+            // unit-magnitude direction of the element, zero for a zero element
+            T dir = (mag <= RealT(0)) ? T(0) : cur/mag;
+            dst[k] = dir*std::max(mag-gamma*std::pow(mag,p-1), RealT(0));
+        }
+    }
+
+    // Shrinkage with externally supplied magnitudes: element i of *_x is scaled to
+    //     x/s * (s - gamma)   when s > gamma,
+    // and set to zero otherwise, where s is element i of *_s.
+    // Results are written to *out, or back into *_x when out is null.
+    // Neither the size of *_s nor that of *out is checked here. The loop is
+    // parallelised when USE_OMP is defined.
+    // Throws std::runtime_error when _x or _s is a null pointer.
+    template<class T> void shrinkd ( hoNDArray<T> *_x, hoNDArray<typename realType<T>::Type> *_s, typename realType<T>::Type gamma, hoNDArray<T> *out )
+    {
+        typedef typename realType<T>::Type RealT;
+
+        if( !_x || !_s )
+            throw std::runtime_error("Gadgetron::shrinkd(): Invalid input array");
+
+        T *src = _x->get_data_ptr();
+        RealT *mags = _s->get_data_ptr();
+        T *dst = out ? out->get_data_ptr() : _x->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long k = 0; k < _x->get_number_of_elements(); k++ ) {
+            T x = src[k];
+            RealT s = mags[k];
+            if ( !(s > gamma) )
+            	dst[k] = 0;
+            else
+            	dst[k] = x/s*(s-gamma);
+        } 
+    }
+
+    // Generalised shrinkage with externally supplied magnitudes: element i of *_x
+    // is mapped to
+    //     x/s * max(s - gamma*s^(p-1), 0)
+    // where s is element i of *_s.
+    // Results are written to *out, or back into *_x when out is null.
+    // Neither the size of *_s nor that of *out is checked here. The loop is
+    // parallelised when USE_OMP is defined.
+    // Throws std::runtime_error when _x or _s is a null pointer (the _s check
+    // mirrors shrinkd(), which dereferences _s in the same way).
+    template<class T> void pshrinkd( hoNDArray<T> *_x, hoNDArray<typename realType<T>::Type> *_s, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out )
+    {
+        if( _x == 0x0 || _s == 0x0 )
+            throw std::runtime_error("Gadgetron::pshrinkd(): Invalid input array");
+
+        T *outPtr = (out==0x0) ? _x->get_data_ptr() : out->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long i = 0; i < _x->get_number_of_elements(); i++ )
+        {
+            T x = _x->get_data_ptr()[i];
+            typename realType<T>::Type s = _s->get_data_ptr()[i];
+
+            // A vanishing magnitude would turn x/s*0 into NaN; map it to zero,
+            // the same rule pshrink() applies to zero-magnitude elements.
+            if ( s <= typename realType<T>::Type(0) )
+                outPtr[i] = T(0);
+            else
+                outPtr[i] = x/s*std::max(s-gamma*std::pow(s,p-1),typename realType<T>::Type(0));
+        }
+    }
+
+    #ifdef USE_MKL
+
+    // ----------------------------------------------------------------------------------------
+    // float
+    // ----------------------------------------------------------------------------------------
+
+    // r = x + y, element by element, computed with MKL's vsAdd.
+    // r is replaced by a copy of x first when its element count differs from x,
+    // which gives it the required size. The size agreement of x and y is only
+    // verified through the debug check macro.
+    // Returns false when a check fails or MKL reports a VML error status.
+    bool add(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t count = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != count )
+            r = x;
+
+        vsAdd(count, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // r = x - y, element by element, computed with MKL's vsSub.
+    // r is replaced by a copy of x first when its element count differs from x,
+    // which gives it the required size. The size agreement of x and y is only
+    // verified through the debug check macro.
+    // Returns false when a check fails or MKL reports a VML error status.
+    bool subtract(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t count = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != count )
+            r = x;
+
+        vsSub(count, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // r = x * y, element by element, computed with MKL's vsMul.
+    // r is replaced by a copy of x first when its element count differs from x,
+    // which gives it the required size. The size agreement of x and y is only
+    // verified through the debug check macro.
+    // Returns false when a check fails or MKL reports a VML error status.
+    bool multiply(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t count = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != count )
+            r = x;
+
+        vsMul(count, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // r = x / y, element by element, computed with MKL's vsDiv.
+    // r is replaced by a copy of x first when its element count differs from x,
+    // which gives it the required size. The size agreement of x and y is only
+    // verified through the debug check macro.
+    // Returns false when a check fails or MKL reports a VML error status.
+    bool divide(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t count = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != count )
+            r = x;
+
+        vsDiv(count, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // r = |x|, element by element, computed with MKL's vsAbs.
+    // r is replaced by a copy of x first when its element count differs from x,
+    // which gives it the required size.
+    // Returns false when MKL reports a VML error status.
+    bool absolute(const hoNDArray<float>& x, hoNDArray<float>& r)
+    {
+        const size_t count = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != count )
+            r = x;
+
+        vsAbs(count, x.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Phase ("argument") of a real-valued float array: r is sized like x (by
+    // copying x when the element counts differ) and then zero-filled.
+    // NOTE(review): every element, including negative ones, is reported with a
+    // phase of 0 - confirm this is the intended convention.
+    // Always returns true.
+    bool argument(const hoNDArray<float>& x, hoNDArray<float>& r)
+    {
+        const bool sizeMatches = ( r.get_number_of_elements()==x.get_number_of_elements() );
+        if ( !sizeMatches )
+            r = x;
+
+        memset(r.begin(), 0, r.get_number_of_bytes());
+
+        return true;
+    }
+
+    // r = sqrt(x), element by element, computed with MKL's vsSqrt.
+    // r is replaced by a copy of x first when its element count differs from x,
+    // which gives it the required size.
+    // Returns false when MKL reports a VML error status.
+    bool sqrt(const hoNDArray<float>& x, hoNDArray<float>& r)
+    {
+        const size_t count = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != count )
+            r = x;
+
+        vsSqrt(count, x.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Locates the element of x with the smallest magnitude using MKL's isamin:
+    // ind receives its zero-based position and r the (signed) value stored there.
+    // Returns false for an array without elements or when an exception is raised.
+    //
+    // isamin is called through the Fortran-style BLAS interface, which reports a
+    // one-based position (and 0 when there is nothing to search). The position is
+    // therefore shifted down by one before it is used as an array index.
+    bool minAbsolute(const hoNDArray<float>& x, float& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            MKL_INT incx = 1;
+
+            MKL_INT pos = isamin(&n, x.begin(), &incx);
+            if ( pos < 1 )
+            {
+                return false;
+            }
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Locates the element of x with the largest magnitude using BLAS isamax:
+    // ind receives its zero-based position and r the (signed) value stored there.
+    // Returns false for an array without elements or when an exception is raised.
+    //
+    // isamax is called through the Fortran-style BLAS interface, which reports a
+    // one-based position (and 0 when there is nothing to search). The position is
+    // therefore shifted down by one before it is used as an array index.
+    bool maxAbsolute(const hoNDArray<float>& x, float& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            MKL_INT incx = 1;
+
+            MKL_INT pos = isamax(&n, x.begin(), &incx);
+            if ( pos < 1 )
+            {
+                return false;
+            }
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Pushes near-zero elements of x away from zero: every element whose
+    // magnitude (GT_ABS) is below FLT_EPSILON has GT_SGN(element)*FLT_EPSILON
+    // added to it. Other elements are left unchanged.
+    // Returns false when an exception is raised, true otherwise.
+    bool addEpsilon(hoNDArray<float>& x)
+    {
+        try
+        {
+            float* data = x.begin();
+            size_t count = x.get_number_of_elements();
+
+            long long k;
+
+            #pragma omp parallel for default(none) private(k) shared(count, data)
+            for (k=0; k<(long long)count; k++ )
+            {
+                if ( !( GT_ABS(data[k]) < FLT_EPSILON ) )
+                {
+                    continue;
+                }
+
+                data[k] += GT_SGN(data[k])*FLT_EPSILON;
+            }
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Stores the result of BLAS snrm2 (Euclidean norm) over all elements of x in r.
+    // Returns false when an exception is raised, true otherwise.
+    bool norm2(const hoNDArray<float>& x, float& r)
+    {
+        try
+        {
+            MKL_INT count = x.get_number_of_elements();
+            MKL_INT stride = 1;
+            r = snrm2(&count, x.begin(), &stride);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Stores the result of BLAS sasum (sum of magnitudes) over all elements of x in r.
+    // Returns false when an exception is raised, true otherwise.
+    bool norm1(const hoNDArray<float>& x, float& r)
+    {
+        try
+        {
+            MKL_INT count = x.get_number_of_elements();
+            MKL_INT stride = 1;
+            r = sasum(&count, x.begin(), &stride);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // 2D convolution of x with the kernel ker using MKL's VSL convolution tasks.
+    //
+    // x is processed as a stack of RO x E1 planes (its first two dimensions); all
+    // remaining dimensions are folded into the number of planes. Each plane is
+    // convolved with ker and written to the matching plane of z. The output has
+    // the shape of the input plane, and the start index of the convolution is set
+    // to half the kernel extent along each dimension.
+    // z is replaced by a copy of x first when its dimensions differ from x.
+    //
+    // Returns false (after logging) when any VSL call fails or an exception is
+    // raised. The VSL task is released on every path once it has been created,
+    // and failures inside the parallel loop are counted per thread and summed, so
+    // no iteration's error can be overwritten by another thread.
+    // NOTE(review): one task object is executed concurrently from all OpenMP
+    // threads - confirm that MKL permits this for vslsConvExecX.
+    bool conv2(const hoNDArray<float>& x, const hoNDArray<float>& ker, hoNDArray<float>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+
+            // number of RO x E1 planes contained in x
+            size_t num = x.get_number_of_elements()/(RO*E1);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[2];
+            kerShape[0] = kerRO; kerShape[1] = kerE1;
+
+            MKL_INT xshape[2];
+            xshape[0] = RO; xshape[1] = E1;
+
+            MKL_INT start[2];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+
+            MKL_INT kerStride[2], xstride[2], zstride[2];
+            kerStride[0] = 1; kerStride[1] = kerRO;
+            xstride[0] = 1; xstride[1] = RO;
+            zstride[0] = 1; zstride[1] = RO;
+
+            const float* pX = x.begin();
+            const float* pKer = ker.begin();
+            float* pZ = z.begin();
+
+            if ( num == 1 )
+            {
+                status = vslsConvNewTask(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status != VSL_STATUS_OK )
+                {
+                    vslConvDeleteTask(&task);
+                }
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslsConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+
+                // release the task before the status is inspected so that a
+                // failed execution does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                status = vslsConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status != VSL_STATUS_OK )
+                {
+                    vslConvDeleteTask(&task);
+                }
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                long long n;
+                int numFailed = 0;
+
+                #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, xstride, pZ, zstride) reduction(+:numFailed)
+                for ( n=0; n<(long long)num; n++ )
+                {
+                    if ( vslsConvExecX(task, pX+n*RO*E1, xstride, pZ+n*RO*E1, zstride) != VSL_STATUS_OK )
+                    {
+                        numFailed++;
+                    }
+                }
+
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv2(const hoNDArray<float>& x, const hoNDArray<float>& ker, hoNDArray<float>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// 3D convolution of x with the kernel ker through the MKL VSL convolution API; the result goes to z.
+    /// The first three dimensions of x (RO, E1, E2) form one volume; x is processed as a stack of
+    /// x.get_number_of_elements()/(RO*E1*E2) such volumes sharing the same kernel.
+    /// z is assigned from x when its dimensions differ from those of x.
+    /// The output shape given to VSL equals the input shape and the output start offset is
+    /// (kerRO/2, kerE1/2, kerE2/2).
+    /// Returns false if RO*E1*E2 is zero, if any VSL call does not return VSL_STATUS_OK,
+    /// or if an exception is caught.
+    bool conv3(const hoNDArray<float>& x, const hoNDArray<float>& ker, hoNDArray<float>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            // an empty volume would make the computation of num divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1*E2>0);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+            size_t kerE2 = ker.get_size(2);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*E2);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[3];
+            kerShape[0] = kerRO; kerShape[1] = kerE1; kerShape[2] = kerE2;
+
+            MKL_INT xshape[3];
+            xshape[0] = RO; xshape[1] = E1; xshape[2] = E2;
+
+            MKL_INT start[3];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+            start[2] = kerE2/2;
+
+            MKL_INT kerStride[3], xstride[3], zstride[3];
+            kerStride[0] = 1; kerStride[1] = kerRO; kerStride[2] = kerRO*kerE1;
+            xstride[0] = 1; xstride[1] = RO; xstride[2] = RO*E1;
+            zstride[0] = 1; zstride[1] = RO; zstride[2] = RO*E1;
+
+            const float* pX = x.begin();
+            const float* pKer = ker.begin();
+            float* pZ = z.begin();
+
+            if ( num == 1 )
+            {
+                status = vslsConvNewTask(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status == VSL_STATUS_OK )
+                {
+                    status = vslsConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+                }
+
+                // release the task before checking the status, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                status = vslsConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+
+                long long n;
+                int numFailed = 0;
+
+                if ( status == VSL_STATUS_OK )
+                {
+                    // every iteration tests its own return code and the failures are summed by the
+                    // reduction; storing all results in one shared variable would be a data race
+                    // and could hide a failed volume
+                    // NOTE(review): one task object is executed by all threads at once - confirm
+                    // with the MKL documentation that this is supported
+                    #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, E2, xstride, pZ, zstride) reduction(+:numFailed)
+                    for ( n=0; n<(long long)num; n++ )
+                    {
+                        if ( vslsConvExecX(task, pX+n*RO*E1*E2, xstride, pZ+n*RO*E1*E2, zstride) != VSL_STATUS_OK )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                // release the task before checking the results, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv3(const hoNDArray<float>& x, const hoNDArray<float>& ker, hoNDArray<float>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Element-wise inverse of a float array, computed by the MKL VML routine vsInv into r.
+    /// r is assigned from x beforehand when its dimensions differ from those of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call or if an exception is caught.
+    bool inv(const hoNDArray<float>& x, hoNDArray<float>& r)
+    {
+        try
+        {
+            const bool sameShape = r.dimensions_equal(&x);
+            if ( !sameShape )
+            {
+                r = x;
+            }
+
+            long long numOfElem = x.get_number_of_elements();
+
+            vsInv(numOfElem, x.begin(), r.begin());
+            GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in inv(const hoNDArray<float>& x, hoNDArray<float>& r) ... ");
+        }
+
+        return false;
+    }
+
+    // ----------------------------------------------------------------------------------------
+    // double
+    // ----------------------------------------------------------------------------------------
+
+    /// Element-wise sum of two double arrays, computed by the MKL VML routine vdAdd into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool add(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        vdAdd(numOfElem, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise difference x - y of two double arrays, computed by the MKL VML routine vdSub into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool subtract(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        vdSub(numOfElem, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise product of two double arrays, computed by the MKL VML routine vdMul into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool multiply(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        vdMul(numOfElem, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise quotient x / y of two double arrays, computed by the MKL VML routine vdDiv into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool divide(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        vdDiv(numOfElem, x.begin(), y.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise absolute value of a double array, computed by the MKL VML routine vdAbs into r.
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool absolute(const hoNDArray<double>& x, hoNDArray<double>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        vdAbs(numOfElem, x.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Argument of a real-valued double array: r gets the size of x (by assignment from x when the
+    /// element counts differ) and all its bytes are then set to zero. Always returns true.
+    /// NOTE(review): negative inputs also yield 0 here rather than pi - confirm this is intended.
+    bool argument(const hoNDArray<double>& x, hoNDArray<double>& r)
+    {
+        const bool sameCount = ( r.get_number_of_elements() == x.get_number_of_elements() );
+        if ( !sameCount )
+        {
+            r = x;
+        }
+
+        double* pR = r.begin();
+        memset(pR, 0, r.get_number_of_bytes());
+
+        return true;
+    }
+
+    /// Element-wise square root of a double array, computed by the MKL VML routine vdSqrt into r.
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool sqrt(const hoNDArray<double>& x, hoNDArray<double>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        vdSqrt(numOfElem, x.begin(), r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Find the element of x with the smallest absolute value.
+    /// On success r holds that element and ind its zero-based offset in x.
+    /// Returns false when idamin reports no position (e.g. an empty array) or an exception is caught.
+    bool minAbsolute(const hoNDArray<double>& x, double& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            MKL_INT incx = 1;
+
+            // idamin is called through the Fortran-style BLAS interface (arguments by pointer),
+            // which reports a one-based position; it has to be shifted to the zero-based
+            // offset used by at()
+            MKL_INT pos = idamin(&n, x.begin(), &incx);
+            if ( pos < 1 )
+            {
+                return false;
+            }
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Find the element of x with the largest absolute value.
+    /// On success r holds that element and ind its zero-based offset in x.
+    /// Returns false when idamax reports no position (e.g. an empty array) or an exception is caught.
+    bool maxAbsolute(const hoNDArray<double>& x, double& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            MKL_INT incx = 1;
+
+            // idamax is called through the Fortran-style BLAS interface (arguments by pointer),
+            // which reports a one-based position; it has to be shifted to the zero-based
+            // offset used by at()
+            MKL_INT pos = idamax(&n, x.begin(), &incx);
+            if ( pos < 1 )
+            {
+                return false;
+            }
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    /// In-place adjustment of small values: every element of x for which GT_ABS(element) is below
+    /// DBL_EPSILON is incremented by GT_SGN(element)*DBL_EPSILON. The loop is an OpenMP parallel for.
+    /// Returns false only when an exception is caught.
+    bool addEpsilon(hoNDArray<double>& x)
+    {
+        try
+        {
+            size_t numOfElem = x.get_number_of_elements();
+            double* pData = x.begin();
+
+            long long k;
+
+            #pragma omp parallel for default(none) private(k) shared(numOfElem, pData)
+            for (k=0; k<(long long)numOfElem; k++ )
+            {
+                // elements that are not below the threshold stay untouched
+                if ( !(GT_ABS(pData[k]) < DBL_EPSILON) )
+                {
+                    continue;
+                }
+
+                pData[k] += GT_SGN(pData[k])*DBL_EPSILON;
+            }
+
+            return true;
+        }
+        catch(...)
+        {
+        }
+
+        return false;
+    }
+
+    /// L2 norm of a double array: the value returned by the BLAS routine dnrm2 over all elements
+    /// of x (unit stride) is stored in r. Returns false only when an exception is caught.
+    bool norm2(const hoNDArray<double>& x, double& r)
+    {
+        try
+        {
+            MKL_INT numOfElem = x.get_number_of_elements();
+            MKL_INT stride = 1;
+
+            r = dnrm2(&numOfElem, x.begin(), &stride);
+            return true;
+        }
+        catch(...)
+        {
+        }
+
+        return false;
+    }
+
+    /// L1 norm of a double array: the value returned by the BLAS routine dasum over all elements
+    /// of x (unit stride) is stored in r. Returns false only when an exception is caught.
+    bool norm1(const hoNDArray<double>& x, double& r)
+    {
+        try
+        {
+            MKL_INT numOfElem = x.get_number_of_elements();
+            MKL_INT stride = 1;
+
+            r = dasum(&numOfElem, x.begin(), &stride);
+            return true;
+        }
+        catch(...)
+        {
+        }
+
+        return false;
+    }
+
+    /// 2D convolution of x with the kernel ker through the MKL VSL convolution API; the result goes to z.
+    /// The first two dimensions of x (RO, E1) form one 2D slice; x is processed as a stack of
+    /// x.get_number_of_elements()/(RO*E1) such slices sharing the same kernel.
+    /// z is assigned from x when its dimensions differ from those of x.
+    /// The output shape given to VSL equals the input shape and the output start offset is (kerRO/2, kerE1/2).
+    /// Returns false if RO*E1 is zero, if any VSL call does not return VSL_STATUS_OK,
+    /// or if an exception is caught.
+    bool conv2(const hoNDArray<double>& x, const hoNDArray<double>& ker, hoNDArray<double>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+
+            // an empty slice would make the computation of num divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1>0);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+
+            size_t num = x.get_number_of_elements()/(RO*E1);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[2];
+            kerShape[0] = kerRO; kerShape[1] = kerE1;
+
+            MKL_INT xshape[2];
+            xshape[0] = RO; xshape[1] = E1;
+
+            MKL_INT start[2];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+
+            MKL_INT kerStride[2], xstride[2], zstride[2];
+            kerStride[0] = 1; kerStride[1] = kerRO;
+            xstride[0] = 1; xstride[1] = RO;
+            zstride[0] = 1; zstride[1] = RO;
+
+            const double* pX = x.begin();
+            const double* pKer = ker.begin();
+            double* pZ = z.begin();
+
+            if ( num == 1 )
+            {
+                status = vsldConvNewTask(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status == VSL_STATUS_OK )
+                {
+                    status = vsldConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+                }
+
+                // release the task before checking the status, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                status = vsldConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+
+                long long n;
+                int numFailed = 0;
+
+                if ( status == VSL_STATUS_OK )
+                {
+                    // every iteration tests its own return code and the failures are summed by the
+                    // reduction; storing all results in one shared variable would be a data race
+                    // and could hide a failed slice
+                    // NOTE(review): one task object is executed by all threads at once - confirm
+                    // with the MKL documentation that this is supported
+                    #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, xstride, pZ, zstride) reduction(+:numFailed)
+                    for ( n=0; n<(long long)num; n++ )
+                    {
+                        if ( vsldConvExecX(task, pX+n*RO*E1, xstride, pZ+n*RO*E1, zstride) != VSL_STATUS_OK )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                // release the task before checking the results, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv2(const hoNDArray<double>& x, const hoNDArray<double>& ker, hoNDArray<double>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// 3D convolution of x with the kernel ker through the MKL VSL convolution API; the result goes to z.
+    /// The first three dimensions of x (RO, E1, E2) form one volume; x is processed as a stack of
+    /// x.get_number_of_elements()/(RO*E1*E2) such volumes sharing the same kernel.
+    /// z is assigned from x when its dimensions differ from those of x.
+    /// The output shape given to VSL equals the input shape and the output start offset is
+    /// (kerRO/2, kerE1/2, kerE2/2).
+    /// Returns false if RO*E1*E2 is zero, if any VSL call does not return VSL_STATUS_OK,
+    /// or if an exception is caught.
+    bool conv3(const hoNDArray<double>& x, const hoNDArray<double>& ker, hoNDArray<double>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            // an empty volume would make the computation of num divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1*E2>0);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+            size_t kerE2 = ker.get_size(2);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*E2);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[3];
+            kerShape[0] = kerRO; kerShape[1] = kerE1; kerShape[2] = kerE2;
+
+            MKL_INT xshape[3];
+            xshape[0] = RO; xshape[1] = E1; xshape[2] = E2;
+
+            MKL_INT start[3];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+            start[2] = kerE2/2;
+
+            MKL_INT kerStride[3], xstride[3], zstride[3];
+            kerStride[0] = 1; kerStride[1] = kerRO; kerStride[2] = kerRO*kerE1;
+            xstride[0] = 1; xstride[1] = RO; xstride[2] = RO*E1;
+            zstride[0] = 1; zstride[1] = RO; zstride[2] = RO*E1;
+
+            const double* pX = x.begin();
+            const double* pKer = ker.begin();
+            double* pZ = z.begin();
+
+            if ( num == 1 )
+            {
+                status = vsldConvNewTask(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status == VSL_STATUS_OK )
+                {
+                    status = vsldConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+                }
+
+                // release the task before checking the status, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                status = vsldConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+
+                long long n;
+                int numFailed = 0;
+
+                if ( status == VSL_STATUS_OK )
+                {
+                    // every iteration tests its own return code and the failures are summed by the
+                    // reduction; storing all results in one shared variable would be a data race
+                    // and could hide a failed volume
+                    // NOTE(review): one task object is executed by all threads at once - confirm
+                    // with the MKL documentation that this is supported
+                    #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, E2, xstride, pZ, zstride) reduction(+:numFailed)
+                    for ( n=0; n<(long long)num; n++ )
+                    {
+                        if ( vsldConvExecX(task, pX+n*RO*E1*E2, xstride, pZ+n*RO*E1*E2, zstride) != VSL_STATUS_OK )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                // release the task before checking the results, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv3(const hoNDArray<double>& x, const hoNDArray<double>& ker, hoNDArray<double>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Element-wise inverse of a double array, computed by the MKL VML routine vdInv into r.
+    /// r is assigned from x beforehand when its dimensions differ from those of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call or if an exception is caught.
+    bool inv(const hoNDArray<double>& x, hoNDArray<double>& r)
+    {
+        try
+        {
+            const bool sameShape = r.dimensions_equal(&x);
+            if ( !sameShape )
+            {
+                r = x;
+            }
+
+            long long numOfElem = x.get_number_of_elements();
+
+            vdInv(numOfElem, x.begin(), r.begin());
+            GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in inv(const hoNDArray<double>& x, hoNDArray<double>& r) ... ");
+        }
+
+        return false;
+    }
+
+    // ----------------------------------------------------------------------------------------
+    // GT_Complex8
+    // ----------------------------------------------------------------------------------------
+
+    /// Element-wise sum of two single precision complex arrays, computed by the MKL VML routine vcAdd into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool add(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        const MKL_Complex8* pY = reinterpret_cast<const MKL_Complex8*>(y.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcAdd(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise difference x - y of two single precision complex arrays, computed by the MKL VML
+    /// routine vcSub into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool subtract(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        const MKL_Complex8* pY = reinterpret_cast<const MKL_Complex8*>(y.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcSub(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise product of two single precision complex arrays, computed by the MKL VML
+    /// routine vcMul into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool multiply(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        const MKL_Complex8* pY = reinterpret_cast<const MKL_Complex8*>(y.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcMul(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise quotient x / y of two single precision complex arrays, computed by the MKL VML
+    /// routine vcDiv into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool divide(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        const MKL_Complex8* pY = reinterpret_cast<const MKL_Complex8*>(y.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcDiv(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Magnitude of a single precision complex array, computed by the MKL VML routine vcAbs
+    /// into the float array r.
+    /// r is (re)created with the dimensions of x when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool absolute(const hoNDArray<GT_Complex8>& x, hoNDArray<float>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+
+        vcAbs(numOfElem, pX, r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Magnitude of a single precision complex array, delivered in a complex array.
+    /// The magnitudes are computed by vcAbs into a temporary float array with the dimensions of x
+    /// and then transferred to r through copyFrom(). r is (re)created with the dimensions of x
+    /// when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after vcAbs.
+    /// NOTE(review): the outcome of copyFrom() is not checked - confirm whether it can fail.
+    bool absolute(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        hoNDArray<float> magnitude;
+        magnitude.create(x.get_dimensions());
+
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+
+        vcAbs(numOfElem, pX, magnitude.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        r.copyFrom(magnitude);
+
+        return true;
+    }
+
+    /// Element-wise square root of a single precision complex array, computed by the MKL VML
+    /// routine vcSqrt into r.
+    /// r is (re)created with the dimensions of x when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool sqrt(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcSqrt(numOfElem, pX, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Find the element of x selected by the BLAS routine icamin (minimum absolute value search).
+    /// On success r holds that element and ind its zero-based offset in x.
+    /// Returns false when icamin reports no position (e.g. an empty array) or an exception is caught.
+    bool minAbsolute(const hoNDArray<GT_Complex8>& x, GT_Complex8& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            MKL_INT incx = 1;
+
+            // icamin is called through the Fortran-style BLAS interface (arguments by pointer),
+            // which reports a one-based position; it has to be shifted to the zero-based
+            // offset used by at()
+            MKL_INT pos = icamin(&n, reinterpret_cast<const MKL_Complex8*>(x.begin()), &incx);
+            if ( pos < 1 )
+            {
+                return false;
+            }
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Find the element of x selected by the BLAS routine icamax (maximum absolute value search).
+    /// On success r holds that element and ind its zero-based offset in x.
+    /// Returns false when icamax reports no position (e.g. an empty array) or an exception is caught.
+    bool maxAbsolute(const hoNDArray<GT_Complex8>& x, GT_Complex8& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            MKL_INT incx = 1;
+
+            // icamax is called through the Fortran-style BLAS interface (arguments by pointer),
+            // which reports a one-based position; it has to be shifted to the zero-based
+            // offset used by at()
+            MKL_INT pos = icamax(&n, reinterpret_cast<const MKL_Complex8*>(x.begin()), &incx);
+            if ( pos < 1 )
+            {
+                return false;
+            }
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Element-wise product of x with the complex conjugate of y, computed by the MKL VML routine
+    /// vcMulByConj into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool multiplyConj(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        const MKL_Complex8* pY = reinterpret_cast<const MKL_Complex8*>(y.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcMulByConj(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Argument (phase) of a single precision complex array, computed by the MKL VML routine vcArg
+    /// into the float array r.
+    /// r is (re)created with the dimensions of x when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool argument(const hoNDArray<GT_Complex8>& x, hoNDArray<float>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+
+        vcArg(numOfElem, pX, r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise complex conjugate of a single precision complex array, computed by the MKL VML
+    /// routine vcConj into r.
+    /// r is (re)created with the dimensions of x when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool conjugate(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r)
+    {
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+        MKL_Complex8* pR = reinterpret_cast<MKL_Complex8*>(r.begin());
+
+        vcConj(numOfElem, pX, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// In-place adjustment of small values: FLT_EPSILON is added to every element of x whose
+    /// magnitude (std::abs) is below FLT_EPSILON. The loop is an OpenMP parallel for.
+    /// Returns false only when an exception is caught.
+    bool addEpsilon(hoNDArray<GT_Complex8>& x)
+    {
+        try
+        {
+            size_t numOfElem = x.get_number_of_elements();
+            GT_Complex8* pData = x.begin();
+
+            long long k;
+
+            #pragma omp parallel for default(none) private(k) shared(numOfElem, pData)
+            for (k=0; k<(long long)numOfElem; k++ )
+            {
+                // elements that are not below the threshold stay untouched
+                if ( !(std::abs(pData[k]) < FLT_EPSILON) )
+                {
+                    continue;
+                }
+
+                pData[k] += FLT_EPSILON;
+            }
+
+            return true;
+        }
+        catch(...)
+        {
+        }
+
+        return false;
+    }
+
+    /// L2 norm of a single precision complex array: the value returned by the BLAS routine scnrm2
+    /// over all elements of x (unit stride) is stored in r.
+    /// Returns false only when an exception is caught.
+    bool norm2(const hoNDArray<GT_Complex8>& x, float& r)
+    {
+        try
+        {
+            MKL_INT numOfElem = x.get_number_of_elements();
+            MKL_INT stride = 1;
+
+            const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+            r = scnrm2(&numOfElem, pX, &stride);
+            return true;
+        }
+        catch(...)
+        {
+        }
+
+        return false;
+    }
+
+    /// L1 norm of a single precision complex array, obtained in two steps: the magnitudes of x
+    /// are computed into a temporary float array with absolute(), then norm1() of that array
+    /// is stored in r.
+    /// Returns false if either step fails or if an exception is caught.
+    bool norm1(const hoNDArray<GT_Complex8>& x, float& r)
+    {
+        try
+        {
+            // temporary holding the magnitude of every element of x
+            hoNDArray<float> a;
+            GADGET_CHECK_RETURN_FALSE(absolute(x, a));
+            GADGET_CHECK_RETURN_FALSE(norm1(a, r));
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Dot product of two single precision complex arrays through the BLAS routine cdotc, which
+    /// writes its result to r (passed as the first argument). Both arrays are read with unit stride.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE.
+    /// Returns false only when that check fails or an exception is caught.
+    bool dotc(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, GT_Complex8& r)
+    {
+        try
+        {
+            GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+            MKL_INT numOfElem = x.get_number_of_elements();
+            MKL_INT strideX = 1;
+            MKL_INT strideY = 1;
+
+            const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+            const MKL_Complex8* pY = reinterpret_cast<const MKL_Complex8*>(y.begin());
+            MKL_Complex8* pResult = reinterpret_cast<MKL_Complex8*>(&r);
+
+            cdotc(pResult, &numOfElem, pX, &strideX, pY, &strideY);
+            return true;
+        }
+        catch(...)
+        {
+        }
+
+        return false;
+    }
+
+    /// 2D convolution of x with the kernel ker through the MKL VSL convolution API; the result goes to z.
+    /// The first two dimensions of x (RO, E1) form one 2D slice; x is processed as a stack of
+    /// x.get_number_of_elements()/(RO*E1) such slices sharing the same kernel.
+    /// z is assigned from x when its dimensions differ from those of x.
+    /// The output shape given to VSL equals the input shape and the output start offset is (kerRO/2, kerE1/2).
+    /// Returns false if RO*E1 is zero, if any VSL call does not return VSL_STATUS_OK,
+    /// or if an exception is caught.
+    bool conv2(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& ker, hoNDArray<GT_Complex8>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+
+            // an empty slice would make the computation of num divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1>0);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+
+            size_t num = x.get_number_of_elements()/(RO*E1);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[2];
+            kerShape[0] = kerRO; kerShape[1] = kerE1;
+
+            MKL_INT xshape[2];
+            xshape[0] = RO; xshape[1] = E1;
+
+            MKL_INT start[2];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+
+            MKL_INT kerStride[2], xstride[2], zstride[2];
+            kerStride[0] = 1; kerStride[1] = kerRO;
+            xstride[0] = 1; xstride[1] = RO;
+            zstride[0] = 1; zstride[1] = RO;
+
+            const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+            const MKL_Complex8* pKer = reinterpret_cast<const MKL_Complex8*>(ker.begin());
+            MKL_Complex8* pZ = reinterpret_cast<MKL_Complex8*>(z.begin());
+
+            if ( num == 1 )
+            {
+                status = vslcConvNewTask(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status == VSL_STATUS_OK )
+                {
+                    status = vslcConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+                }
+
+                // release the task before checking the status, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                status = vslcConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+
+                long long n;
+                int numFailed = 0;
+
+                if ( status == VSL_STATUS_OK )
+                {
+                    // every iteration tests its own return code and the failures are summed by the
+                    // reduction; storing all results in one shared variable would be a data race
+                    // and could hide a failed slice
+                    // NOTE(review): one task object is executed by all threads at once - confirm
+                    // with the MKL documentation that this is supported
+                    #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, xstride, pZ, zstride) reduction(+:numFailed)
+                    for ( n=0; n<(long long)num; n++ )
+                    {
+                        if ( vslcConvExecX(task, pX+n*RO*E1, xstride, pZ+n*RO*E1, zstride) != VSL_STATUS_OK )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                // release the task before checking the results, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv2(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& ker, hoNDArray<GT_Complex8>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// 3D convolution of x with the kernel ker through the MKL VSL convolution API; the result goes to z.
+    /// The first three dimensions of x (RO, E1, E2) form one volume; x is processed as a stack of
+    /// x.get_number_of_elements()/(RO*E1*E2) such volumes sharing the same kernel.
+    /// z is assigned from x when its dimensions differ from those of x.
+    /// The output shape given to VSL equals the input shape and the output start offset is
+    /// (kerRO/2, kerE1/2, kerE2/2).
+    /// Returns false if RO*E1*E2 is zero, if any VSL call does not return VSL_STATUS_OK,
+    /// or if an exception is caught.
+    bool conv3(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& ker, hoNDArray<GT_Complex8>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            // an empty volume would make the computation of num divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1*E2>0);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+            size_t kerE2 = ker.get_size(2);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*E2);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[3];
+            kerShape[0] = kerRO; kerShape[1] = kerE1; kerShape[2] = kerE2;
+
+            MKL_INT xshape[3];
+            xshape[0] = RO; xshape[1] = E1; xshape[2] = E2;
+
+            MKL_INT start[3];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+            start[2] = kerE2/2;
+
+            MKL_INT kerStride[3], xstride[3], zstride[3];
+            kerStride[0] = 1; kerStride[1] = kerRO; kerStride[2] = kerRO*kerE1;
+            xstride[0] = 1; xstride[1] = RO; xstride[2] = RO*E1;
+            zstride[0] = 1; zstride[1] = RO; zstride[2] = RO*E1;
+
+            const MKL_Complex8* pX = reinterpret_cast<const MKL_Complex8*>(x.begin());
+            const MKL_Complex8* pKer = reinterpret_cast<const MKL_Complex8*>(ker.begin());
+            MKL_Complex8* pZ = reinterpret_cast<MKL_Complex8*>(z.begin());
+
+            if ( num == 1 )
+            {
+                status = vslcConvNewTask(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status == VSL_STATUS_OK )
+                {
+                    status = vslcConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+                }
+
+                // release the task before checking the status, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                status = vslcConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+
+                long long n;
+                int numFailed = 0;
+
+                if ( status == VSL_STATUS_OK )
+                {
+                    // every iteration tests its own return code and the failures are summed by the
+                    // reduction; storing all results in one shared variable would be a data race
+                    // and could hide a failed volume
+                    // NOTE(review): one task object is executed by all threads at once - confirm
+                    // with the MKL documentation that this is supported
+                    #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, E2, xstride, pZ, zstride) reduction(+:numFailed)
+                    for ( n=0; n<(long long)num; n++ )
+                    {
+                        if ( vslcConvExecX(task, pX+n*RO*E1*E2, xstride, pZ+n*RO*E1*E2, zstride) != VSL_STATUS_OK )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                // release the task before checking the results, so a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv3(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& ker, hoNDArray<GT_Complex8>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// Element-wise inverse of a single precision complex array: every element of r is set to
+    /// GT_Complex8(1.0) divided by the matching element of x, in an OpenMP parallel loop.
+    /// r is assigned from x beforehand when its dimensions differ from those of x.
+    /// Returns false only when an exception is caught.
+    bool inv(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r)
+    {
+        try
+        {
+            const bool sameShape = r.dimensions_equal(&x);
+            if ( !sameShape )
+            {
+                r = x;
+            }
+
+            const GT_Complex8* pIn = x.begin();
+            GT_Complex8* pOut = r.begin();
+
+            GT_Complex8 one(1.0);
+            long long numOfElem = x.get_number_of_elements();
+            long long k;
+
+            #pragma omp parallel for default(none) private(k) shared(numOfElem, pIn, pOut, one)
+            for ( k=0; k<numOfElem; k++ )
+            {
+                pOut[k] = one/pIn[k];
+            }
+
+            return true;
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in inv(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r) ... ");
+        }
+
+        return false;
+    }
+
+    // ----------------------------------------------------------------------------------------
+    // GT_Complex16
+    // ----------------------------------------------------------------------------------------
+
+    /// Element-wise sum of two double precision complex arrays, computed by the MKL VML routine vzAdd into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool add(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex16* pX = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        const MKL_Complex16* pY = reinterpret_cast<const MKL_Complex16*>(y.begin());
+        MKL_Complex16* pR = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzAdd(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    /// Element-wise difference x - y of two double precision complex arrays, computed by the MKL VML
+    /// routine vzSub into r.
+    /// The element counts of x and y are compared through GADGET_DEBUG_CHECK_RETURN_FALSE;
+    /// r is assigned from x beforehand when its element count differs from that of x.
+    /// Returns false if vmlGetErrStatus() is non-zero after the call.
+    bool subtract(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const size_t numOfElem = x.get_number_of_elements();
+        if ( r.get_number_of_elements() != numOfElem )
+        {
+            r = x;
+        }
+
+        // the data pointers are taken after r has possibly been reassigned
+        const MKL_Complex16* pX = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        const MKL_Complex16* pY = reinterpret_cast<const MKL_Complex16*>(y.begin());
+        MKL_Complex16* pR = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzSub(numOfElem, pX, pY, pR);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Element-wise product r = x .* y of double-precision complex arrays (MKL vzMul).
+    // x and y are expected to have equal element counts (GADGET_DEBUG_CHECK_RETURN_FALSE).
+    // r is assigned from x beforehand when its element count differs from that of x.
+    // Returns false when the VML error status is non-zero after the call.
+    bool multiply(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        const bool needsResize = ( r.get_number_of_elements()!=x.get_number_of_elements() );
+        if ( needsResize )
+        {
+            r = x;
+        }
+
+        const MKL_Complex16* factorA = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        const MKL_Complex16* factorB = reinterpret_cast<const MKL_Complex16*>(y.begin());
+        MKL_Complex16* product = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzMul(x.get_number_of_elements(), factorA, factorB, product);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Element-wise quotient r = x ./ y of double-precision complex arrays (MKL vzDiv).
+    // x and y are expected to have equal element counts (GADGET_DEBUG_CHECK_RETURN_FALSE).
+    // r is assigned from x beforehand when its element count differs from that of x.
+    // Returns false when the VML error status is non-zero after the call.
+    bool divide(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        if ( r.get_number_of_elements()!=x.get_number_of_elements()) r = x;
+
+        const MKL_Complex16* numerator = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        const MKL_Complex16* denominator = reinterpret_cast<const MKL_Complex16*>(y.begin());
+        MKL_Complex16* quotient = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzDiv(x.get_number_of_elements(), numerator, denominator, quotient);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Magnitude of every element of x, written to the real-valued array r (MKL vzAbs).
+    // r is re-created with the dimensions of x when its element count differs.
+    // Returns false when the VML error status is non-zero after the call.
+    bool absolute(const hoNDArray<GT_Complex16>& x, hoNDArray<double>& r)
+    {
+        const bool sizeMatches = ( r.get_number_of_elements()==x.get_number_of_elements() );
+        if ( !sizeMatches )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex16* input = reinterpret_cast<const MKL_Complex16*>(x.begin());
+
+        vzAbs(x.get_number_of_elements(), input, r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Magnitude of every element of x, delivered in the complex-valued array r.
+    // The magnitudes are computed into a temporary real array with vzAbs and then passed
+    // to r.copyFrom(); whatever copyFrom reports is not checked here.
+    // r is re-created with the dimensions of x when its element count differs.
+    // Returns false when the VML error status is non-zero after vzAbs.
+    bool absolute(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements()) r.create(x.get_dimensions());
+
+        // real-valued scratch buffer with the same dimensions as x
+        hoNDArray<double> magnitudes;
+        magnitudes.create(x.get_dimensions());
+
+        const MKL_Complex16* input = reinterpret_cast<const MKL_Complex16*>(x.begin());
+
+        vzAbs(x.get_number_of_elements(), input, magnitudes.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        r.copyFrom(magnitudes);
+
+        return true;
+    }
+
+    // Element-wise complex square root r = sqrt(x), computed by MKL's vzSqrt.
+    // r is re-created with the dimensions of x when its element count differs.
+    // Returns false when the VML error status is non-zero after the call.
+    bool sqrt(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r)
+    {
+        const bool sizeMatches = ( r.get_number_of_elements()==x.get_number_of_elements() );
+        if ( !sizeMatches )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex16* input = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        MKL_Complex16* output = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzSqrt(x.get_number_of_elements(), input, output);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Locates the element of x with the smallest absolute value as defined by BLAS izamin.
+    //   x   : input array
+    //   r   : receives the value of the located element
+    //   ind : receives its zero-based linear index into x
+    // Returns false for an empty array, when izamin reports no position, or when
+    // anything throws; true otherwise.
+    bool minAbsolute(const hoNDArray<GT_Complex16>& x, GT_Complex16& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            if ( n <= 0 ) return false;
+
+            MKL_INT incx = 1;
+
+            // izamin is the Fortran-style BLAS entry point: the position it returns is
+            // 1-based (0 means "nothing found"), so it has to be shifted before it can be
+            // used as an index into x.
+            MKL_INT pos = izamin(&n, reinterpret_cast<const MKL_Complex16*>(x.begin()), &incx);
+            if ( pos < 1 ) return false;
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Locates the element of x with the largest absolute value as defined by BLAS izamax.
+    //   x   : input array
+    //   r   : receives the value of the located element
+    //   ind : receives its zero-based linear index into x
+    // Returns false for an empty array, when izamax reports no position, or when
+    // anything throws; true otherwise.
+    bool maxAbsolute(const hoNDArray<GT_Complex16>& x, GT_Complex16& r, size_t& ind)
+    {
+        try
+        {
+            MKL_INT n = x.get_number_of_elements();
+            if ( n <= 0 ) return false;
+
+            MKL_INT incx = 1;
+
+            // izamax is the Fortran-style BLAS entry point: the position it returns is
+            // 1-based (0 means "nothing found"), so it has to be shifted before it can be
+            // used as an index into x.
+            MKL_INT pos = izamax(&n, reinterpret_cast<const MKL_Complex16*>(x.begin()), &incx);
+            if ( pos < 1 ) return false;
+
+            ind = (size_t)(pos-1);
+            r = x.at(ind);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Element-wise product of x with the complex conjugate of y, r = x .* conj(y),
+    // computed by MKL's vzMulByConj.
+    // x and y are expected to have equal element counts (GADGET_DEBUG_CHECK_RETURN_FALSE).
+    // r is assigned from x beforehand when its element count differs from that of x.
+    // Returns false when the VML error status is non-zero after the call.
+    bool multiplyConj(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+        if ( r.get_number_of_elements()!=x.get_number_of_elements()) r = x;
+
+        const MKL_Complex16* lhs = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        const MKL_Complex16* conjugated = reinterpret_cast<const MKL_Complex16*>(y.begin());
+        MKL_Complex16* out = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzMulByConj(x.get_number_of_elements(), lhs, conjugated, out);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Argument (phase angle) of every element of x, written to the real array r (MKL vzArg).
+    // r is re-created with the dimensions of x when its element count differs.
+    // Returns false when the VML error status is non-zero after the call.
+    bool argument(const hoNDArray<GT_Complex16>& x, hoNDArray<double>& r)
+    {
+        const bool sizeMatches = ( r.get_number_of_elements()==x.get_number_of_elements() );
+        if ( !sizeMatches )
+        {
+            r.create(x.get_dimensions());
+        }
+
+        const MKL_Complex16* input = reinterpret_cast<const MKL_Complex16*>(x.begin());
+
+        vzArg(x.get_number_of_elements(), input, r.begin());
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Element-wise complex conjugate r = conj(x), computed by MKL's vzConj.
+    // r is re-created with the dimensions of x when its element count differs.
+    // Returns false when the VML error status is non-zero after the call.
+    bool conjugate(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r)
+    {
+        if ( r.get_number_of_elements()!=x.get_number_of_elements()) r.create(x.get_dimensions());
+
+        const MKL_Complex16* input = reinterpret_cast<const MKL_Complex16*>(x.begin());
+        MKL_Complex16* output = reinterpret_cast<MKL_Complex16*>(r.begin());
+
+        vzConj(x.get_number_of_elements(), input, output);
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Adds DBL_EPSILON, in place, to every element of x whose magnitude is below DBL_EPSILON.
+    // Elements with a larger magnitude are left unchanged.
+    // Returns false if anything throws, true otherwise.
+    bool addEpsilon(hoNDArray<GT_Complex16>& x)
+    {
+        try
+        {
+            GT_Complex16* data = x.begin();
+            size_t count = x.get_number_of_elements();
+
+            long long k;
+
+            #pragma omp parallel for default(none) private(k) shared(count, data)
+            for (k=0; k<(long long)count; k++ )
+            {
+                GT_Complex16& elem = data[k];
+                if ( std::abs(elem) < DBL_EPSILON )
+                {
+                    elem += DBL_EPSILON;
+                }
+            }
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Euclidean (L2) norm of x, computed by BLAS dznrm2 over all elements with unit stride.
+    // The result is stored in r. Returns false if anything throws, true otherwise.
+    bool norm2(const hoNDArray<GT_Complex16>& x, double& r)
+    {
+        try
+        {
+            MKL_INT len = x.get_number_of_elements();
+            MKL_INT stride = 1;
+            const MKL_Complex16* data = reinterpret_cast<const MKL_Complex16*>(x.begin());
+
+            r = dznrm2(&len, data, &stride);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // L1 norm of a complex array, obtained in two steps: the element magnitudes are
+    // computed with absolute(), then handed to the real-valued norm1() overload, which
+    // stores the result in r.
+    // Returns false when either step fails or anything throws, true otherwise.
+    bool norm1(const hoNDArray<GT_Complex16>& x, double& r)
+    {
+        try
+        {
+            // magnitudes of x; sized by absolute()
+            hoNDArray<double> a;
+
+            GADGET_CHECK_RETURN_FALSE(absolute(x, a));
+            GADGET_CHECK_RETURN_FALSE(norm1(a, r));
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Conjugated dot product of x and y through BLAS zdotc (unit strides); the result
+    // is stored in r.
+    // x and y are expected to have equal element counts (GADGET_DEBUG_CHECK_RETURN_FALSE).
+    // Returns false if anything throws, true otherwise.
+    bool dotc(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, GT_Complex16& r)
+    {
+        try
+        {
+            GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()==y.get_number_of_elements());
+
+            MKL_INT len = x.get_number_of_elements();
+            MKL_INT strideX = 1;
+            MKL_INT strideY = 1;
+
+            const MKL_Complex16* dataX = reinterpret_cast<const MKL_Complex16*>(x.begin());
+            const MKL_Complex16* dataY = reinterpret_cast<const MKL_Complex16*>(y.begin());
+            MKL_Complex16* result = reinterpret_cast<MKL_Complex16*>(&r);
+
+            zdotc(result, &len, dataX, &strideX, dataY, &strideY);
+        }
+        catch(...)
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // 2D convolution of x with the kernel ker using MKL's VSL convolution tasks.
+    //   x   : input; its first two dimensions (RO, E1) are convolved, every further
+    //         dimension is treated as an independent 2D slice
+    //   ker : convolution kernel; its first two dimensions are used
+    //   z   : output, assigned from x first when its dimensions differ from those of x
+    // The output window starts at (kerRO/2, kerE1/2) of the full convolution and has the
+    // shape of one input slice.
+    // Returns false if x is empty, if any VSL call reports an error, or if anything throws.
+    // The VSL task is released on every path once it has been created.
+    bool conv2(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& ker, hoNDArray<GT_Complex16>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+
+            // an empty slice would make the division below divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1>0);
+
+            size_t num = x.get_number_of_elements()/(RO*E1);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[2];
+            kerShape[0] = kerRO; kerShape[1] = kerE1;
+
+            MKL_INT xshape[2];
+            xshape[0] = RO; xshape[1] = E1;
+
+            MKL_INT start[2];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+
+            MKL_INT kerStride[2], xstride[2], zstride[2];
+            kerStride[0] = 1; kerStride[1] = kerRO;
+            xstride[0] = 1; xstride[1] = RO;
+            zstride[0] = 1; zstride[1] = RO;
+
+            const MKL_Complex16* pX = reinterpret_cast<const MKL_Complex16*>(x.begin());
+            const MKL_Complex16* pKer = reinterpret_cast<const MKL_Complex16*>(ker.begin());
+            MKL_Complex16* pZ = reinterpret_cast<MKL_Complex16*>(z.begin());
+
+            if ( num == 1 )
+            {
+                status = vslzConvNewTask(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status != VSL_STATUS_OK ) vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslzConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+
+                // release the task before the status check so that a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                // the kernel is bound to the task once and reused for every slice
+                status = vslzConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 2, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status != VSL_STATUS_OK ) vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                long long n;
+
+                // Failures are counted through a reduction: a single shared status variable
+                // would be written concurrently and would only keep one iteration's result.
+                // NOTE(review): one task object is executed from several threads at once;
+                // confirm against the MKL documentation that this is supported.
+                int numFailed = 0;
+
+                #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, xstride, pZ, zstride) reduction(+:numFailed)
+                for ( n=0; n<(long long)num; n++ )
+                {
+                    int execStatus = vslzConvExecX(task, pX+n*RO*E1, xstride, pZ+n*RO*E1, zstride);
+                    if ( execStatus != VSL_STATUS_OK ) numFailed++;
+                }
+
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv2(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& ker, hoNDArray<GT_Complex16>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // 3D convolution of x with the kernel ker using MKL's VSL convolution tasks.
+    //   x   : input; its first three dimensions (RO, E1, E2) are convolved, every further
+    //         dimension is treated as an independent 3D volume
+    //   ker : convolution kernel; its first three dimensions are used
+    //   z   : output, assigned from x first when its dimensions differ from those of x
+    // The output window starts at (kerRO/2, kerE1/2, kerE2/2) of the full convolution and
+    // has the shape of one input volume.
+    // Returns false if x is empty, if any VSL call reports an error, or if anything throws.
+    // The VSL task is released on every path once it has been created.
+    bool conv3(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& ker, hoNDArray<GT_Complex16>& z)
+    {
+        try
+        {
+            if ( !z.dimensions_equal(&x) )
+            {
+                z = x;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            size_t kerRO = ker.get_size(0);
+            size_t kerE1 = ker.get_size(1);
+            size_t kerE2 = ker.get_size(2);
+
+            // an empty volume would make the division below divide by zero
+            GADGET_CHECK_RETURN_FALSE(RO*E1*E2>0);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*E2);
+
+            int status;
+            VSLConvTaskPtr task;
+
+            MKL_INT kerShape[3];
+            kerShape[0] = kerRO; kerShape[1] = kerE1; kerShape[2] = kerE2;
+
+            MKL_INT xshape[3];
+            xshape[0] = RO; xshape[1] = E1; xshape[2] = E2;
+
+            MKL_INT start[3];
+            start[0] = kerRO/2;
+            start[1] = kerE1/2;
+            start[2] = kerE2/2;
+
+            MKL_INT kerStride[3], xstride[3], zstride[3];
+            kerStride[0] = 1; kerStride[1] = kerRO; kerStride[2] = kerRO*kerE1;
+            xstride[0] = 1; xstride[1] = RO; xstride[2] = RO*E1;
+            zstride[0] = 1; zstride[1] = RO; zstride[2] = RO*E1;
+
+            const MKL_Complex16* pX = reinterpret_cast<const MKL_Complex16*>(x.begin());
+            const MKL_Complex16* pKer = reinterpret_cast<const MKL_Complex16*>(ker.begin());
+            MKL_Complex16* pZ = reinterpret_cast<MKL_Complex16*>(z.begin());
+
+            if ( num == 1 )
+            {
+                status = vslzConvNewTask(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status != VSL_STATUS_OK ) vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslzConvExec(task, pKer, kerStride, pX, xstride, pZ, zstride);
+
+                // release the task before the status check so that a failure does not leak it
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+            }
+            else
+            {
+                // the kernel is bound to the task once and reused for every volume
+                status = vslzConvNewTaskX(&task, VSL_CONV_MODE_AUTO, 3, kerShape, xshape, xshape, pKer, kerStride);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                status = vslConvSetStart(task, start);
+                if ( status != VSL_STATUS_OK ) vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(status==VSL_STATUS_OK);
+
+                long long n;
+
+                // Failures are counted through a reduction: a single shared status variable
+                // would be written concurrently and would only keep one iteration's result.
+                // NOTE(review): one task object is executed from several threads at once;
+                // confirm against the MKL documentation that this is supported.
+                int numFailed = 0;
+
+                #pragma omp parallel for default(none) private(n) shared(num, task, pX, RO, E1, E2, xstride, pZ, zstride) reduction(+:numFailed)
+                for ( n=0; n<(long long)num; n++ )
+                {
+                    int execStatus = vslzConvExecX(task, pX+n*RO*E1*E2, xstride, pZ+n*RO*E1*E2, zstride);
+                    if ( execStatus != VSL_STATUS_OK ) numFailed++;
+                }
+
+                vslConvDeleteTask(&task);
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in conv3(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& ker, hoNDArray<GT_Complex16>& z) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Element-wise reciprocal of a double-precision complex array: r[k] = 1/x[k].
+    // When the dimensions of r differ from those of x, r is first assigned from x;
+    // every element of r is then overwritten by the loop below.
+    // Returns true on success. If anything throws, GADGET_ERROR_MSG is invoked and
+    // false is returned.
+    bool inv(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r)
+    {
+        try
+        {
+            const bool sameShape = r.dimensions_equal(&x);
+            if ( !sameShape )
+            {
+                r = x;
+            }
+
+            long long numElems = x.get_number_of_elements();
+            GT_Complex16 one(1.0);
+
+            const GT_Complex16* src = x.begin();
+            GT_Complex16* dst = r.begin();
+
+            // loop index declared up front and listed as private in the pragma
+            long long k;
+
+            #pragma omp parallel for default(none) private(k) shared(numElems, src, dst, one)
+            for ( k=0; k<numElems; k++ )
+            {
+                dst[k] = one/src[k];
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors happened in inv(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // ----------------------------------------------------------------------------------------
+    // templated functions
+    // ----------------------------------------------------------------------------------------
+
+    // Sums x over its last dimension.
+    //   x : input array with at least one dimension
+    //   r : output; (re)created with the dimensions of x minus the last one when its
+    //       dimensions differ
+    // When the last dimension has size <= 1 the data of x is copied to r unchanged.
+    // Otherwise r starts as the first block of x and every further block along the last
+    // dimension is added to it with add().
+    // Returns false when x has no dimensions, when an add() call fails, or when anything
+    // throws (the latter is also reported through GADGET_ERROR_MSG); true otherwise.
+    template<typename T> 
+    bool sumOverLastDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            size_t NDim = dim->size();
+
+            // without any dimension NDim-1 would wrap around
+            GADGET_CHECK_RETURN_FALSE(NDim>0);
+
+            // result dimensions: all dimensions of x except the last one
+            std::vector<size_t> dimR(dim->begin(), dim->end()-1);
+
+            if ( !r.dimensions_equal(&dimR) )
+            {
+                r.create(&dimR);
+            }
+
+            size_t lastDim = x.get_size(NDim-1);
+
+            if ( lastDim <= 1 )
+            {
+                memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+                return true;
+            }
+
+            size_t NR = r.get_number_of_elements();
+            T* pA = const_cast<T*>(x.begin());
+            T* pR = r.begin();
+
+            // seed the running sum with the first block of x
+            memcpy(pR, pA, sizeof(T)*NR);
+
+            // sum over the last dim; tmp is re-pointed at each block of x in turn
+            hoNDArray<T> tmp;
+            for ( size_t d=1; d<lastDim; d++ )
+            {
+                tmp.create(&dimR, pA+d*NR);
+                GADGET_CHECK_RETURN_FALSE(add(tmp, r, r));
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOverLastDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Sums x over its second-to-last dimension.
+    //   x : input array
+    //   r : output; (re)created with the dimensions of x minus the second-to-last one
+    //       when its dimensions differ
+    // Arrays with fewer than two dimensions are accepted but r is left untouched.
+    // When the second-to-last dimension has size <= 1 the data of x is copied to r.
+    // Otherwise, for every index of the last dimension, the blocks along the
+    // second-to-last dimension are accumulated with add(); the status returned by add()
+    // is not inspected.
+    // Returns false (after GADGET_ERROR_MSG) only when something throws.
+    template<typename T> 
+    bool sumOverSecondLastDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            size_t NDim = dim->size();
+
+            if ( NDim < 2 ) return true;
+
+            // dimensions of one partial sum: everything in front of the last two dimensions
+            std::vector<size_t> dimRInternal(dim->begin(), dim->begin()+(NDim-2));
+
+            // dimensions of the result: the leading dimensions followed by the last one
+            std::vector<size_t> dimR(dimRInternal);
+            dimR.push_back((*dim)[NDim-1]);
+
+            if ( !r.dimensions_equal(&dimR) )
+            {
+                r.create(&dimR);
+            }
+
+            size_t numSecondLast = x.get_size(NDim-2);
+            if ( numSecondLast <= 1 )
+            {
+                memcpy(r.begin(), x.begin(), x.get_number_of_bytes());
+                return true;
+            }
+
+            size_t numLast = x.get_size(NDim-1);
+            size_t xBlock = x.get_number_of_elements()/numLast;  // elements of x per last-dimension index
+            size_t rBlock = r.get_number_of_elements()/numLast;  // elements of r per last-dimension index
+            T* pSrc = const_cast<T*>(x.begin());
+            T* pDst = r.begin();
+
+            int blk;
+            #pragma omp parallel default(none) private(blk) shared(numLast, numSecondLast, xBlock, rBlock, pSrc, pDst, dimRInternal)
+            {
+                // per-thread array objects, re-pointed at existing memory in every iteration
+                hoNDArray<T> acc, term;
+
+                #pragma omp for
+                for ( blk=0; blk<(int)numLast; blk++ )
+                {
+                    T* dstBlock = pDst+blk*rBlock;
+                    T* srcBlock = pSrc+blk*xBlock;
+
+                    // seed the running sum with the first block, then add the others onto it
+                    memcpy(dstBlock, srcBlock, sizeof(T)*rBlock);
+                    acc.create(&dimRInternal, dstBlock);
+
+                    for ( size_t s=1; s<numSecondLast; s++ )
+                    {
+                        term.create(&dimRInternal, srcBlock+s*rBlock);
+                        add(acc, term, acc);
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOverSecondLastDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Multiplies every block of y along its last dimension element-wise by x,
+    // e.g. x is 3D and y is 4D array, r(:,:,:,n) = y(:,:,:,n) .* x
+    //   x : factor with one dimension fewer than y
+    //   y : input whose last dimension enumerates the blocks
+    //   r : output; (re)created with the dimensions of y when its dimensions differ
+    // When the last dimension of y has size <= 1 a single multiply(x, y, r) is performed.
+    // Returns false when the dimension counts do not fit, when y does not consist of
+    // blocks of x's size, when any multiply() call fails, or when anything throws.
+    template<typename T> 
+    bool multiplyOverLastDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()==NDim-1);
+
+            if ( !r.dimensions_equal(dimY.get()) )
+            {
+                r.create(dimY);
+            }
+
+            if ( y.get_size(NDim-1) <= 1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(multiply(x, y, r));
+                return true;
+            }
+
+            size_t lastDim = y.get_size(NDim-1);
+            size_t N = x.get_number_of_elements();
+
+            // every block addressed below must lie inside y (and therefore inside r)
+            GADGET_CHECK_RETURN_FALSE(N*lastDim==y.get_number_of_elements());
+
+            const T* pY = y.begin();
+            T* pR = r.begin();
+
+            int d;
+
+            // number of failed multiply() calls, combined across threads by the reduction
+            int numFailed = 0;
+
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(d) shared(dimX, lastDim, N, pY, pR) reduction(+:numFailed)
+            #else
+                #pragma omp parallel default(none) private(d) shared(x, dimX, lastDim, N, pY, pR) reduction(+:numFailed)
+            #endif
+            {
+                // per-thread array objects, re-pointed at one block of y / r per iteration
+                hoNDArray<T> tmpY, tmpR;
+
+                #pragma omp for
+                for ( d=0; d<(int)lastDim; d++ )
+                {
+                    tmpY.create(dimX.get(), const_cast<T*>(pY+d*N));
+                    tmpR.create(dimX.get(), pR+d*N);
+                    if ( !multiply(x, tmpY, tmpR) ) numFailed++;
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(numFailed==0);
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in multiplyOverLastDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Divides every block of y along its last dimension element-wise by x,
+    // e.g. x is 3D and y is 4D array, r(:,:,:,n) = y(:,:,:,n) ./ x
+    //   x : divisor with one dimension fewer than y
+    //   y : input whose last dimension enumerates the blocks
+    //   r : output; (re)created with the dimensions of y when its dimensions differ
+    // When the last dimension of y has size <= 1 a single divide(y, x, r) is performed.
+    // The status of the per-block divide() calls is not inspected.
+    // Returns false when the dimension counts do not fit, when the single divide fails,
+    // or when anything throws.
+    template<typename T> 
+    bool divideOverLastDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()==NDim-1);
+
+            if ( !r.dimensions_equal(dimY.get()) ) r.create(dimY);
+
+            size_t numBlocks = y.get_size(NDim-1);
+
+            if ( numBlocks <= 1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(divide(y, x, r));
+                return true;
+            }
+
+            size_t blockLen = x.get_number_of_elements();
+            T* yData = const_cast<T*>(y.begin());
+            T* rData = r.begin();
+
+            int blk;
+
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(blk) shared(dimX, numBlocks, blockLen, yData, rData)
+            #else
+                #pragma omp parallel default(none) private(blk) shared(x, dimX, numBlocks, blockLen, yData, rData)
+            #endif
+            {
+                // per-thread array objects, re-pointed at one block of y / r per iteration
+                hoNDArray<T> yBlock, rBlock;
+
+                #pragma omp for
+                for ( blk=0; blk<(int)numBlocks; blk++ )
+                {
+                    yBlock.create(dimX, yData+blk*blockLen);
+                    rBlock.create(dimX, rData+blk*blockLen);
+                    divide(yBlock, x, rBlock);
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in divideOverLastDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Sums x over its first dimension.
+    //   x : input array
+    //   r : output; created with the dimensions of x, the first one replaced by 1
+    // Each output element is the sum of RO consecutive input elements, where RO is the
+    // size of the first dimension.
+    // Returns false (after GADGET_ERROR_MSG) when something throws, true otherwise.
+    template<typename T> 
+    bool sumOver1stDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            size_t RO = x.get_size(0);
+            size_t num = x.get_number_of_elements()/(RO);
+
+            // result dimensions: those of x with the first one collapsed to 1
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            std::vector<size_t> dimAve(*dim);
+            dimAve[0] = 1;
+            r.create(&dimAve);
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            int n;
+            #pragma omp parallel for default(none) private(n) shared(RO, num, pX, pR)
+            for ( n=0; n<(int)num; n++ )
+            {
+                // the RO contiguous elements that collapse into output element n
+                const T* line = pX+n*RO;
+
+                T total = line[0];
+                for (size_t ro=1; ro<RO; ro++ )
+                {
+                    total += line[ro];
+                }
+
+                pR[n] = total;
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOver1stDimension(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Sums x over its second dimension.
+    //   x : input array
+    //   r : output; created with the dimensions of x, the second one replaced by 1.
+    //       For an x with fewer than two dimensions r is simply assigned from x.
+    // For every [RO x E1] block of x the E1 lines of length RO are accumulated into the
+    // matching line of r with Gadgetron::add(); the status of add() is not inspected.
+    // Returns false (after GADGET_ERROR_MSG) when something throws, true otherwise.
+    template<typename T> 
+    bool sumOver2ndDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            size_t NDim = x.get_number_of_dimensions();
+
+            if ( NDim < 2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+
+            size_t num = x.get_number_of_elements()/(RO*E1);
+
+            // result dimensions: those of x with the second one collapsed to 1
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            std::vector<size_t> dimAve(*dim);
+            dimAve[1] = 1;
+            r.create(&dimAve);
+
+            int n;
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, num)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, num, x, r)
+            #endif
+            for ( n=0; n<(int)num; n++ )
+            {
+                // start of the n-th [RO x E1] block of x
+                const T* block = x.begin()+n*RO*E1;
+
+                // running sum, laid directly over the n-th output line of r
+                hoNDArray<T> xsum(RO, const_cast<T*>(r.begin()+n*RO));
+                memcpy(xsum.begin(), block, xsum.get_number_of_bytes());
+
+                for (size_t e1=1; e1<E1; e1++ )
+                {
+                    hoNDArray<T> x1D(RO, const_cast<T*>(block+e1*RO));
+                    Gadgetron::add(x1D, xsum, xsum);
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOver2ndDimension(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Sums x over its third dimension.
+    //   x : input array
+    //   r : output; created with the dimensions of x, the third one replaced by 1.
+    //       For an x with fewer than three dimensions r is simply assigned from x.
+    // For every [RO x E1 x CHA] block of x the CHA planes of size RO*E1 are accumulated
+    // into the matching plane of r with Gadgetron::add(); the status of add() is not
+    // inspected.
+    // Returns false (after GADGET_ERROR_MSG) when something throws, true otherwise.
+    template<typename T> 
+    bool sumOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            size_t NDim = x.get_number_of_dimensions();
+
+            if ( NDim < 3 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*CHA);
+
+            // result dimensions: those of x with the third one collapsed to 1
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            std::vector<size_t> dimAve(*dim);
+            dimAve[2] = 1;
+            r.create(&dimAve);
+
+            int n;
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, CHA, num)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, CHA, num, x, r)
+            #endif 
+            for ( n=0; n<(int)num; n++ )
+            {
+                // start of the n-th [RO x E1 x CHA] block of x
+                const T* block = x.begin()+n*RO*E1*CHA;
+
+                // running sum, laid directly over the n-th output plane of r
+                hoNDArray<T> xsum(RO, E1, const_cast<T*>(r.begin()+n*RO*E1));
+                memcpy(xsum.begin(), block, xsum.get_number_of_bytes());
+
+                for (size_t cha=1; cha<CHA; cha++ )
+                {
+                    hoNDArray<T> x2D(RO, E1, const_cast<T*>(block+cha*RO*E1));
+                    Gadgetron::add(x2D, xsum, xsum);
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOver3rdDimension(...) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Sums x over its fourth dimension.
+    //   x : input array
+    //   r : output; created with the dimensions of x, the fourth one replaced by 1.
+    //       For an x with fewer than four dimensions r is simply assigned from x.
+    // For every [RO x E1 x CHA x N] block of x the N volumes of size RO*E1*CHA are
+    // accumulated into the matching volume of r with Gadgetron::add(); the status of
+    // add() is not inspected.
+    // Returns false (after GADGET_ERROR_MSG) when something throws, true otherwise.
+    template<typename T> bool sumOver4thDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            size_t NDim = x.get_number_of_dimensions();
+
+            if ( NDim < 4 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+            size_t N = x.get_size(3);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*CHA*N);
+
+            // result dimensions: those of x with the fourth one collapsed to 1
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            std::vector<size_t> dimAve(*dim);
+            dimAve[3] = 1;
+            r.create(&dimAve);
+
+            int n;
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, CHA, N, num)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, CHA, N, num, x, r)
+            #endif
+            for ( n=0; n<(int)num; n++ )
+            {
+                // start of the n-th [RO x E1 x CHA x N] block of x
+                const T* block = x.begin()+n*RO*E1*CHA*N;
+
+                // running sum, laid directly over the n-th output volume of r
+                hoNDArray<T> xsum(RO, E1, CHA, const_cast<T*>(r.begin()+n*RO*E1*CHA));
+                memcpy(xsum.begin(), block, xsum.get_number_of_bytes());
+
+                for (size_t nn=1; nn<N; nn++ )
+                {
+                    hoNDArray<T> x3D(RO, E1, CHA, const_cast<T*>(block+nn*RO*E1*CHA));
+                    Gadgetron::add(x3D, xsum, xsum);
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOver4thDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Sums x over its fifth dimension.
+    //   x : input array
+    //   r : output; created with the dimensions of x, the fifth one replaced by 1.
+    //       For an x with fewer than five dimensions r is simply assigned from x.
+    // For every [RO x E1 x CHA x N x S] block of x the S sub-arrays of size RO*E1*CHA*N
+    // are accumulated into the matching sub-array of r with Gadgetron::add(); the status
+    // of add() is not inspected. The loop over blocks is only parallelised when there
+    // are more than four of them.
+    // Returns false (after GADGET_ERROR_MSG) when something throws, true otherwise.
+    template<typename T> bool sumOver5thDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            size_t NDim = x.get_number_of_dimensions();
+
+            if ( NDim < 5 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+            size_t N = x.get_size(3);
+            size_t S = x.get_size(4);
+
+            size_t num = x.get_number_of_elements()/(RO*E1*CHA*N*S);
+
+            // result dimensions: those of x with the fifth one collapsed to 1
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+            std::vector<size_t> dimAve(*dim);
+            dimAve[4] = 1;
+            r.create(&dimAve);
+
+            int n;
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, CHA, N, S, num) if (num > 4)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(RO, E1, CHA, N, S, num, x, r) if (num > 4)
+            #endif
+            for ( n=0; n<(int)num; n++ )
+            {
+                // start of the n-th [RO x E1 x CHA x N x S] block of x
+                const T* block = x.begin()+n*RO*E1*CHA*N*S;
+
+                // running sum, laid directly over the n-th output sub-array of r
+                hoNDArray<T> xsum(RO, E1, CHA, N, const_cast<T*>(r.begin()+n*RO*E1*CHA*N));
+                memcpy(xsum.begin(), block, xsum.get_number_of_bytes());
+
+                for (size_t s=1; s<S; s++ )
+                {
+                    hoNDArray<T> x4D(RO, E1, CHA, N, const_cast<T*>(block+s*RO*E1*CHA*N));
+                    Gadgetron::add(x4D, xsum, xsum);
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in sumOver5thDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    // Multiplies every 2D plane along the third dimension of y4D by the matching 2D plane of x3D,
+    // e.g. x is 3D and y is 4D array, r(:,:,n,:) = y(:,:,n,:) .* x3D
+    //   x3D : factor with at least 3 dimensions; its first two sizes must equal those of y4D
+    //   y4D : input with at least 4 dimensions
+    //   r   : output; (re)created with the dimensions of y4D when its dimensions differ
+    // Plane t of x3D is applied to all sz planes of the t-th [N2D x sz] block of y4D, where
+    // N2D is the size of one 2D plane and sz the size of the third dimension of y4D.
+    // When sz is 1 a single multiply(x3D, y4D, r) is performed.
+    // Returns false when a dimension check fails, when the input is empty, when x3D holds
+    // fewer planes than y4D has blocks, when any multiply() call fails, or when anything
+    // throws.
+    template<typename T> 
+    bool multiplyOver3rdDimension(const hoNDArray<T>& x3D, const hoNDArray<T>& y4D, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x3D.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y4D.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()>=3);
+            GADGET_CHECK_RETURN_FALSE(NDim>=4);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[0]==(*dimY)[0]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[1]==(*dimY)[1]);
+
+            if ( !r.dimensions_equal(dimY.get()) )
+            {
+                r.create(dimY);
+            }
+
+            // sizes are kept as size_t so that the element offsets below cannot overflow an int
+            size_t N2D = x3D.get_size(0)*x3D.get_size(1);
+            size_t sz = y4D.get_size(2);
+
+            if ( sz == 1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(multiply(x3D, y4D, r));
+                return true;
+            }
+
+            // an empty plane or an empty third dimension would divide by zero below
+            GADGET_CHECK_RETURN_FALSE(N2D*sz>0);
+
+            size_t st = y4D.get_number_of_elements()/(N2D*sz);
+
+            // one plane of x3D is read per block of y4D
+            GADGET_CHECK_RETURN_FALSE(x3D.get_number_of_elements()>=N2D*st);
+
+            const T* pX = x3D.begin();
+            const T* pY = y4D.begin();
+            T* pR = r.begin();
+
+            std::vector<size_t> dim2D(2);
+            dim2D[0] = (*dimY)[0];
+            dim2D[1] = (*dimY)[1];
+
+            long long t;
+
+            // number of failed multiply() calls, combined across threads by the reduction
+            int numFailed = 0;
+
+            #pragma omp parallel for default(none) private(t) shared(N2D, sz, st, dim2D, pX, pY, pR) reduction(+:numFailed)
+            for ( t=0; t<(long long)st; t++ )
+            {
+                // array objects re-pointed at existing memory; no data is copied
+                hoNDArray<T> tmpX, tmpY, tmpR;
+                tmpX.create(&dim2D, const_cast<T*>(pX+t*N2D));
+
+                for ( size_t z=0; z<sz; z++ )
+                {
+                    tmpY.create(&dim2D, const_cast<T*>(pY+t*N2D*sz+z*N2D));
+                    tmpR.create(&dim2D, pR+t*N2D*sz+z*N2D);
+                    if ( !multiply(tmpX, tmpY, tmpR) ) numFailed++;
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(numFailed==0);
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in multiplyOver3rdDimension(const hoNDArray<T>& x3D, const hoNDArray<T>& y4D, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Multiplies every 3D block of y5D with a 3D block of x4D, element by element.
+    // With N3D = RO*E1*CHA and N = size(3) of y5D, block (n, t) of y5D is multiplied
+    // by block t of x4D for every n in [0, N) and stored at the same position of r.
+    //
+    // x4D : at least 4 dimensions; its first three sizes must match those of y5D
+    // y5D : at least 4 dimensions; must hold at least num*N blocks, num = numel(x4D)/N3D
+    // r   : output; (re)created with the dimensions of y5D when they differ
+    // Returns false if a check fails or an exception is caught, true otherwise.
+    template<typename T>
+    bool multiplyOver4thDimension(const hoNDArray<T>& x4D, const hoNDArray<T>& y5D, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x4D.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y5D.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()>=4);
+            // (*dimY)[3] is read below, so y5D needs at least 4 dimensions as well
+            GADGET_CHECK_RETURN_FALSE(NDim>=4);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[0]==(*dimY)[0]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[1]==(*dimY)[1]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[2]==(*dimY)[2]);
+
+            size_t RO = (*dimX)[0];
+            size_t E1 = (*dimX)[1];
+            size_t CHA = (*dimX)[2];
+
+            // RO*E1*CHA is used as a divisor below
+            GADGET_CHECK_RETURN_FALSE(RO*E1*CHA>0);
+
+            int t, N3D = RO*E1*CHA;
+
+            size_t N = (*dimY)[3];
+            size_t num = x4D.get_number_of_elements()/(RO*E1*CHA);
+
+            // the loops below touch num*N blocks of y5D and r
+            GADGET_CHECK_RETURN_FALSE(y5D.get_number_of_elements() >= num*N*RO*E1*CHA);
+
+            if ( !r.dimensions_equal(dimY.get()) )
+            {
+                r.create(dimY);
+            }
+
+            const T* pX = x4D.begin();
+            const T* pY = y5D.begin();
+            T* pR = r.begin();
+
+            std::vector<size_t> dim3D(3);
+            dim3D[0] = RO;
+            dim3D[1] = E1;
+            dim3D[2] = CHA;
+
+            #pragma omp parallel for default(none) private(t) shared(N3D, N, dim3D, pX, pY, pR, num)
+            for ( t=0; t<(int)num; t++ )
+            {
+                // the temporaries wrap blocks of the existing buffers via create(dims, pointer)
+                hoNDArray<T> tmpX, tmpY, tmpR;
+                tmpX.create(&dim3D, const_cast<T*>(pX+(size_t)t*N3D));
+
+                for ( size_t n=0; n<N; n++ )
+                {
+                    // offsets are computed in size_t so they cannot overflow int on large arrays
+                    size_t offset = ((size_t)t*N + n)*N3D;
+
+                    tmpY.create(&dim3D, const_cast<T*>(pY+offset));
+                    tmpR.create(&dim3D, pR+offset);
+                    multiply(tmpX, tmpY, tmpR);
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in multiplyOver4thDimension(const hoNDArray<float>& x4D, const hoNDArray<float>& y5D, hoNDArray<float>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Same block-wise product as multiplying over the 4th dimension, but index n of
+    // that dimension is skipped: block (n, t) is not multiplied.
+    //
+    // x4D     : at least 4 dimensions; its first three sizes must match those of y5D
+    // y5D     : at least 4 dimensions; must hold at least num*N blocks, num = numel(x4D)/N3D
+    // n       : index along size(3) of y5D that is left out of the multiplication
+    // r       : output; may share its buffer with y5D (in-place); otherwise it is
+    //           (re)created with the dimensions of y5D when they differ
+    // copyY2R : when r and y5D are different buffers, copy the skipped block from y5D to r
+    // Returns false if a check fails or an exception is caught, true otherwise.
+    template<typename T>
+    bool multiplyOver4thDimensionExcept(const hoNDArray<T>& x4D, const hoNDArray<T>& y5D, size_t n, hoNDArray<T>& r, bool copyY2R)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x4D.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y5D.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()>=4);
+            // (*dimY)[3] is read below, so y5D needs at least 4 dimensions as well
+            GADGET_CHECK_RETURN_FALSE(NDim>=4);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[0]==(*dimY)[0]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[1]==(*dimY)[1]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[2]==(*dimY)[2]);
+
+            size_t RO = (*dimX)[0];
+            size_t E1 = (*dimX)[1];
+            size_t CHA = (*dimX)[2];
+
+            // RO*E1*CHA is used as a divisor below
+            GADGET_CHECK_RETURN_FALSE(RO*E1*CHA>0);
+
+            int t, N3D = RO*E1*CHA;
+
+            size_t N = (*dimY)[3];
+            size_t num = x4D.get_number_of_elements()/(RO*E1*CHA);
+
+            // the loops below touch num*N blocks of y5D and r
+            GADGET_CHECK_RETURN_FALSE(y5D.get_number_of_elements() >= num*N*RO*E1*CHA);
+
+            const T* pX = x4D.begin();
+            const T* pY = y5D.begin();
+            T* pR = r.begin();
+
+            // only reallocate r when it is a separate buffer; in-place calls keep y5D's storage
+            if ( (pR!=pY) && (!r.dimensions_equal(dimY.get())) )
+            {
+                r.create(dimY);
+                pR = r.begin();
+            }
+
+            std::vector<size_t> dim3D(3);
+            dim3D[0] = RO;
+            dim3D[1] = E1;
+            dim3D[2] = CHA;
+
+            #pragma omp parallel for default(none) private(t) shared(N3D, N, dim3D, pX, pY, pR, num, n, copyY2R)
+            for ( t=0; t<(int)num; t++ )
+            {
+                hoNDArray<T> tmpX, tmpY, tmpR;
+                tmpX.create(&dim3D, const_cast<T*>(pX+(size_t)t*N3D));
+
+                for ( size_t z=0; z<N; z++ )
+                {
+                    // offsets are computed in size_t so they cannot overflow int on large arrays
+                    size_t offset = ((size_t)t*N + z)*N3D;
+
+                    if ( z != n )
+                    {
+                        tmpY.create(&dim3D, const_cast<T*>(pY+offset));
+                        tmpR.create(&dim3D, pR+offset);
+                        multiply(tmpX, tmpY, tmpR);
+                    }
+                    else if ( (pR != pY) && copyY2R )
+                    {
+                        // skipped block: carry the untouched values of y5D over to r
+                        memcpy(pR+offset, pY+offset, sizeof(T)*N3D);
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in multiplyOver4thDimensionExcept(const hoNDArray<float>& x4D, const hoNDArray<float>& y5D, size_t n, hoNDArray<float>& r, bool copyY2R) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Multiplies every 4D block of y with a 4D block of x, element by element.
+    // With N4D = RO*E1*E2*CHA and N = size(4) of y, block (n, t) of y is multiplied
+    // by block t of x for every n in [0, N) and stored at the same position of r.
+    //
+    // x : at least 5 dimensions; its first four sizes must match those of y
+    // y : at least 5 dimensions; must hold at least num*N blocks, num = numel(x)/N4D
+    // r : output; (re)created with the dimensions of y when they differ
+    // Returns false if a check fails or an exception is caught, true otherwise.
+    template<typename T>
+    bool multiplyOver5thDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()>=5);
+            // (*dimY)[4] is read below, so y needs at least 5 dimensions as well
+            GADGET_CHECK_RETURN_FALSE(NDim>=5);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[0]==(*dimY)[0]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[1]==(*dimY)[1]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[2]==(*dimY)[2]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[3]==(*dimY)[3]);
+
+            size_t RO = (*dimX)[0];
+            size_t E1 = (*dimX)[1];
+            size_t E2 = (*dimX)[2];
+            size_t CHA = (*dimX)[3];
+
+            // RO*E1*E2*CHA is used as a divisor below
+            GADGET_CHECK_RETURN_FALSE(RO*E1*E2*CHA>0);
+
+            int t, N4D = RO*E1*E2*CHA;
+
+            size_t N = (*dimY)[4];
+            size_t num = x.get_number_of_elements()/N4D;
+
+            // the loops below touch num*N blocks of y and r
+            GADGET_CHECK_RETURN_FALSE(y.get_number_of_elements() >= num*N*RO*E1*E2*CHA);
+
+            if ( !r.dimensions_equal(dimY.get()) )
+            {
+                r.create(dimY);
+            }
+
+            const T* pX = x.begin();
+            const T* pY = y.begin();
+            T* pR = r.begin();
+
+            std::vector<size_t> dim4D(4);
+            dim4D[0] = RO;
+            dim4D[1] = E1;
+            dim4D[2] = E2;
+            dim4D[3] = CHA;
+
+            #pragma omp parallel for default(none) private(t) shared(N4D, N, dim4D, pX, pY, pR, num)
+            for ( t=0; t<(int)num; t++ )
+            {
+                // the temporaries wrap blocks of the existing buffers via create(dims, pointer)
+                hoNDArray<T> tmpX, tmpY, tmpR;
+                tmpX.create(&dim4D, const_cast<T*>(pX+(size_t)t*N4D));
+
+                for ( size_t n=0; n<N; n++ )
+                {
+                    // offsets are computed in size_t so they cannot overflow int on large arrays
+                    size_t offset = ((size_t)t*N + n)*N4D;
+
+                    tmpY.create(&dim4D, const_cast<T*>(pY+offset));
+                    tmpR.create(&dim4D, pR+offset);
+                    multiply(tmpX, tmpY, tmpR);
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in multiplyOver5thDimension(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Same block-wise product as multiplying over the 5th dimension, but index n of
+    // that dimension is skipped: block (n, t) is not multiplied.
+    //
+    // x       : at least 5 dimensions; its first four sizes must match those of y
+    // y       : at least 5 dimensions; must hold at least num*N blocks, num = numel(x)/N4D
+    // n       : index along size(4) of y that is left out of the multiplication
+    // r       : output; may share its buffer with y (in-place); otherwise it is
+    //           (re)created with the dimensions of y when they differ
+    // copyY2R : when r and y are different buffers, copy the skipped block from y to r
+    // Returns false if a check fails or an exception is caught, true otherwise.
+    template<typename T>
+    bool multiplyOver5thDimensionExcept(const hoNDArray<T>& x, const hoNDArray<T>& y, size_t n, hoNDArray<T>& r, bool copyY2R)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+            boost::shared_ptr< std::vector<size_t> > dimY = y.get_dimensions();
+
+            size_t NDim = dimY->size();
+
+            GADGET_CHECK_RETURN_FALSE(dimX->size()>=5);
+            // (*dimY)[4] is read below, so y needs at least 5 dimensions as well
+            GADGET_CHECK_RETURN_FALSE(NDim>=5);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[0]==(*dimY)[0]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[1]==(*dimY)[1]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[2]==(*dimY)[2]);
+            GADGET_CHECK_RETURN_FALSE((*dimX)[3]==(*dimY)[3]);
+
+            size_t RO = (*dimX)[0];
+            size_t E1 = (*dimX)[1];
+            size_t E2 = (*dimX)[2];
+            size_t CHA = (*dimX)[3];
+
+            // RO*E1*E2*CHA is used as a divisor below
+            GADGET_CHECK_RETURN_FALSE(RO*E1*E2*CHA>0);
+
+            int t, N4D = RO*E1*E2*CHA;
+
+            size_t N = (*dimY)[4];
+            size_t num = x.get_number_of_elements()/N4D;
+
+            // the loops below touch num*N blocks of y and r
+            GADGET_CHECK_RETURN_FALSE(y.get_number_of_elements() >= num*N*RO*E1*E2*CHA);
+
+            const T* pX = x.begin();
+            const T* pY = y.begin();
+            T* pR = r.begin();
+
+            // only reallocate r when it is a separate buffer; in-place calls keep y's storage
+            if ( (pR!=pY) && (!r.dimensions_equal(dimY.get())) )
+            {
+                r.create(dimY);
+                pR = r.begin();
+            }
+
+            std::vector<size_t> dim4D(4);
+            dim4D[0] = RO;
+            dim4D[1] = E1;
+            dim4D[2] = E2;
+            dim4D[3] = CHA;
+
+            #pragma omp parallel for default(none) private(t) shared(N4D, dim4D, pX, pY, pR, num, n, N, copyY2R)
+            for ( t=0; t<(int)num; t++ )
+            {
+                hoNDArray<T> tmpX, tmpY, tmpR;
+                tmpX.create(&dim4D, const_cast<T*>(pX+(size_t)t*N4D));
+
+                for ( size_t z=0; z<N; z++ )
+                {
+                    // offsets are computed in size_t so they cannot overflow int on large arrays
+                    size_t offset = ((size_t)t*N + z)*N4D;
+
+                    if ( z != n )
+                    {
+                        tmpY.create(&dim4D, const_cast<T*>(pY+offset));
+                        tmpR.create(&dim4D, pR+offset);
+                        multiply(tmpX, tmpY, tmpR);
+                    }
+                    else if ( (pR != pY) && copyY2R )
+                    {
+                        // skipped block: carry the untouched values of y over to r
+                        memcpy(pR+offset, pY+offset, sizeof(T)*N4D);
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in multiplyOver5thDimensionExcept(const hoNDArray<T>& x, const hoNDArray<T>& y, size_t n, hoNDArray<T>& r, bool copyY2R) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Adds x to every consecutive chunk of y that has the length of x:
+    // r(chunk n) = x + y(chunk n), for n in [0, numel(y)/numel(x)).
+    //
+    // x : the array that is added repeatedly; must not be empty
+    // y : the array that is processed chunk by chunk
+    // r : output; assigned a copy of y first when its element count differs from y's
+    // Supported element types are float, double, GT_Complex8 and GT_Complex16, each
+    // dispatched to the matching v?Add routine. Returns false for any other type, when
+    // x is empty, or when vmlGetErrStatus() reports a non-zero status.
+    template <typename T>
+    bool multipleAdd(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()<=y.get_number_of_elements());
+
+        // Nx is used as a divisor below; an empty x would divide by zero
+        GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements()>0);
+
+        if ( r.get_number_of_elements()!=y.get_number_of_elements())
+        {
+            r = y;
+        }
+
+        int Nx = x.get_number_of_elements();
+        int N = y.get_number_of_elements() / Nx;
+
+        int n;
+
+        if ( typeid(T)==typeid(float) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vsAdd(x.get_number_of_elements(), reinterpret_cast<const float*>(x.begin()), reinterpret_cast<const float*>(y.begin()+n*Nx), reinterpret_cast<float*>(r.begin()+n*Nx));
+            }
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vdAdd(x.get_number_of_elements(), reinterpret_cast<const double*>(x.begin()), reinterpret_cast<const double*>(y.begin()+n*Nx), reinterpret_cast<double*>(r.begin()+n*Nx));
+            }
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vcAdd(x.get_number_of_elements(), reinterpret_cast<const MKL_Complex8*>(x.begin()), reinterpret_cast<const MKL_Complex8*>(y.begin()+n*Nx), reinterpret_cast<MKL_Complex8*>(r.begin()+n*Nx));
+            }
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vzAdd(x.get_number_of_elements(), reinterpret_cast<const MKL_Complex16*>(x.begin()), reinterpret_cast<const MKL_Complex16*>(y.begin()+n*Nx), reinterpret_cast<MKL_Complex16*>(r.begin()+n*Nx));
+            }
+        }
+        else
+        {
+            GADGET_ERROR_MSG("multipleAdd : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Multiplies every consecutive chunk of y that has the length of x by x, element-wise:
+    // r(chunk n) = x .* y(chunk n), for n in [0, numel(y)/numel(x)).
+    //
+    // x : the array that is applied repeatedly; must not be empty
+    // y : the array that is processed chunk by chunk
+    // r : output; assigned a copy of y first when its element count differs from y's
+    // Supported element types are float, double, GT_Complex8 and GT_Complex16, each
+    // dispatched to the matching v?Mul routine. Returns false for any other type, when
+    // x is empty, or when vmlGetErrStatus() reports a non-zero status.
+    template <typename T>
+    bool multipleMultiply(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r)
+    {
+        GADGET_DEBUG_CHECK_RETURN_FALSE(x.get_number_of_elements()<=y.get_number_of_elements());
+
+        // Nx is used as a divisor below; an empty x would divide by zero
+        GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements()>0);
+
+        if ( r.get_number_of_elements()!=y.get_number_of_elements())
+        {
+            r = y;
+        }
+
+        int Nx = x.get_number_of_elements();
+        int N = y.get_number_of_elements() / Nx;
+
+        int n;
+
+        if ( typeid(T)==typeid(float) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vsMul(x.get_number_of_elements(), reinterpret_cast<const float*>(x.begin()), reinterpret_cast<const float*>(y.begin()+n*Nx), reinterpret_cast<float*>(r.begin()+n*Nx));
+            }
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vdMul(x.get_number_of_elements(), reinterpret_cast<const double*>(x.begin()), reinterpret_cast<const double*>(y.begin()+n*Nx), reinterpret_cast<double*>(r.begin()+n*Nx));
+            }
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vcMul(x.get_number_of_elements(), reinterpret_cast<const MKL_Complex8*>(x.begin()), reinterpret_cast<const MKL_Complex8*>(y.begin()+n*Nx), reinterpret_cast<MKL_Complex8*>(r.begin()+n*Nx));
+            }
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(Nx, N)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(x, y, r, Nx, N)
+            #endif
+            for ( n=0; n<N; n++ )
+            {
+                vzMul(x.get_number_of_elements(), reinterpret_cast<const MKL_Complex16*>(x.begin()), reinterpret_cast<const MKL_Complex16*>(y.begin()+n*Nx), reinterpret_cast<MKL_Complex16*>(r.begin()+n*Nx));
+            }
+        }
+        else
+        {
+            GADGET_ERROR_MSG("multipleMultiply : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(vmlGetErrStatus()==0);
+
+        return true;
+    }
+
+    // Copies a rectangular sub-block of x (up to 10 dimensions) into r.
+    //
+    // x       : source array
+    // r       : output; created with the dimensions given in size
+    // startND : first index of the block in x, one entry per cropped dimension
+    // size    : extent of the block per dimension; must have as many entries as startND
+    // Dimensions beyond startND.size() use start = end = 0.
+    // When the block has as many elements as x, r simply becomes a copy of x.
+    // Returns false when the argument sizes disagree, more than 10 dimensions are
+    // requested, or the block exceeds x in some dimension.
+    template <typename T>
+    bool cropUpTo10DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size)
+    {
+        GADGET_CHECK_RETURN_FALSE( startND.size() == size.size() );
+        GADGET_CHECK_RETURN_FALSE( startND.size() <= 10 );
+
+        r.create(&size);
+        if ( r.get_number_of_elements() == x.get_number_of_elements() )
+        {
+            r = x;
+            return true;
+        }
+
+        std::vector<size_t> start(10, 0);
+        std::vector<size_t> end(10, 0);
+
+        size_t ii;
+        for ( ii=0; ii<startND.size(); ii++ )
+        {
+            start[ii] = startND[ii];
+            end[ii] = start[ii] + size[ii] - 1;
+            GADGET_CHECK_RETURN_FALSE(end[ii] < x.get_size(ii));
+        }
+
+        // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+        size_t e1, cha, n, s, con, phs, rep, set, seg;
+
+        std::vector<size_t> srcInd(10), dstInd(10);
+
+        // the first dimension is contiguous, so one memcpy moves a whole row of the block
+        size_t rowBytes = sizeof(T)*(end[0]-start[0]+1);
+
+        for ( seg=start[9]; seg<=end[9]; seg++ )
+        {
+            srcInd[9] = seg; dstInd[9] = seg-start[9];
+
+            for ( set=start[8]; set<=end[8]; set++ )
+            {
+                srcInd[8] = set; dstInd[8] = set-start[8];
+
+                for ( rep=start[7]; rep<=end[7]; rep++ )
+                {
+                    srcInd[7] = rep; dstInd[7] = rep-start[7];
+
+                    for ( phs=start[6]; phs<=end[6]; phs++ )
+                    {
+                        srcInd[6] = phs; dstInd[6] = phs-start[6];
+
+                        for ( con=start[5]; con<=end[5]; con++ )
+                        {
+                            srcInd[5] = con; dstInd[5] = con-start[5];
+
+                            for ( s=start[4]; s<=end[4]; s++ )
+                            {
+                                srcInd[4] = s; dstInd[4] = s-start[4];
+
+                                for ( n=start[3]; n<=end[3]; n++ )
+                                {
+                                    srcInd[3] = n; dstInd[3] = n-start[3];
+
+                                    for ( cha=start[2]; cha<=end[2]; cha++ )
+                                    {
+                                        srcInd[2] = cha; dstInd[2] = cha-start[2];
+
+                                        for ( e1=start[1]; e1<=end[1]; e1++ )
+                                        {
+                                            srcInd[1] = e1; dstInd[1] = e1-start[1];
+
+                                            srcInd[0] = start[0];
+                                            dstInd[0] = 0;
+
+                                            // keep offsets in size_t; int would truncate on large arrays
+                                            size_t offsetSrc = x.calculate_offset(srcInd);
+                                            size_t offsetDst = r.calculate_offset(dstInd);
+
+                                            memcpy(r.begin()+offsetDst, x.begin()+offsetSrc, rowBytes);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    // Writes x into a rectangular sub-block of r (up to 10 dimensions).
+    //
+    // x       : source block; must be at least as large as size in every listed dimension
+    // r       : destination array, modified in place
+    // startND : first index of the block in r, one entry per dimension
+    // size    : extent of the block per dimension; must have as many entries as startND
+    // Dimensions beyond startND.size() use start = end = 0.
+    // When r and x have the same number of elements, r simply becomes a copy of x.
+    // Returns false when the argument sizes disagree, more than 10 dimensions are
+    // requested, the block exceeds r, or x is smaller than the block.
+    template <typename T>
+    bool setSubArrayUpTo10DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size)
+    {
+        GADGET_CHECK_RETURN_FALSE( startND.size() == size.size() );
+        GADGET_CHECK_RETURN_FALSE( startND.size() <= 10 );
+
+        if ( r.get_number_of_elements() == x.get_number_of_elements() )
+        {
+            r = x;
+            return true;
+        }
+
+        std::vector<size_t> start(10, 0);
+        std::vector<size_t> end(10, 0);
+
+        size_t ii;
+        for ( ii=0; ii<startND.size(); ii++ )
+        {
+            start[ii] = startND[ii];
+            end[ii] = start[ii] + size[ii] - 1;
+            GADGET_CHECK_RETURN_FALSE(end[ii] < r.get_size(ii));
+            // the copy below indexes x up to size[ii]-1, so x must cover the block
+            GADGET_CHECK_RETURN_FALSE(size[ii] <= x.get_size(ii));
+        }
+
+        // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+        size_t e1, cha, n, s, con, phs, rep, set, seg;
+
+        std::vector<size_t> srcInd(10), dstInd(10);
+
+        // the first dimension is contiguous, so one memcpy moves a whole row of the block
+        size_t rowBytes = sizeof(T)*(end[0]-start[0]+1);
+
+        for ( seg=start[9]; seg<=end[9]; seg++ )
+        {
+            dstInd[9] = seg; srcInd[9] = seg-start[9];
+
+            for ( set=start[8]; set<=end[8]; set++ )
+            {
+                dstInd[8] = set; srcInd[8] = set-start[8];
+
+                for ( rep=start[7]; rep<=end[7]; rep++ )
+                {
+                    dstInd[7] = rep; srcInd[7] = rep-start[7];
+
+                    for ( phs=start[6]; phs<=end[6]; phs++ )
+                    {
+                        dstInd[6] = phs; srcInd[6] = phs-start[6];
+
+                        for ( con=start[5]; con<=end[5]; con++ )
+                        {
+                            dstInd[5] = con; srcInd[5] = con-start[5];
+
+                            for ( s=start[4]; s<=end[4]; s++ )
+                            {
+                                dstInd[4] = s; srcInd[4] = s-start[4];
+
+                                for ( n=start[3]; n<=end[3]; n++ )
+                                {
+                                    dstInd[3] = n; srcInd[3] = n-start[3];
+
+                                    for ( cha=start[2]; cha<=end[2]; cha++ )
+                                    {
+                                        dstInd[2] = cha; srcInd[2] = cha-start[2];
+
+                                        for ( e1=start[1]; e1<=end[1]; e1++ )
+                                        {
+                                            dstInd[1] = e1; srcInd[1] = e1-start[1];
+
+                                            dstInd[0] = start[0];
+                                            srcInd[0] = 0;
+
+                                            // keep offsets in size_t; int would truncate on large arrays
+                                            size_t offsetSrc = x.calculate_offset(srcInd);
+                                            size_t offsetDst = r.calculate_offset(dstInd);
+
+                                            memcpy(r.begin()+offsetDst, x.begin()+offsetSrc, rowBytes);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    // Computes the standard deviation of x along its 3rd dimension.
+    //
+    // x         : input with at least 3 dimensions, read as num blocks of [RO E1 CHA]
+    // std       : output; created with the dimensions of x minus the 3rd one
+    // NMinusOne : if true the squared deviations are divided by CHA-1, otherwise by CHA
+    // For every (ro, e1) of every block: mean over CHA, then sqrt(sum(|x-mean|^2)/S).
+    // Returns false if x has fewer than 3 dimensions, is empty in its first three
+    // dimensions, or an exception is caught.
+    template<typename T>
+    bool stdOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& std, bool NMinusOne)
+    {
+        try
+        {
+            // checked in every build: dimStd.erase(begin()+2) below is invalid for fewer than 3 dimensions
+            GADGET_CHECK_RETURN_FALSE(x.get_number_of_dimensions() >= 3);
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+
+            // RO*E1*CHA is used as a divisor below
+            GADGET_CHECK_RETURN_FALSE(RO*E1*CHA>0);
+
+            int num = (int)(x.get_number_of_elements() / (RO*E1*CHA));
+
+            boost::shared_ptr< std::vector<size_t> > dim = x.get_dimensions();
+
+            std::vector<size_t> dimStd(*dim);
+            dimStd.erase(dimStd.begin()+2);
+            std.create(&dimStd);
+
+            // divisor of the variance
+            T S(CHA);
+            if ( NMinusOne )
+            {
+                S = T(CHA-1);
+            }
+
+            // weight used to accumulate the mean
+            T S3 = T(1.0)/T(CHA);
+
+            int n;
+
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel for default(none) private(n) shared(num, RO, E1, CHA, S, S3)
+            #else
+                #pragma omp parallel for default(none) private(n) shared(num, RO, E1, CHA, x, std, S, S3)
+            #endif
+            for ( n=0; n<num; n++ )
+            {
+                hoNDArray<T> xTmp(RO, E1, CHA, const_cast<T*>(x.begin()+n*RO*E1*CHA));
+                hoNDArray<T> mean(RO, E1);
+
+                // mean is accumulated with +=, so start from zero explicitly rather than
+                // relying on what the hoNDArray constructor leaves in the buffer
+                memset(mean.begin(), 0, mean.get_number_of_bytes());
+
+                size_t ro, e1, cha;
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    for ( e1=0; e1<E1; e1++ )
+                    {
+                        for ( ro=0; ro<RO; ro++ )
+                        {
+                            mean(ro+e1*RO) += xTmp(cha*RO*E1+e1*RO+ro)*S3;
+                        }
+                    }
+                }
+
+                for ( e1=0; e1<E1; e1++ )
+                {
+                    for ( ro=0; ro<RO; ro++ )
+                    {
+                        size_t ind = e1*RO+ro;
+
+                        // accumulators are local to the iteration; sharing them between
+                        // OpenMP threads would be a data race
+                        T v(0), v1(0);
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            v1 = std::abs(xTmp(cha*RO*E1+ind)-mean(ind));
+                            v += v1*v1;
+                        }
+
+                        v /= S;
+                        std(ind+n*RO*E1) = std::sqrt(v);
+                    }
+                }
+            }
+        }
+        catch(...)
+        {
+            GADGET_ERROR_MSG("Errors in stdOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& std, bool NMinusOne) ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    /*template<typename T> 
+    bool permuteLastTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+            size_t NDim = dimX->size();
+
+            if ( NDim <= 2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t E1 = x.get_size(NDim-2);
+            size_t E2 = x.get_size(NDim-1);
+
+            std::vector<size_t> dimR(*dimX);
+            dimR[NDim-2] = E2;
+            dimR[NDim-1] = E1;
+
+            r.create(&dimR);
+
+            size_t N = x.get_number_of_elements()/E1/E2;
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            int e2;
+
+            #pragma omp parallel for default(none) private(e2) shared(E2, E1, pR, pX, N)
+            for ( e2=0; e2<(int)E2; e2++ )
+            {
+                for ( size_t e1=0; e1<E1; e1++ )
+                {
+                    memcpy(pR+e1*N*E2+e2*N, pX+e2*N*E1+e1*N, sizeof(T)*N);
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteLastTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }*/
+
+    // Extracts the index range [start, end] of the 3rd dimension of x into r.
+    //
+    // x     : input array
+    // r     : output; same dimensions as x except that the 3rd size is end-start+1
+    // start : first index kept along the 3rd dimension
+    // end   : last index kept along the 3rd dimension (inclusive)
+    // r becomes a plain copy of x when x has at most 2 dimensions or when the
+    // requested range is at least as long as the 3rd dimension.
+    // Returns false if the range reaches past the 3rd dimension or an exception is caught.
+    template<typename T>
+    bool cropOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+            size_t NDim = dimX->size();
+
+            if ( NDim <= 2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            size_t E2_R = end-start+1;
+
+            if ( E2 <= E2_R )
+            {
+                r = x;
+                return true;
+            }
+
+            // slices up to index end are read from x below; reject ranges past the 3rd dimension
+            GADGET_CHECK_RETURN_FALSE(end < E2);
+
+            std::vector<size_t> dimR(*dimX);
+            dimR[2] = E2_R;
+
+            r.create(&dimR);
+
+            size_t N2D = RO*E1;
+            size_t N3D = RO*E1*E2;
+            size_t N3D_R = RO*E1*E2_R;
+
+            size_t N = x.get_number_of_elements()/N3D;
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            size_t n;
+            for ( n=0; n<N; n++ )
+            {
+                // every kept 2D slice is contiguous and copied as one chunk
+                int e2;
+                #pragma omp parallel for default(none) private(e2) shared(N2D, N3D, N3D_R, pX, pR, RO, E1, E2, n, start, end)
+                for ( e2=start; e2<=end; e2++ )
+                {
+                    memcpy(pR+n*N3D_R+(e2-start)*N2D, pX+n*N3D+e2*N2D, sizeof(T)*N2D);
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in cropOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Writes x into the index range [start, end] of the 3rd dimension of r.
+    //
+    // x     : source; its 3rd size must equal end-start+1
+    // r     : destination, modified in place
+    // start : first index written along the 3rd dimension of r
+    // end   : last index written along the 3rd dimension of r (inclusive)
+    // r becomes a plain copy of x when r has at most 2 dimensions or when the
+    // range is at least as long as the 3rd dimension of r.
+    // Returns false if the range reaches past the 3rd dimension of r, x holds
+    // fewer elements than are copied, or an exception is caught.
+    template<typename T> bool setSubArrayOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimR = r.get_dimensions();
+
+            size_t NDim = dimR->size();
+
+            if ( NDim <= 2 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = r.get_size(0);
+            size_t E1 = r.get_size(1);
+            size_t E2 = r.get_size(2);
+
+            size_t E2_X = end-start+1;
+            GADGET_CHECK_RETURN_FALSE( E2_X == x.get_size(2) );
+
+            if ( E2_X >= E2 )
+            {
+                r = x;
+                return true;
+            }
+
+            // slices up to index end are written into r below; reject ranges past its 3rd dimension
+            GADGET_CHECK_RETURN_FALSE(end < E2);
+
+            size_t N2D = RO*E1;
+            size_t N3D = RO*E1*E2;
+            size_t N3D_X = RO*E1*E2_X;
+
+            size_t N = r.get_number_of_elements()/N3D;
+
+            // N blocks of N3D_X elements are read from x below
+            GADGET_CHECK_RETURN_FALSE(x.get_number_of_elements() >= N*N3D_X);
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            size_t n;
+            for ( n=0; n<N; n++ )
+            {
+                // every 2D slice is contiguous and copied as one chunk
+                int e2;
+                #pragma omp parallel for default(none) private(e2) shared(N2D, N3D, N3D_X, pX, pR, RO, E1, E2, n, start, end)
+                for ( e2=start; e2<=end; e2++ )
+                {
+                    memcpy(pR+n*N3D+e2*N2D, pX+n*N3D_X+(e2-start)*N2D, sizeof(T)*N2D);
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in setSubArrayOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Reorders x from [RO E1 CHA SLC E2 ...] to [RO E1 E2 CHA SLC ...].
+    //
+    // x : input; the first five sizes are read as RO, E1, CHA, SLC, E2
+    // r : output; created with the permuted dimensions
+    // Arrays with fewer than 5 dimensions are copied to r unchanged.
+    // Returns false if an exception is caught, true otherwise.
+    template<typename T>
+    bool permuteE2To3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+            size_t NDim = dimX->size();
+
+            // an exactly 5D array already has an E2 dimension to move, so only
+            // arrays with fewer than 5 dimensions are passed through
+            if ( NDim < 5 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+            size_t SLC = x.get_size(3);
+            size_t E2 = x.get_size(4);
+
+            std::vector<size_t> dimR(*dimX);
+            dimR[2] = E2;
+            dimR[3] = CHA;
+            dimR[4] = SLC;
+
+            r.create(&dimR);
+
+            size_t N2D = RO*E1;
+            size_t N5D = RO*E1*CHA*E2*SLC;
+
+            size_t N = x.get_number_of_elements()/N5D;
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            size_t n;
+            for ( n=0; n<N; n++ )
+            {
+                int e2;
+                #pragma omp parallel for default(none) private(e2) shared(N5D, N2D, pX, pR, CHA, SLC, E2, n)
+                for ( e2=0; e2<E2; e2++ )
+                {
+                    for ( size_t slc=0; slc<SLC; slc++ )
+                    {
+                        for ( size_t cha=0; cha<CHA; cha++ )
+                        {
+                            // source slice (cha, slc, e2) goes to destination slice (e2, cha, slc)
+                            memcpy(pR+n*N5D+slc*CHA*E2*N2D+cha*E2*N2D+e2*N2D, pX+n*N5D+e2*SLC*CHA*N2D+slc*CHA*N2D+cha*N2D, sizeof(T)*N2D);
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteE2To3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Moves the third dimension of x to the fifth position: x is read as [RO E1 E2 CHA SLC ...]
+    // and r is created as [RO E1 CHA SLC E2 ...]. Inputs with fewer than five dimensions are
+    // assigned to r as they are. Returns false after logging when an exception is caught.
+    template<typename T>
+    bool permuteE2To5thDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > srcDims = x.get_dimensions();
+            if ( srcDims->size() < 5 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+            size_t CHA = x.get_size(3);
+            size_t SLC = x.get_size(4);
+
+            std::vector<size_t> dstDims(*srcDims);
+            dstDims[2] = CHA;
+            dstDims[3] = SLC;
+            dstDims[4] = E2;
+            r.create(&dstDims);
+
+            size_t planeLen = RO*E1;               // elements of one [RO E1] plane
+            size_t blockLen = RO*E1*CHA*E2*SLC;    // elements of one 5D block
+            size_t numBlocks = x.get_number_of_elements()/blockLen;
+
+            const T* src = x.begin();
+            T* dst = r.begin();
+
+            for ( size_t blk=0; blk<numBlocks; blk++ )
+            {
+                int e2;
+                #pragma omp parallel for default(none) private(e2) shared(blockLen, planeLen, src, dst, CHA, SLC, E2, blk)
+                for ( e2=0; e2<E2; e2++ )
+                {
+                    for ( size_t slc=0; slc<SLC; slc++ )
+                    {
+                        // planes of all channels for this (slc, e2) pair: contiguous in r, E2 planes apart in x
+                        T* dstPlane = dst + blk*blockLen + e2*SLC*CHA*planeLen + slc*CHA*planeLen;
+                        const T* srcPlane = src + blk*blockLen + slc*CHA*E2*planeLen + e2*planeLen;
+                        for ( size_t cha=0; cha<CHA; cha++ )
+                        {
+                            memcpy(dstPlane + cha*planeLen, srcPlane + cha*E2*planeLen, sizeof(T)*planeLen);
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteE2To5thDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Rotates the three leading dimensions of x so that the first one becomes the third:
+    // x is read as [RO E1 E2 ...] and r is created as [E1 E2 RO ...]. Arrays with fewer than
+    // three dimensions are assigned to r unchanged. Returns false after logging on exception.
+    template<typename T>
+    bool permuteROTo3rdDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > srcDims = x.get_dimensions();
+            if ( srcDims->size() < 3 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            std::vector<size_t> dstDims(*srcDims);
+            dstDims[0] = E1;
+            dstDims[1] = E2;
+            dstDims[2] = RO;
+            r.create(&dstDims);
+
+            size_t volLen = RO*E1*E2;
+            size_t numVols = x.get_number_of_elements()/volLen;
+
+            const T* src = x.begin();
+            T* dst = r.begin();
+
+            // every 3D volume is independent, so the volumes are spread over the threads
+            long long vol;
+            #pragma omp parallel for default(none) private(vol) shared(RO, E1, E2, numVols, dst, volLen, src)
+            for ( vol=0; vol<(long long)numVols; vol++ )
+            {
+                const T* in = src + vol*volLen;
+                T* out = dst + vol*volLen;
+
+                // the source is visited in storage order, so "in" simply advances by one element
+                for ( size_t e2=0; e2<E2; e2++ )
+                {
+                    for ( size_t e1=0; e1<E1; e1++ )
+                    {
+                        T* outLine = out + e1 + e2*E1;
+                        for ( size_t ro=0; ro<RO; ro++ )
+                        {
+                            outLine[ro*E1*E2] = *in++;
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteROTo3rdDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Moves the first dimension of x behind the fourth one: x is read as [RO E1 E2 CHA ...]
+    // and r is created as [E1 E2 CHA RO ...]. Arrays with fewer than four dimensions are
+    // assigned to r unchanged. Returns false after logging when an exception is caught.
+    template<typename T>
+    bool permuteROTo4thDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > srcDims = x.get_dimensions();
+            if ( srcDims->size() < 4 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+            size_t CHA = x.get_size(3);
+
+            std::vector<size_t> dstDims(*srcDims);
+            dstDims[0] = E1;
+            dstDims[1] = E2;
+            dstDims[2] = CHA;
+            dstDims[3] = RO;
+            r.create(&dstDims);
+
+            size_t blockLen = RO*E1*E2*CHA;
+            size_t numBlocks = x.get_number_of_elements()/blockLen;
+
+            const T* src = x.begin();
+            T* dst = r.begin();
+
+            for ( long long blk=0; blk<(long long)numBlocks; blk++ )
+            {
+                T* out = dst + blk*blockLen;
+                const T* in = src + blk*blockLen;
+                // the channels of one 4D block are independent and are spread over the threads
+                long long cha;
+                #pragma omp parallel for default(none) private(cha) shared(RO, E1, E2, CHA, in, out)
+                for ( cha=0; cha<(long long)CHA; cha++ )
+                {
+                    // the source of one channel is contiguous and is read in storage order
+                    const T* inCha = in + cha*RO*E1*E2;
+                    T* outCha = out + cha*E1*E2;
+                    for ( size_t e2=0; e2<E2; e2++ )
+                    {
+                        for ( size_t e1=0; e1<E1; e1++ )
+                        {
+                            T* outLine = outCha + e1 + e2*E1;
+                            for ( size_t ro=0; ro<RO; ro++ )
+                            {
+                                outLine[ro*E1*E2*CHA] = *inCha++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteROTo4thDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Moves the fourth dimension of x to the front: x is read as [E1 E2 CHA RO ...] and r is
+    // created as [RO E1 E2 CHA ...]. Arrays with fewer than four dimensions are assigned to r
+    // unchanged. Returns false after logging when an exception is caught.
+    template<typename T>
+    bool permuteROTo1stDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > srcDims = x.get_dimensions();
+            if ( srcDims->size() < 4 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t E1 = x.get_size(0);
+            size_t E2 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+            size_t RO = x.get_size(3);
+
+            std::vector<size_t> dstDims(*srcDims);
+            dstDims[0] = RO;
+            dstDims[1] = E1;
+            dstDims[2] = E2;
+            dstDims[3] = CHA;
+            r.create(&dstDims);
+
+            size_t blockLen = RO*E1*E2*CHA;
+            size_t numBlocks = x.get_number_of_elements()/blockLen;
+
+            const T* src = x.begin();
+            T* dst = r.begin();
+
+            for ( long long blk=0; blk<(long long)numBlocks; blk++ )
+            {
+                T* out = dst + blk*blockLen;
+                const T* in = src + blk*blockLen;
+
+                // the channels of one 4D block are independent and are spread over the threads
+                long long cha;
+                #pragma omp parallel for default(none) private(cha) shared(RO, E1, E2, CHA, in, out)
+                for ( cha=0; cha<(long long)CHA; cha++ )
+                {
+                    // the destination of one channel is contiguous and is written in storage order
+                    T* outCha = out + cha*RO*E1*E2;
+                    const T* inCha = in + cha*E1*E2;
+
+                    for ( size_t e2=0; e2<E2; e2++ )
+                    {
+                        for ( size_t e1=0; e1<E1; e1++ )
+                        {
+                            const T* inLine = inCha + e1 + e2*E1;
+                            for ( size_t ro=0; ro<RO; ro++ )
+                            {
+                                *outCha++ = inLine[ro*E1*E2*CHA];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteROTo1stDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Rotates the three leading dimensions of x so that the third one becomes the first:
+    // x is read as [RO E1 E2 ...] and r is created as [E2 RO E1 ...]. Arrays with fewer than
+    // three dimensions are assigned to r unchanged. Returns false after logging on exception.
+    template<typename T>
+    bool permute3rdDimensionTo1stDimension(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > srcDims = x.get_dimensions();
+            if ( srcDims->size() < 3 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+
+            std::vector<size_t> dstDims(*srcDims);
+            dstDims[0] = E2;
+            dstDims[1] = RO;
+            dstDims[2] = E1;
+            r.create(&dstDims);
+
+            size_t volLen = RO*E1*E2;
+            size_t numVols = x.get_number_of_elements()/volLen;
+
+            const T* src = x.begin();
+            T* dst = r.begin();
+
+            for ( long long vol=0; vol<(long long)numVols; vol++ )
+            {
+                T* out = dst + vol*volLen;
+                const T* in = src + vol*volLen;
+
+                // the E2 slabs of one volume are independent and are spread over the threads
+                long long e2;
+                #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, in, out)
+                for ( e2=0; e2<(long long)E2; e2++ )
+                {
+                    // one slab is contiguous in x and is read in storage order
+                    const T* inSlab = in + e2*RO*E1;
+                    for ( size_t e1=0; e1<E1; e1++ )
+                    {
+                        T* outLine = out + e2 + e1*E2*RO;
+                        for ( size_t ro=0; ro<RO; ro++ )
+                        {
+                            outLine[ro*E2] = *inSlab++;
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permute3rdDimensionTo1stDimension(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    /// Moves the first dimension of x to the fifth position: x is read as
+    /// [RO E1 E2 srcCHA dstCHA ...] and r is created as [E1 E2 srcCHA dstCHA RO ...].
+    /// Arrays with fewer than five dimensions are assigned to r unchanged.
+    /// Returns false after logging when an exception is caught, true otherwise.
+    template<typename T>
+    bool permuteROTo5thDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r)
+    {
+        try
+        {
+            boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+            size_t NDim = dimX->size();
+            if ( NDim < 5 )
+            {
+                r = x;
+                return true;
+            }
+
+            size_t RO = x.get_size(0);
+            size_t E1 = x.get_size(1);
+            size_t E2 = x.get_size(2);
+            size_t srcCHA = x.get_size(3);
+            size_t dstCHA = x.get_size(4);
+
+            std::vector<size_t> dimR(*dimX);
+            dimR[0] = E1;
+            dimR[1] = E2;
+            dimR[2] = srcCHA;
+            dimR[3] = dstCHA;
+            dimR[4] = RO;
+            r.create(&dimR);
+
+            size_t N5D = RO*E1*E2*srcCHA*dstCHA;
+            size_t N = x.get_number_of_elements()/N5D;
+
+            const T* pX = x.begin();
+            T* pR = r.begin();
+
+            for ( long long n=0; n<(long long)N; n++ )
+            {
+                T* pRn = pR + n*N5D;
+                const T* pXn = pX + n*N5D;
+
+                long long dcha;
+                #pragma omp parallel for default(none) private(dcha) shared(RO, E1, E2, srcCHA, dstCHA, pXn, pRn)
+                for ( dcha=0; dcha<(long long)dstCHA; dcha++ )
+                {
+                    // scha and srcCHA are both size_t, so they are compared directly; the former
+                    // (int) cast on srcCHA narrowed the bound before the unsigned comparison.
+                    for ( size_t scha=0; scha<srcCHA; scha++ )
+                    {
+                        for ( size_t e2=0; e2<E2; e2++ )
+                        {
+                            for ( size_t e1=0; e1<E1; e1++ )
+                            {
+                                size_t indRn = e1+e2*E1+scha*E1*E2+dcha*E1*E2*srcCHA;
+                                size_t indXn = e1*RO+e2*RO*E1+scha*RO*E1*E2+dcha*RO*E1*E2*srcCHA;
+                                for ( size_t ro=0; ro<RO; ro++ )
+                                {
+                                    pRn[indRn+ro*E1*E2*srcCHA*dstCHA] = pXn[ro+indXn];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in permuteROTo5thDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    // Image-domain unwrapping of one 2D multi-channel image.
+    //
+    // For each destination channel the source channels of x are multiplied element-wise with the
+    // matching kernel block and the products are summed over the source channels:
+    //     y(:,:,dCha) = sum over sCha of x(:,:,sCha) .* kernel(:,:,sCha,dCha)
+    //
+    // x is read as [ro e1 srcCHA]; kernel is addressed with the same strides and its fourth size is
+    // dstCHA. buf is scratch space, recreated as [ro e1 srcCHA] when it holds fewer elements. y is
+    // written through y.begin() and is not allocated here.
+    // Only GT_Complex8 (vcMul/vcAdd) and GT_Complex16 (vzMul/vzAdd) are processed; for any other T
+    // nothing is computed and true is still returned. A caught exception is logged and yields false.
+    template<typename T>
+    bool imageDomainUnwrapping2D(const hoNDArray<T>& x, const hoNDArray<T>& kernel, hoNDArray<T>& buf, hoNDArray<T>& y)
+    {
+        try
+        {
+            T* src = const_cast<T*>(x.begin());
+            T* kerBegin = const_cast<T*>(kernel.begin());
+            T* dst = y.begin();
+
+            size_t ro = x.get_size(0);
+            size_t e1 = x.get_size(1);
+            size_t srcCHA = x.get_size(2);
+            size_t dstCHA = kernel.get_size(3);
+
+            size_t imgLen = ro*e1;             // elements of one channel image
+            size_t stackLen = ro*e1*srcCHA;    // elements of all source channels together
+
+            if ( buf.get_number_of_elements() < stackLen )
+            {
+                buf.create(ro, e1, srcCHA);
+            }
+            T* scratch = buf.begin();
+
+            if ( typeid(T)==typeid(GT_Complex8) )
+            {
+                for ( size_t dCha=0; dCha<dstCHA; dCha++ )
+                {
+                    MKL_Complex8* out = reinterpret_cast<MKL_Complex8*>(dst + dCha*imgLen);
+                    // products of all source channels with the kernels of this destination channel
+                    vcMul(stackLen, reinterpret_cast<MKL_Complex8*>(src),
+                        reinterpret_cast<MKL_Complex8*>(kerBegin + dCha*stackLen),
+                        reinterpret_cast<MKL_Complex8*>(scratch));
+
+                    // start the sum with source channel 0, then add the remaining channels
+                    memcpy(out, scratch, sizeof(T)*imgLen);
+                    for ( size_t sCha=1; sCha<srcCHA; sCha++ )
+                    {
+                        vcAdd(imgLen, out, reinterpret_cast<MKL_Complex8*>(scratch + sCha*imgLen), out);
+                    }
+                }
+            }
+            else if ( typeid(T)==typeid(GT_Complex16) )
+            {
+                for ( size_t dCha=0; dCha<dstCHA; dCha++ )
+                {
+                    MKL_Complex16* out = reinterpret_cast<MKL_Complex16*>(dst + dCha*imgLen);
+                    vzMul(stackLen, reinterpret_cast<MKL_Complex16*>(src),
+                        reinterpret_cast<MKL_Complex16*>(kerBegin + dCha*stackLen),
+                        reinterpret_cast<MKL_Complex16*>(scratch));
+
+                    memcpy(out, scratch, sizeof(T)*imgLen);
+                    for ( size_t sCha=1; sCha<srcCHA; sCha++ )
+                    {
+                        vzAdd(imgLen, out, reinterpret_cast<MKL_Complex16*>(scratch + sCha*imgLen), out);
+                    }
+                }
+            }
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in imageDomainUnwrapping2D(const hoNDArray<T>& x, const hoNDArray<T>& ker, hoNDArray<T>& buf, hoNDArray<T>& y) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    /// Image-domain unwrapping of a 2D+T series, processed frame by frame.
+    ///
+    /// For every frame n and destination channel dCha the source channels of x are multiplied
+    /// element-wise with the matching kernel block and the products are summed over the source
+    /// channels:
+    ///     y(:,:,dCha,n) = sum over sCha of x(:,:,sCha,n) .* kernel(:,:,sCha,dCha,k)
+    /// with k = n while n < kerN and k = kerN-1 afterwards, i.e. the last kernel is reused when
+    /// there are more frames than kernels.
+    ///
+    /// @param x      input, read as [ro e1 srcCHA N]
+    /// @param kernel kernels, read as [ro e1 srcCHA dstCHA kerN]; the first three sizes are taken
+    ///               from x, only dstCHA and kerN come from kernel itself
+    /// @param buf    scratch space; recreated as [ro e1 srcCHA] when it holds fewer elements
+    /// @param y      output, written as [ro e1 dstCHA N] through y.begin(); it is not allocated
+    ///               here, so the caller has to provide enough room
+    /// @return       false when an exception was caught (it is logged), true otherwise
+    ///
+    /// Only GT_Complex8 (MKL vcMul/vcAdd) and GT_Complex16 (MKL vzMul/vzAdd) are processed; for any
+    /// other T nothing is computed and true is still returned.
+    ///
+    /// The single-precision path additionally calls omp_set_nested(1), which changes the OpenMP
+    /// nesting setting beyond this call.
+    template<typename T>
+    bool imageDomainUnwrapping2DT(const hoNDArray<T>& x, const hoNDArray<T>& kernel, hoNDArray<T>& buf, hoNDArray<T>& y)
+    {
+        try
+        {
+            size_t ro = x.get_size(0);
+            size_t e1 = x.get_size(1);
+            size_t srcCHA = x.get_size(2);
+            size_t N = x.get_size(3);
+
+            size_t dstCHA = kernel.get_size(3);
+            size_t kerN = kernel.get_size(4);
+
+            // element counts of one channel image, one source-channel stack and one kernel set
+            size_t imgLen = ro*e1;
+            size_t srcLen = ro*e1*srcCHA;
+            size_t kerLen = ro*e1*srcCHA*dstCHA;
+
+            if ( buf.get_number_of_elements() < srcLen )
+            {
+                buf.create(ro, e1, srcCHA);
+            }
+
+            // One [ro e1 srcCHA] scratch block is enough because the frames are handled one after
+            // the other and each product is summed into y before the next one is computed. The
+            // single-precision path used to offset the scratch pointer by n*ro*e1*srcCHA, which
+            // ran past the end of buf for every frame n >= 1 whenever buf held only the
+            // ro*e1*srcCHA elements guaranteed above.
+            T* pBuf = buf.begin();
+
+            const T* pXN = x.begin();
+            const T* pKerN = kernel.begin();
+            T* pYN = y.begin();
+
+            long long n;
+            long long dCha;
+
+            if ( typeid(T)==typeid(GT_Complex8) )
+            {
+                // NOTE(review): no parallel region is opened in this function, so this call only
+                // affects other code; it is kept so that such code behaves as before - confirm
+                // whether it can be dropped.
+                omp_set_nested(1);
+
+                for ( dCha=0; dCha<(long long)dstCHA; dCha++ )
+                {
+                    for ( n=0; n<(long long)N; n++ )
+                    {
+                        // frames beyond the last kernel reuse the last kernel
+                        // NOTE(review): kerN-1 would wrap around for kerN == 0; kernel.get_size(4)
+                        // is presumably never 0 - confirm.
+                        const T* ker = pKerN + n*kerLen;
+                        if ( kerN <= (size_t)n )
+                        {
+                            ker = pKerN + (kerN-1)*kerLen;
+                        }
+
+                        const T* pX = pXN + n*srcLen;
+                        T* pY = pYN + n*imgLen*dstCHA + dCha*imgLen;
+
+                        // products of all source channels with the kernels of this destination channel
+                        vcMul(srcLen, reinterpret_cast<const MKL_Complex8*>(pX),
+                            reinterpret_cast<const MKL_Complex8*>(ker+dCha*srcLen),
+                            reinterpret_cast<MKL_Complex8*>(pBuf));
+
+                        // start the sum with source channel 0, then add the remaining channels
+                        memcpy(pY, pBuf, sizeof(T)*imgLen);
+                        for ( size_t sCha=1; sCha<srcCHA; sCha++ )
+                        {
+                            vcAdd(imgLen, reinterpret_cast<MKL_Complex8*>(pY),
+                                reinterpret_cast<MKL_Complex8*>(pBuf+sCha*imgLen),
+                                reinterpret_cast<MKL_Complex8*>(pY));
+                        }
+                    }
+                }
+            }
+            else if ( typeid(T)==typeid(GT_Complex16) )
+            {
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    // frames beyond the last kernel reuse the last kernel
+                    const T* ker = pKerN + n*kerLen;
+                    if ( kerN <= (size_t)n )
+                    {
+                        ker = pKerN + (kerN-1)*kerLen;
+                    }
+
+                    const T* pX = pXN + n*srcLen;
+
+                    for ( dCha=0; dCha<(long long)dstCHA; dCha++ )
+                    {
+                        T* pY = pYN + n*imgLen*dstCHA + dCha*imgLen;
+
+                        // products of all source channels with the kernels of this destination channel
+                        vzMul(srcLen, reinterpret_cast<const MKL_Complex16*>(pX),
+                            reinterpret_cast<const MKL_Complex16*>(ker+dCha*srcLen),
+                            reinterpret_cast<MKL_Complex16*>(pBuf));
+
+                        // start the sum with source channel 0, then add the remaining channels
+                        memcpy(pY, pBuf, sizeof(T)*imgLen);
+                        for ( size_t sCha=1; sCha<srcCHA; sCha++ )
+                        {
+                            vzAdd(imgLen, reinterpret_cast<MKL_Complex16*>(pY),
+                                reinterpret_cast<MKL_Complex16*>(pBuf+sCha*imgLen),
+                                reinterpret_cast<MKL_Complex16*>(pY));
+                        }
+                    }
+                }
+            }
+
+            // NOTE(review): any other T falls through here without touching y and still reports
+            // success - confirm whether that should be treated as an error.
+        }
+        catch (...)
+        {
+            GADGET_ERROR_MSG("Errors in imageDomainUnwrapping2DT(const hoNDArray<T>& x, const hoNDArray<T>& ker, hoNDArray<T>& buf, hoNDArray<T>& y) ... ");
+            return false;
+        }
+        return true;
+    }
+
+    template EXPORTCPUCOREMATH bool sumOverLastDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOverLastDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOverLastDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOverLastDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool sumOverSecondLastDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOverSecondLastDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOverSecondLastDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOverSecondLastDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool multiplyOverLastDimension(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool multiplyOverLastDimension(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool multiplyOverLastDimension(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool multiplyOverLastDimension(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool divideOverLastDimension(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool divideOverLastDimension(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool divideOverLastDimension(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool divideOverLastDimension(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool sumOver1stDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOver1stDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOver1stDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOver1stDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool sumOver2ndDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOver2ndDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOver2ndDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOver2ndDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool sumOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOver3rdDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOver3rdDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool sumOver4thDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOver4thDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOver4thDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOver4thDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool sumOver5thDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool sumOver5thDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool sumOver5thDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool sumOver5thDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool multiplyOver3rdDimension(const hoNDArray<float>& x3D, const hoNDArray<float>& y4D, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver3rdDimension(const hoNDArray<double>& x3D, const hoNDArray<double>& y4D, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver3rdDimension(const hoNDArray<GT_Complex8>& x3D, const hoNDArray<GT_Complex8>& y4D, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver3rdDimension(const hoNDArray<GT_Complex16>& x3D, const hoNDArray<GT_Complex16>& y4D, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimension(const hoNDArray<float>& x4D, const hoNDArray<float>& y5D, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimension(const hoNDArray<double>& x4D, const hoNDArray<double>& y5D, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimension(const hoNDArray<GT_Complex8>& x4D, const hoNDArray<GT_Complex8>& y5D, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimension(const hoNDArray<GT_Complex16>& x4D, const hoNDArray<GT_Complex16>& y5D, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimensionExcept(const hoNDArray<float>& x4D, const hoNDArray<float>& y5D, size_t n, hoNDArray<float>& r, bool copyY2R);
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimensionExcept(const hoNDArray<double>& x4D, const hoNDArray<double>& y5D, size_t n, hoNDArray<double>& r, bool copyY2R);
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimensionExcept(const hoNDArray<GT_Complex8>& x4D, const hoNDArray<GT_Complex8>& y5D, size_t n, hoNDArray<GT_Complex8>& r, bool copyY2R);
+    template EXPORTCPUCOREMATH bool multiplyOver4thDimensionExcept(const hoNDArray<GT_Complex16>& x4D, const hoNDArray<GT_Complex16>& y5D, size_t n, hoNDArray<GT_Complex16>& r, bool copyY2R);
+
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimension(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimension(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimension(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimension(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimensionExcept(const hoNDArray<float>& x, const hoNDArray<float>& y, size_t n, hoNDArray<float>& r, bool copyY2R);
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimensionExcept(const hoNDArray<double>& x, const hoNDArray<double>& y, size_t n, hoNDArray<double>& r, bool copyY2R);
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimensionExcept(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, size_t n, hoNDArray<GT_Complex8>& r, bool copyY2R);
+    template EXPORTCPUCOREMATH bool multiplyOver5thDimensionExcept(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, size_t n, hoNDArray<GT_Complex16>& r, bool copyY2R);
+
+    template EXPORTCPUCOREMATH bool multipleAdd(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool multipleAdd(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool multipleAdd(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool multipleAdd(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool multipleMultiply(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool multipleMultiply(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool multipleMultiply(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool multipleMultiply(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<short>& x, hoNDArray<short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<float>& x, hoNDArray<float>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<double>& x, hoNDArray<double>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+
+    template EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<short>& x, hoNDArray<short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<float>& x, hoNDArray<float>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<double>& x, hoNDArray<double>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+    template EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r, const std::vector<size_t>& start, std::vector<size_t>& size);
+
+    template EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<short>& x, hoNDArray<short>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r, size_t start, size_t end);
+
+    template EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<short>& x, hoNDArray<short>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<unsigned short>& x, hoNDArray<unsigned short>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r, size_t start, size_t end);
+    template EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r, size_t start, size_t end);
+
+    template EXPORTCPUCOREMATH bool stdOver3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& std, bool NMinusOne);
+    template EXPORTCPUCOREMATH bool stdOver3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& std, bool NMinusOne);
+    template EXPORTCPUCOREMATH bool stdOver3rdDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& std, bool NMinusOne);
+    template EXPORTCPUCOREMATH bool stdOver3rdDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& std, bool NMinusOne);
+
+    //template EXPORTCPUCOREMATH bool permuteLastTwoDimensions(const hoNDArray<float>& x, hoNDArray<float>& r);
+    //template EXPORTCPUCOREMATH bool permuteLastTwoDimensions(const hoNDArray<double>& x, hoNDArray<double>& r);
+    //template EXPORTCPUCOREMATH bool permuteLastTwoDimensions(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    //template EXPORTCPUCOREMATH bool permuteLastTwoDimensions(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permuteE2To3rdDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permuteE2To3rdDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permuteE2To3rdDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permuteE2To3rdDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permuteE2To5thDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permuteE2To5thDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permuteE2To5thDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permuteE2To5thDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permuteROTo3rdDimensionFor3DRecon(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo3rdDimensionFor3DRecon(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo3rdDimensionFor3DRecon(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo3rdDimensionFor3DRecon(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permuteROTo4thDimensionFor3DRecon(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo4thDimensionFor3DRecon(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo4thDimensionFor3DRecon(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo4thDimensionFor3DRecon(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permuteROTo1stDimensionFor3DRecon(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo1stDimensionFor3DRecon(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo1stDimensionFor3DRecon(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo1stDimensionFor3DRecon(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permute3rdDimensionTo1stDimension(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permute3rdDimensionTo1stDimension(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permute3rdDimensionTo1stDimension(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permute3rdDimensionTo1stDimension(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool permuteROTo5thDimensionFor3DRecon(const hoNDArray<float>& x, hoNDArray<float>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo5thDimensionFor3DRecon(const hoNDArray<double>& x, hoNDArray<double>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo5thDimensionFor3DRecon(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    template EXPORTCPUCOREMATH bool permuteROTo5thDimensionFor3DRecon(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template EXPORTCPUCOREMATH bool imageDomainUnwrapping2D(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& ker, hoNDArray<GT_Complex8>& buf, hoNDArray<GT_Complex8>& y);
+    template EXPORTCPUCOREMATH bool imageDomainUnwrapping2D(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& ker, hoNDArray<GT_Complex16>& buf, hoNDArray<GT_Complex16>& y);
+
+    template EXPORTCPUCOREMATH bool imageDomainUnwrapping2DT(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& ker, hoNDArray<GT_Complex8>& buf, hoNDArray<GT_Complex8>& y);
+    template EXPORTCPUCOREMATH bool imageDomainUnwrapping2DT(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& ker, hoNDArray<GT_Complex16>& buf, hoNDArray<GT_Complex16>& y);
+
+    #endif // USE_MKL
+
+    //
+    // Instantiation
+    //
+
+    template EXPORTCPUCOREMATH void clear<short>( hoNDArray<short>& );
+    template EXPORTCPUCOREMATH void clear<unsigned short>( hoNDArray<unsigned short>& );
+    template EXPORTCPUCOREMATH void clear<int>( hoNDArray<int>& );
+    template EXPORTCPUCOREMATH void clear<size_t>( hoNDArray<size_t>& );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void abs_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs_square<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > sqrt<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void sqrt_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > square<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void square_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > reciprocal<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > reciprocal_sqrt<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > sgn<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void sgn_inplace<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void clear<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void clear<float>( hoNDArray<float>& );
+    template EXPORTCPUCOREMATH void fill<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void clamp<float>( hoNDArray<float>*, float, float );
+    template EXPORTCPUCOREMATH void clamp_min<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void clamp_max<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void normalize<float>( hoNDArray<float>*, float );
+    template EXPORTCPUCOREMATH void shrink1<float>( hoNDArray<float>*, float, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void pshrink<float>( hoNDArray<float>*, float,float, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void shrinkd<float> ( hoNDArray<float>*, hoNDArray<float>*, float, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH void pshrinkd<float> ( hoNDArray<float>*, hoNDArray<float>*, float, float, hoNDArray<float>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void abs_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs_square<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > sqrt<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void sqrt_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > square<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void square_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > reciprocal<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > reciprocal_sqrt<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > sgn<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void sgn_inplace<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void clear<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void clear<double>( hoNDArray<double>& );
+    template EXPORTCPUCOREMATH void fill<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void clamp<double>( hoNDArray<double>*, double, double );
+    template EXPORTCPUCOREMATH void clamp_min<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void clamp_max<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void normalize<double>( hoNDArray<double>*, double );
+    template EXPORTCPUCOREMATH void shrink1<double>( hoNDArray<double>*, double, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void pshrink<double>( hoNDArray<double>*, double,double, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void shrinkd<double> ( hoNDArray<double>*, hoNDArray<double>*, double, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH void pshrinkd<double> ( hoNDArray<double>*, hoNDArray<double>*, double, double, hoNDArray<double>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs_square< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > sqrt< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > square< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void square_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > reciprocal< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > reciprocal_sqrt< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void clear< std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void clear< std::complex<float> >( hoNDArray< std::complex<float> >& );
+    template EXPORTCPUCOREMATH void fill< std::complex<float> >( hoNDArray< std::complex<float> >*, std::complex<float> );
+    template EXPORTCPUCOREMATH void clamp< std::complex<float> >( hoNDArray< std::complex<float> >*, float, float );
+    template EXPORTCPUCOREMATH void clamp_min< std::complex<float> >( hoNDArray< std::complex<float> >*, float );
+    template EXPORTCPUCOREMATH void clamp_max<std::complex<float> >( hoNDArray< std::complex<float> >*, float );
+    template EXPORTCPUCOREMATH void normalize< std::complex<float> >( hoNDArray< std::complex<float> >*, float );
+    template EXPORTCPUCOREMATH void shrink1< std::complex<float> >( hoNDArray< std::complex<float> >*, float, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void pshrink< std::complex<float> >( hoNDArray< std::complex<float> >*, float,float, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void shrinkd< std::complex<float> > ( hoNDArray< std::complex<float> >*, hoNDArray<float>*, float, hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< std::complex<float> > ( hoNDArray< std::complex<float> >*, hoNDArray<float>*, float, float, hoNDArray< std::complex<float> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs_square< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > sqrt< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > square< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void square_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > reciprocal< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > reciprocal_sqrt< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void clear< std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void clear< std::complex<double> >( hoNDArray< std::complex<double> >& );
+    template EXPORTCPUCOREMATH void fill< std::complex<double> >( hoNDArray< std::complex<double> >*, std::complex<double> );
+    template EXPORTCPUCOREMATH void clamp< std::complex<double> >( hoNDArray< std::complex<double> >*, double, double );
+    template EXPORTCPUCOREMATH void clamp_min< std::complex<double> >( hoNDArray< std::complex<double> >*, double );
+    template EXPORTCPUCOREMATH void clamp_max<std::complex<double> >( hoNDArray< std::complex<double> >*, double );
+    template EXPORTCPUCOREMATH void normalize< std::complex<double> >( hoNDArray< std::complex<double> >*, double );
+    template EXPORTCPUCOREMATH void shrink1< std::complex<double> >( hoNDArray< std::complex<double> >*, double, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void pshrink< std::complex<double> >( hoNDArray< std::complex<double> >*, double,double, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void shrinkd< std::complex<double> > ( hoNDArray< std::complex<double> >*, hoNDArray<double>*, double, hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< std::complex<double> > ( hoNDArray< std::complex<double> >*, hoNDArray<double>*, double, double, hoNDArray< std::complex<double> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > abs_square< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > sqrt< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > square< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void square_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > reciprocal< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<float> > > reciprocal_sqrt< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void clear< complext<float> >( hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void clear< complext<float> >( hoNDArray< complext<float> >& );
+    template EXPORTCPUCOREMATH void fill< complext<float> >( hoNDArray< complext<float> >*, complext<float> );
+    template EXPORTCPUCOREMATH void clamp< complext<float> >( hoNDArray< complext<float> >*, float, float );
+    template EXPORTCPUCOREMATH void clamp_min< complext<float> >( hoNDArray< complext<float> >*, float );
+    template EXPORTCPUCOREMATH void clamp_max<complext<float> >( hoNDArray< complext<float> >*, float );
+    template EXPORTCPUCOREMATH void normalize< complext<float> >( hoNDArray< complext<float> >*, float );
+    template EXPORTCPUCOREMATH void shrink1< complext<float> >( hoNDArray< complext<float> >*, float, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void pshrink< complext<float> >( hoNDArray< complext<float> >*, float,float, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void shrinkd< complext<float> > ( hoNDArray< complext<float> >*, hoNDArray<float>*, float, hoNDArray< complext<float> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< complext<float> > ( hoNDArray< complext<float> >*, hoNDArray<float>*, float, float, hoNDArray< complext<float> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > abs_square< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > sqrt< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void sqrt_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > square< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void square_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > reciprocal< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< complext<double> > > reciprocal_sqrt< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void reciprocal_sqrt_inplace< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void clear< complext<double> >( hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void clear< complext<double> >( hoNDArray< complext<double> >& );
+    template EXPORTCPUCOREMATH void fill< complext<double> >( hoNDArray< complext<double> >*, complext<double> );
+    template EXPORTCPUCOREMATH void clamp< complext<double> >( hoNDArray< complext<double> >*, double, double );
+    template EXPORTCPUCOREMATH void clamp_min< complext<double> >( hoNDArray< complext<double> >*, double );
+    template EXPORTCPUCOREMATH void clamp_max<complext<double> >( hoNDArray< complext<double> >*, double );
+    template EXPORTCPUCOREMATH void normalize< complext<double> >( hoNDArray< complext<double> >*, double );
+    template EXPORTCPUCOREMATH void shrink1< complext<double> >( hoNDArray< complext<double> >*, double, hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void pshrink< complext<double> >( hoNDArray< complext<double> >*, double,double, hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void shrinkd< complext<double> > ( hoNDArray< complext<double> >*, hoNDArray<double>*, double, hoNDArray< complext<double> >* );
+    template EXPORTCPUCOREMATH void pshrinkd< complext<double> > ( hoNDArray< complext<double> >*, hoNDArray<double>*, double, double, hoNDArray< complext<double> >* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > real_to_complex< std::complex<float> >( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<float> > > real_imag_to_complex< std::complex<float> >( hoNDArray<float>*, hoNDArray<float>* );
+
+    template EXPORTCPUCOREMATH bool real_imag_to_complex(const hoNDArray<float>& real, const hoNDArray<float>& imag, hoNDArray< std::complex<float> >& cplx);
+    template EXPORTCPUCOREMATH bool real_imag_to_complex(const hoNDArray<float>& real, const hoNDArray<float>& imag, hoNDArray< float_complext >& cplx);
+
+    template EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& real, hoNDArray<float>& imag);
+    //template EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray< float_complext >& cplx, hoNDArray<float>& real, hoNDArray<float>& imag);
+
+    template EXPORTCPUCOREMATH bool complex_to_real(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& real);
+    template EXPORTCPUCOREMATH bool complex_to_imag(const hoNDArray< std::complex<float> >& cplx, hoNDArray<float>& imag);
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float_complext> > real_to_complex<float_complext>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float_complext> > real_imag_to_complex<float_complext>( hoNDArray<float>*, hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > real<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > real<std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > real<float_complext>( hoNDArray<float_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > imag<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > imag<std::complex<float> >( hoNDArray< std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > imag<float_complext>( hoNDArray<float_complext>* );
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float> > conj<float>( hoNDArray<float>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<std::complex<float> > > conj<std::complex<float> >( hoNDArray<std::complex<float> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<float_complext> > conj<float_complext>( hoNDArray<float_complext>* );
+
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > real_to_complex< std::complex<double> >( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray< std::complex<double> > > real_imag_to_complex< std::complex<double> >( hoNDArray<double>*, hoNDArray<double>* );
+
+    template EXPORTCPUCOREMATH bool real_imag_to_complex(const hoNDArray<double>& real, const hoNDArray<double>& imag, hoNDArray< std::complex<double> >& cplx);
+    template EXPORTCPUCOREMATH bool real_imag_to_complex(const hoNDArray<double>& real, const hoNDArray<double>& imag, hoNDArray< double_complext >& cplx);
+
+    template EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& real, hoNDArray<double>& imag);
+    //template EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray< double >& cplx, hoNDArray<double>& real, hoNDArray<double>& imag);
+    //template EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray< float >& cplx, hoNDArray<float>& real, hoNDArray<float>& imag);
+    // template EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray< double_complext >& cplx, hoNDArray<double>& real, hoNDArray<double>& imag);
+
+    template EXPORTCPUCOREMATH bool complex_to_real(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& real);
+    template EXPORTCPUCOREMATH bool complex_to_imag(const hoNDArray< std::complex<double> >& cplx, hoNDArray<double>& imag);
+
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double_complext> > real_to_complex<double_complext>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double_complext> > real_imag_to_complex<double_complext>( hoNDArray<double>*, hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > real<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > real<std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > real<double_complext>( hoNDArray<double_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > imag<std::complex<double> >( hoNDArray< std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > imag<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > imag<double_complext>( hoNDArray<double_complext>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double> > conj<double>( hoNDArray<double>* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<std::complex<double> > > conj<std::complex<double> >( hoNDArray<std::complex<double> >* );
+    template EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<double_complext> > conj<double_complext>( hoNDArray<double_complext>* );
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_elemwise.h b/toolboxes/core/cpu/arma_math/hoNDArray_elemwise.h
new file mode 100644
index 0000000..5ed76b6
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_elemwise.h
@@ -0,0 +1,400 @@
+/** \file hoNDArray_elemwise.h
+    \brief Element-wise math operations on the hoNDArray class.
+    
+    hoNDArray_elemwise.h defines element-wise array operations on the hoNDArray class.
+    Many of the provided functions come in two flavours:
+    1) A function that returns a smart pointer to a new array holding the result of the element-wise operation, and
+    2) A function that performs in-place element-wise computation replacing the input array.
+    When both versions are available the in-place version is suffixed _inplace.
+    Some functions (clear, fill, clamp, clamp_min, clamp_max, normalize, shrink1, shrinkd) are only provided as in-place operations,
+    and they do not carry the _inplace suffix in order to keep user code compact.
+    A few functions return a different type than their input array
+    (abs on complex data, real, imag, real_to_complex) and consequently are not offered as in-place operations.
+    The functions provided in hoNDArray_elemwise are deliberately placed outside the NDArray derived classes
+    - to allow the NDArray classes to be lightweight header only data containers for both the cpu and gpu instances
+    - to allow for external library optimized implementations of the element-wise functions without adding such dependencies to the core data container
+    The present cpu implementation is based on Armadillo (whenever suitable functions are available).
+    The implementation is purposely split into a header and underlying implementation (.cpp) 
+    as this allows specific instantiation of the supported template types.     
+    The supported types are float, double, std::complex<float>, std::complex<double>, 
+    Gadgetron::complext<float> and Gadgetron::complext<double> -- with some deliberate omissions.
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "cpucore_math_export.h"
+
+#include "GadgetronCommon.h"
+#include <complex>
+
+#ifdef USE_MKL
+#include "mkl.h"
+#endif // USE_MKL
+
+#ifdef GT_Complex8
+#undef GT_Complex8
+#endif // GT_Complex8
+typedef std::complex<float> GT_Complex8;
+
+#ifdef GT_Complex16
+#undef GT_Complex16
+#endif // GT_Complex16
+typedef std::complex<double> GT_Complex16;
+
+namespace Gadgetron{
+
+  /**
+   * @brief Calculates the element-wise absolute values (l2 norm) of the array entries
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise absolute values of the input.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs( hoNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise absolute values (l2 norm) of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void abs_inplace( hoNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise squared absolute values of the array entries
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise squared absolute values of the input.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > abs_square( hoNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise sqrt of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise sqrt of the input.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > sqrt( hoNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise sqrt of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void sqrt_inplace( hoNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise square of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise square of the input.
+   *
+   * For real numbers this function is equivalent to abs_square().
+   * For complex arrays abs_square() and square() differ however.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > square( hoNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise square of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void square_inplace( hoNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise reciprocal of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise reciprocal of the input.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > reciprocal( hoNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void reciprocal_inplace( hoNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal sqrt of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise reciprocal sqrt of the input.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > reciprocal_sqrt( hoNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal sqrt of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void reciprocal_sqrt_inplace( hoNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the elementwise signum function on the array.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise sgn of the input.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > sgn( hoNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the elementwise signum function on the array (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void sgn_inplace( hoNDArray<T> *x );
+
+  /**
+   * @brief Extract the real component from a complex array.
+   * @param[in] x Input array.
+   * @return A new array of the real component of the complex array.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > real( hoNDArray<T> *x );
+
+  /**
+   * @brief Extract the imaginary component from a complex array.
+   * @param[in] x Input array.
+   * @return A new array of the imaginary component of the complex array.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<typename realType<T>::Type> > imag( hoNDArray<T> *x );
+
+  /**
+   * @brief Create a new array of the complex conjugate of the input array. For real arrays a copy of the input array is returned.
+   * @param[in] x Input array.
+   * @return A new array of the complex conjugate of the input array.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > conj( hoNDArray<T> *x );
+
+  /**
+   * @brief Construct a complex array from a real array.
+   * @param[in] x Input array.
+   * @return A new complex array containing the input array in the real component and zeros in the imaginary component.
+   */
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > 
+  real_to_complex( hoNDArray<typename realType<T>::Type> *x );
+
+  template<class T> EXPORTCPUCOREMATH boost::shared_ptr< hoNDArray<T> > 
+  real_imag_to_complex( hoNDArray<typename realType<T>::Type> *real, hoNDArray<typename realType<T>::Type>* imag);
+
+  template<class T> EXPORTCPUCOREMATH bool 
+  real_imag_to_complex(const hoNDArray<typename realType<T>::Type>& real, const hoNDArray<typename realType<T>::Type>& imag, hoNDArray<T>& cplx);
+
+  template<class T> EXPORTCPUCOREMATH bool 
+  complex_to_real_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real, hoNDArray<typename realType<T>::Type>& imag);
+
+  template<> EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray<float>& cplx, hoNDArray<float>& real, hoNDArray<float>& imag);
+  template<> EXPORTCPUCOREMATH bool complex_to_real_imag(const hoNDArray<double>& cplx, hoNDArray<double>& real, hoNDArray<double>& imag);
+
+  template<class T> EXPORTCPUCOREMATH bool complex_to_real(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& real);
+  template<class T> EXPORTCPUCOREMATH bool complex_to_imag(const hoNDArray<T>& cplx, hoNDArray<typename realType<T>::Type>& imag);
+
+  //
+  // From hereon the functions are all in-place although without the _inplace suffix...
+  //
+
+  /**
+   * @brief Clears the array to all zeros (in place). Faster than fill.
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTCPUCOREMATH void clear( hoNDArray<T> *x );
+  template<class T> EXPORTCPUCOREMATH void clear( hoNDArray<T>& x );
+
+  /**
+   * @brief Fills the array with a user provided constant value (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] val Fill value.
+   */
+  template<class T> EXPORTCPUCOREMATH void fill( hoNDArray<T> *x, T val );
+
+  /**
+   * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min minimum value.
+   * @param[in] max maximum value.
+   * @param[in] min_val value to which everything below the minimum will be set
+   * @param[in] max_val value to which everything above the maximum will be set
+   */
+  template<class T> EXPORTCPUCOREMATH void clamp( hoNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val );
+  
+  /**
+   * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min minimum value.
+   * @param[in] max maximum value.
+   */
+  template<class T> EXPORTCPUCOREMATH void clamp( hoNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max );
+
+  /**
+   * @brief Clamps all values in the array to a minimum value allowed (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min Minimum value.
+   */
+  template<class T> EXPORTCPUCOREMATH void clamp_min( hoNDArray<T> *x, typename realType<T>::Type min );
+
+  /**
+   * @brief Clamps all values in the array to a maximum value allowed (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] max Maximum value.
+   */
+  template<class T> EXPORTCPUCOREMATH void clamp_max( hoNDArray<T> *x, typename realType<T>::Type max );
+
+  /**
+   * @brief In place normalization (scaling) to a new maximum absolute array value val.
+   * @param[in,out] x Input and output array.
+   * @param[in] val New maximum absolute array value (according to the l2-norm)
+   */  
+  template<class T> EXPORTCPUCOREMATH void normalize( hoNDArray<T> *x, typename realType<T>::Type val = typename realType<T>::Type(1) );
+
+  /**
+   * @brief Shrinkage (soft thresholding), i.e. shrink(x,gamma) = x/abs(x)*max(abs(x)-gamma,0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   */  
+  template<class T> EXPORTCPUCOREMATH void shrink1( hoNDArray<T> *x, typename realType<T>::Type gamma, hoNDArray<T> *out = 0x0 );
+
+  /**
+   * @brief In place p-shrinkage (soft thresholding), i.e. pshrink(x,gamma,p) = x/abs(x)*max(abs(x)-gamma*abs(x)^(p-1),0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+   */
+	template<class T> EXPORTCPUCOREMATH void pshrink( hoNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out = 0x0 );
+
+  /**
+   * @brief Shrinkage (soft thresholding, multi-dimensional), i.e. shrink(x,gamma,s) = x/s*max(s-gamma,0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] s Input array, normalization.
+   * @param[in] gamma Shrinkage control parameter
+   */  
+  template<class T> EXPORTCPUCOREMATH void shrinkd ( hoNDArray<T> *x, hoNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, hoNDArray<T> *out = 0x0 );
+
+  /**
+   * @brief In place p-shrinkage (soft thresholding, multi-dimensional), i.e. pshrink(x,s,gamma,p) = x/s*max(s-gamma*s^(p-1),0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+   */
+  template<class T> EXPORTCPUCOREMATH void pshrinkd ( hoNDArray<T> *x, hoNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma,typename realType<T>::Type p, hoNDArray<T> *out = 0x0 );
+
+#ifdef USE_MKL
+
+    // besides the arma calls, some functions are implemented with the MKL vector utilities
+
+    EXPORTCPUCOREMATH bool add(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r); // r = x + y
+    EXPORTCPUCOREMATH bool subtract(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r); // r = x - y
+    EXPORTCPUCOREMATH bool multiply(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r); // r = x * y
+    EXPORTCPUCOREMATH bool divide(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& r); // r = x / y
+    EXPORTCPUCOREMATH bool absolute(const hoNDArray<float>& x, hoNDArray<float>& r); // r = abs(x)
+    EXPORTCPUCOREMATH bool argument(const hoNDArray<float>& x, hoNDArray<float>& r); // r = angle(x)
+    EXPORTCPUCOREMATH bool sqrt(const hoNDArray<float>& x, hoNDArray<float>& r); // r = sqrt(x)
+    EXPORTCPUCOREMATH bool minAbsolute(const hoNDArray<float>& x, float& r, size_t& ind); // minimal absolute value and index
+    EXPORTCPUCOREMATH bool maxAbsolute(const hoNDArray<float>& x, float& r, size_t& ind); // maximal absolute value and index
+    EXPORTCPUCOREMATH bool addEpsilon(hoNDArray<float>& x); // x = x + Epsilon if x==0, prepare for division
+    EXPORTCPUCOREMATH bool norm2(const hoNDArray<float>& x, float& r);
+    EXPORTCPUCOREMATH bool norm1(const hoNDArray<float>& x, float& r);
+    EXPORTCPUCOREMATH bool conv2(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& z); // x: input data, y: convolution kernel, z: output; each 2D slice is convolved
+    EXPORTCPUCOREMATH bool conv3(const hoNDArray<float>& x, const hoNDArray<float>& y, hoNDArray<float>& z); // x: input data, y: convolution kernel, z: output; each 3D volume is convolved
+    EXPORTCPUCOREMATH bool inv(const hoNDArray<float>& x, hoNDArray<float>& r); // r = 1/x
+
+    EXPORTCPUCOREMATH bool add(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool subtract(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool multiply(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool divide(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool absolute(const hoNDArray<double>& x, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool argument(const hoNDArray<double>& x, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool sqrt(const hoNDArray<double>& x, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool minAbsolute(const hoNDArray<double>& x, double& r, size_t& ind);
+    EXPORTCPUCOREMATH bool maxAbsolute(const hoNDArray<double>& x, double& r, size_t& ind);
+    EXPORTCPUCOREMATH bool addEpsilon(hoNDArray<double>& x);
+    EXPORTCPUCOREMATH bool norm2(const hoNDArray<double>& x, double& r);
+    EXPORTCPUCOREMATH bool norm1(const hoNDArray<double>& x, double& r);
+    EXPORTCPUCOREMATH bool conv2(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& z);
+    EXPORTCPUCOREMATH bool conv3(const hoNDArray<double>& x, const hoNDArray<double>& y, hoNDArray<double>& z);
+    EXPORTCPUCOREMATH bool inv(const hoNDArray<double>& x, hoNDArray<double>& r);
+
+    EXPORTCPUCOREMATH bool add(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    EXPORTCPUCOREMATH bool subtract(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    EXPORTCPUCOREMATH bool multiply(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    EXPORTCPUCOREMATH bool divide(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r);
+    EXPORTCPUCOREMATH bool absolute(const hoNDArray<GT_Complex8>& x, hoNDArray<float>& r);
+    EXPORTCPUCOREMATH bool absolute(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    EXPORTCPUCOREMATH bool sqrt(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+    EXPORTCPUCOREMATH bool minAbsolute(const hoNDArray<GT_Complex8>& x, GT_Complex8& r, size_t& ind);
+    EXPORTCPUCOREMATH bool maxAbsolute(const hoNDArray<GT_Complex8>& x, GT_Complex8& r, size_t& ind);
+    EXPORTCPUCOREMATH bool multiplyConj(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& r); // r = x * conj(y)
+    EXPORTCPUCOREMATH bool argument(const hoNDArray<GT_Complex8>& x, hoNDArray<float>& r); // r = angle(x)
+    EXPORTCPUCOREMATH bool conjugate(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r); // r = conj(x)
+    EXPORTCPUCOREMATH bool addEpsilon(hoNDArray<GT_Complex8>& x);
+    EXPORTCPUCOREMATH bool norm2(const hoNDArray<GT_Complex8>& x, float& r);
+    EXPORTCPUCOREMATH bool norm1(const hoNDArray<GT_Complex8>& x, float& r);
+    EXPORTCPUCOREMATH bool dotc(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, GT_Complex8& r); // x'*y, x and y are N*1 vector
+    EXPORTCPUCOREMATH bool conv2(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& z);
+    EXPORTCPUCOREMATH bool conv3(const hoNDArray<GT_Complex8>& x, const hoNDArray<GT_Complex8>& y, hoNDArray<GT_Complex8>& z);
+    EXPORTCPUCOREMATH bool inv(const hoNDArray<GT_Complex8>& x, hoNDArray<GT_Complex8>& r);
+
+    EXPORTCPUCOREMATH bool add(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool subtract(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool multiply(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool divide(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool absolute(const hoNDArray<GT_Complex16>& x, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool absolute(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool sqrt(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool minAbsolute(const hoNDArray<GT_Complex16>& x, GT_Complex16& r, size_t& ind);
+    EXPORTCPUCOREMATH bool maxAbsolute(const hoNDArray<GT_Complex16>& x, GT_Complex16& r, size_t& ind);
+    EXPORTCPUCOREMATH bool multiplyConj(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool argument(const hoNDArray<GT_Complex16>& x, hoNDArray<double>& r);
+    EXPORTCPUCOREMATH bool conjugate(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+    EXPORTCPUCOREMATH bool addEpsilon(hoNDArray<GT_Complex16>& x);
+    EXPORTCPUCOREMATH bool norm2(const hoNDArray<GT_Complex16>& x, double& r);
+    EXPORTCPUCOREMATH bool norm1(const hoNDArray<GT_Complex16>& x, double& r);
+    EXPORTCPUCOREMATH bool dotc(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, GT_Complex16& r);
+    EXPORTCPUCOREMATH bool conv2(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& z);
+    EXPORTCPUCOREMATH bool conv3(const hoNDArray<GT_Complex16>& x, const hoNDArray<GT_Complex16>& y, hoNDArray<GT_Complex16>& z);
+    EXPORTCPUCOREMATH bool inv(const hoNDArray<GT_Complex16>& x, hoNDArray<GT_Complex16>& r);
+
+    template<typename T> EXPORTCPUCOREMATH bool sumOverLastDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 4D array, sum over the 4th dimension
+    template<typename T> EXPORTCPUCOREMATH bool sumOverSecondLastDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 4D array, sum over the 3rd dimension
+
+    template<typename T> EXPORTCPUCOREMATH bool multiplyOverLastDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r); // e.g. x is 3D and y is 4D array, r(:,:,:,n) = y(:,:,:,n) .* x
+    template<typename T> EXPORTCPUCOREMATH bool divideOverLastDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r); // e.g. x is 3D and y is 4D array, r(:,:,:,n) = y(:,:,:,n) ./ x
+
+    template<typename T> EXPORTCPUCOREMATH bool sumOver1stDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 2D array, sum over the 1st dimension and get an array of [1 E1]
+    template<typename T> EXPORTCPUCOREMATH bool sumOver2ndDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 3D array, sum over the 2nd dimension and get an array of [RO 1 CHA]
+    template<typename T> EXPORTCPUCOREMATH bool sumOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 4D array, sum over the 3rd dimension and get an array of [RO E1 1 N]
+    template<typename T> EXPORTCPUCOREMATH bool sumOver4thDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 5D array [RO E1 CHA N S], sum over the 4th dimension and get an array of [RO E1 CHA 1 S]
+    template<typename T> EXPORTCPUCOREMATH bool sumOver5thDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // e.g. for a 6D array, sum over the 5th dimension and get an array [RO E1 CHA N 1 P]
+
+    template<typename T> EXPORTCPUCOREMATH bool multiplyOver3rdDimension(const hoNDArray<T>& x3D, const hoNDArray<T>& y4D, hoNDArray<T>& r); // e.g. x is 3D and y is 4D array, r(:,:,n,:) = y(:,:,n,:) .* x
+    template<typename T> EXPORTCPUCOREMATH bool multiplyOver4thDimension(const hoNDArray<T>& x4D, const hoNDArray<T>& y5D, hoNDArray<T>& r); // e.g. x is 4D and y is 5D array, r(:,:,:,n,:) = y(:,:,:,n,:) .* x
+    template<typename T> EXPORTCPUCOREMATH bool multiplyOver5thDimension(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r); // e.g. x is 5D and y is 6D array, r(:,:,:,:, n,:) = y(:,:,:,:,n,:) .* x
+
+    template<typename T> EXPORTCPUCOREMATH bool multiplyOver4thDimensionExcept(const hoNDArray<T>& x4D, const hoNDArray<T>& y5D, size_t n, hoNDArray<T>& r, bool copyY2R=true); // e.g. x is 4D and y is 5D array, r(:,:,:,t,:) = y(:,:,:,t,:) .* x, except for r(:,:,:,n,:) = y(:,:,:,n,:)
+    template<typename T> EXPORTCPUCOREMATH bool multiplyOver5thDimensionExcept(const hoNDArray<T>& x, const hoNDArray<T>& y, size_t n, hoNDArray<T>& r, bool copyY2R=true); // e.g. x is 5D and y is 6D array, r(:,:,:,:,t,:) = y(:,:,:,:,t,:) .* x, except for r(:,:,:,:,n,:) = y(:,:,:,:,n,:)
+
+    template<typename T> EXPORTCPUCOREMATH bool multipleAdd(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r); // r = x + y for every part of y
+    template<typename T> EXPORTCPUCOREMATH bool multipleMultiply(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r); // r = x * y for every part of y
+    template<typename T> EXPORTCPUCOREMATH bool multipleDivide(const hoNDArray<T>& x, const hoNDArray<T>& y, hoNDArray<T>& r); // r = x / y for every part of y
+
+    template<typename T> EXPORTCPUCOREMATH bool cropUpTo10DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size);
+    template<typename T> EXPORTCPUCOREMATH bool setSubArrayUpTo10DArray(const hoNDArray<T>& x, hoNDArray<T>& r, const std::vector<size_t>& startND, std::vector<size_t>& size);
+
+    template<typename T> EXPORTCPUCOREMATH bool cropOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end);
+    template<typename T> EXPORTCPUCOREMATH bool setSubArrayOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r, size_t start, size_t end);
+
+    template<typename T> EXPORTCPUCOREMATH bool stdOver3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& std, bool NMinusOne); // compute the standard deviation along the 3rd dimension, if NMinusOne == true, divided by N-1; otherwise, divided by N
+
+    // template<typename T> EXPORTCPUCOREMATH bool permuteLastTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [... E1 E2], r: [... E2 E1]
+
+    template<typename T> EXPORTCPUCOREMATH bool permuteE2To3rdDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [RO E1 CHA SLC E2 ...], r: [RO E1 E2 CHA SLC ...]
+    template<typename T> EXPORTCPUCOREMATH bool permuteE2To5thDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [RO E1 E2 CHA SLC ...], r: [RO E1 CHA SLC E2 ...]
+
+    template<typename T> EXPORTCPUCOREMATH bool permuteROTo3rdDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [RO E1 E2 ...], r: [E1 E2 RO ...]
+    template<typename T> EXPORTCPUCOREMATH bool permuteROTo4thDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [RO E1 E2 CHA ...], r: [E1 E2 CHA RO ...]
+    template<typename T> EXPORTCPUCOREMATH bool permuteROTo1stDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [E1 E2 CHA RO ...], r: [RO E1 E2 CHA ...]
+
+    template<typename T> EXPORTCPUCOREMATH bool permute3rdDimensionTo1stDimension(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [RO E1 E2 CHA ...], r: [E2 RO E1 CHA ...]
+
+    template<typename T> EXPORTCPUCOREMATH bool permuteROTo5thDimensionFor3DRecon(const hoNDArray<T>& x, hoNDArray<T>& r); // x : [RO E1 E2 srcCHA dstCHA ...], r: [E1 E2 srcCHA dstCHA RO ...]
+
+    /// x : [RO E1 srcCHA], ker [RO E1 srcCHA dstCHA], buf is a computation buffer that needs to be pre-allocated [RO E1 srcCHA], y [RO E1 dstCHA]
+    /// for the sake of speed, no check is made in this function
+    template<typename T> EXPORTCPUCOREMATH bool imageDomainUnwrapping2D(const hoNDArray<T>& x, const hoNDArray<T>& ker, hoNDArray<T>& buf, hoNDArray<T>& y);
+
+    /// x : [RO E1 srcCHA N], ker [RO E1 srcCHA dstCHA 1 or N], buf is a computation buffer that needs to be pre-allocated [RO E1 srcCHA], y [RO E1 dstCHA N]
+    /// for the sake of speed, no check is made in this function
+    template<typename T> EXPORTCPUCOREMATH bool imageDomainUnwrapping2DT(const hoNDArray<T>& x, const hoNDArray<T>& ker, hoNDArray<T>& buf, hoNDArray<T>& y);
+
+#endif // USE_MKL
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_math.h b/toolboxes/core/cpu/arma_math/hoNDArray_math.h
new file mode 100644
index 0000000..a8b4224
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_math.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "hoNDArray_blas.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_operators.h"
+#include "hoNDArray_reductions.h"
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_operators.cpp b/toolboxes/core/cpu/arma_math/hoNDArray_operators.cpp
new file mode 100644
index 0000000..62e5a10
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_operators.cpp
@@ -0,0 +1,457 @@
+#include "hoNDArray_operators.h"
+#include "hoArmadillo.h"
+
+namespace Gadgetron{
+
+    // Private utility to verify array dimensions.
+    // It "replaces" NDArray::dimensions_equal() to support batch mode: x is accepted when its element count is a
+    // whole multiple of y's. An empty y is rejected up front, since the modulo below would otherwise divide by zero.
+    // There is an identical function for all array instances (currently hoNDArray, cuNDArray, hoCuNDArray)
+    // !!! Remember to fix any bugs in all versions !!!
+    template<class T,class S> static bool compatible_dimensions( const hoNDArray<T> &x, const hoNDArray<S> &y )
+    {
+        return y.get_number_of_elements()!=0 && (x.get_number_of_elements()%y.get_number_of_elements())==0;
+    }
+
+    // Batched in-place addition: y is added element-wise to every consecutive y-sized chunk of x.
+    // Throws std::runtime_error when compatible_dimensions() rejects the operands; otherwise returns x.
+    template<class T> hoNDArray<T>& operator+= (hoNDArray<T> &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<T,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator+=: Incompatible array dimensions");
+        arma::Col<typename stdType<T>::Type> rhs = as_arma_col(&y);
+        const size_t chunk_size = y.get_number_of_elements();
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray<T> window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col<typename stdType<T>::Type> lhs = as_arma_col(&window);
+            lhs += rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place addition of a real array to a std::complex array: y is widened once to a complex column
+    // (zero imaginary part) and added to each y-sized chunk of x. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< std::complex<T> >& operator+= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<std::complex<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator+=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< std::complex<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs += rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place addition of a real array to a complext array: y is widened once to a std::complex column
+    // (zero imaginary part) and added to each y-sized chunk of x. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< complext<T> >& operator+= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<complext<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator+=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< complext<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs += rhs;
+        }
+        return x;
+    }
+
+    // Adds the scalar y (read through a pointer to stdType<T>::Type) to every element of x in place; returns x.
+    template<class T> hoNDArray<T>& operator+= (hoNDArray<T> &x, const T &y)
+    {
+        arma::Col<typename stdType<T>::Type> elems = as_arma_col(&x);
+        elems += *reinterpret_cast<const typename stdType<T>::Type*>(&y);
+        return x;
+    }
+
+    // Adds the real scalar y (as a complex value with zero imaginary part) to every element of x in place; returns x.
+    template<class T> hoNDArray< std::complex<T> >& operator+= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems += std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Adds the real scalar y (as a complex value with zero imaginary part) to every element of the complext array x; returns x.
+    template<class T> hoNDArray< complext<T> >& operator+= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems += std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Batched in-place subtraction: y is subtracted element-wise from every consecutive y-sized chunk of x.
+    // Throws std::runtime_error when compatible_dimensions() rejects the operands; otherwise returns x.
+    template<class T> hoNDArray<T>& operator-= (hoNDArray<T> &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<T,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator-=: Incompatible array dimensions");
+        arma::Col<typename stdType<T>::Type> rhs = as_arma_col(&y);
+        const size_t chunk_size = y.get_number_of_elements();
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray<T> window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col<typename stdType<T>::Type> lhs = as_arma_col(&window);
+            lhs -= rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place subtraction of a real array from a std::complex array: y is widened once to a complex column
+    // (zero imaginary part) and subtracted from each y-sized chunk of x. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< std::complex<T> >& operator-= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<std::complex<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator-=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< std::complex<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs -= rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place subtraction of a real array from a complext array. y is widened to a std::complex column
+    // (zero imaginary part) once, before the loop, and reused for every chunk. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< complext<T> >& operator-= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<complext<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator-=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< complext<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs -= rhs;
+        }
+        return x;
+    }
+
+    // Subtracts the scalar y (read through a pointer to stdType<T>::Type) from every element of x in place; returns x.
+    template<class T> hoNDArray<T>& operator-= (hoNDArray<T> &x, const T &y)
+    {
+        arma::Col<typename stdType<T>::Type> elems = as_arma_col(&x);
+        elems -= *reinterpret_cast<const typename stdType<T>::Type*>(&y);
+        return x;
+    }
+
+    // Subtracts the real scalar y (as a complex value with zero imaginary part) from every element of x in place; returns x.
+    template<class T> hoNDArray< std::complex<T> >& operator-= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems -= std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Subtracts the real scalar y (as a complex value with zero imaginary part) from every element of the complext array x; returns x.
+    template<class T> hoNDArray< complext<T> >& operator-= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems -= std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Batched in-place multiplication: every consecutive y-sized chunk of x is multiplied element-wise by y
+    // (Armadillo's %= operator). Throws std::runtime_error when compatible_dimensions() rejects the operands.
+    template<class T> hoNDArray<T>& operator*= (hoNDArray<T> &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<T,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator*=: Incompatible array dimensions");
+        arma::Col<typename stdType<T>::Type> rhs = as_arma_col(&y);
+        const size_t chunk_size = y.get_number_of_elements();
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray<T> window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col<typename stdType<T>::Type> lhs = as_arma_col(&window);
+            lhs %= rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place multiplication of a std::complex array by a real array: y is widened once to a complex column
+    // (zero imaginary part) and multiplied element-wise (%=) into each y-sized chunk of x. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< std::complex<T> >& operator*= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<std::complex<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator*=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< std::complex<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs %= rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place multiplication of a complext array by a real array. y is widened to a std::complex column
+    // (zero imaginary part) once, before the loop, and multiplied element-wise (%=) into every chunk. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< complext<T> >& operator*= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<complext<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator*=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< complext<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs %= rhs;
+        }
+        return x;
+    }
+
+    // Multiplies every element of x in place by the scalar y (read through a pointer to stdType<T>::Type); returns x.
+    template<class T> hoNDArray<T>& operator*= (hoNDArray<T> &x, const T &y)
+    {
+        arma::Col<typename stdType<T>::Type> elems = as_arma_col(&x);
+        elems *= *reinterpret_cast<const typename stdType<T>::Type*>(&y);
+        return x;
+    }
+
+    // Multiplies every element of x in place by the real scalar y (as a complex value with zero imaginary part); returns x.
+    template<class T> hoNDArray< std::complex<T> >& operator*= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems *= std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Multiplies every element of the complext array x in place by the real scalar y (as a complex value with zero imaginary part); returns x.
+    template<class T> hoNDArray< complext<T> >& operator*= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems *= std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Batched in-place division: every consecutive y-sized chunk of x is divided element-wise by y.
+    // Throws std::runtime_error when compatible_dimensions() rejects the operands; otherwise returns x.
+    template<class T> hoNDArray<T>& operator/= (hoNDArray<T> &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<T,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator/=: Incompatible array dimensions");
+        arma::Col<typename stdType<T>::Type> rhs = as_arma_col(&y);
+        const size_t chunk_size = y.get_number_of_elements();
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray<T> window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col<typename stdType<T>::Type> lhs = as_arma_col(&window);
+            lhs /= rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place division of a std::complex array by a real array: y is widened once to a complex column
+    // (zero imaginary part) and each y-sized chunk of x is divided element-wise by it. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< std::complex<T> >& operator/= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<std::complex<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator/=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< std::complex<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs /= rhs;
+        }
+        return x;
+    }
+
+    // Batched in-place division of a complext array by a real array. y is widened to a std::complex column
+    // (zero imaginary part) once, before the loop, and every chunk of x is divided element-wise by it. Throws std::runtime_error on incompatible operands.
+    template<class T> hoNDArray< complext<T> >& operator/= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y)
+    {
+        if( !compatible_dimensions<complext<T>,T>(x,y) )
+            throw std::runtime_error("hoNDArray::operator/=: Incompatible array dimensions");
+        const size_t chunk_size = y.get_number_of_elements();
+        arma::Col< std::complex<T> > rhs( as_arma_col(&y), arma::Col<T>(chunk_size).zeros() );
+        const size_t num_chunks = x.get_number_of_elements()/chunk_size;
+        for( size_t chunk=0; chunk<num_chunks; ++chunk ){
+            hoNDArray< complext<T> > window; // y-shaped array created on x's data at the chunk offset
+            window.create( y.get_dimensions(), x.get_data_ptr()+chunk*chunk_size );
+            arma::Col< std::complex<T> > lhs = as_arma_col(&window);
+            lhs /= rhs;
+        }
+        return x;
+    }
+
+    // Divides every element of x in place by the scalar y (read through a pointer to stdType<T>::Type); returns x.
+    template<class T> hoNDArray<T>& operator/= (hoNDArray<T> &x, const T &y)
+    {
+        arma::Col<typename stdType<T>::Type> elems = as_arma_col(&x);
+        elems /= *reinterpret_cast<const typename stdType<T>::Type*>(&y);
+        return x;
+    }
+
+    // Divides every element of x in place by the real scalar y (as a complex value with zero imaginary part); returns x.
+    template<class T> hoNDArray< std::complex<T> >& operator/= (hoNDArray< std::complex<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems /= std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    // Divides every element of the complext array x in place by the real scalar y (as a complex value with zero imaginary part); returns x.
+    template<class T> hoNDArray< complext<T> >& operator/= (hoNDArray< complext<T> > &x, const T &y)
+    {
+        arma::Col< std::complex<T> > elems = as_arma_col(&x);
+        elems /= std::complex<T>( y, T(0) );
+        return x;
+    }
+
+    //
+    // Explicit instantiations of the operator templates defined above, grouped by element type and operand kind.
+    //
+    // float arrays: same-type array and scalar operands
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator+=<float>(hoNDArray<float>&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator+=<float>(hoNDArray<float>&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator-=<float>(hoNDArray<float>&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator-=<float>(hoNDArray<float>&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator*=<float>(hoNDArray<float>&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator*=<float>(hoNDArray<float>&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator/=<float>(hoNDArray<float>&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray<float>& operator/=<float>(hoNDArray<float>&, const float&);
+    // double arrays: same-type array and scalar operands
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator+=<double>(hoNDArray<double>&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator+=<double>(hoNDArray<double>&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator-=<double>(hoNDArray<double>&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator-=<double>(hoNDArray<double>&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator*=<double>(hoNDArray<double>&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator*=<double>(hoNDArray<double>&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator/=<double>(hoNDArray<double>&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray<double>& operator/=<double>(hoNDArray<double>&, const double&);
+    // std::complex<float> arrays: same-type array and scalar operands
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator+=< std::complex<float> > 
+        (hoNDArray< std::complex<float> >&, const hoNDArray< std::complex<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator+=< std::complex<float> > 
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator-=< std::complex<float> > 
+        (hoNDArray< std::complex<float> >&, const hoNDArray< std::complex<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator-=< std::complex<float> > 
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator*=< std::complex<float> >
+        (hoNDArray< std::complex<float> >&, const hoNDArray< std::complex<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator*=< std::complex<float> >
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator/=< std::complex<float> > 
+        (hoNDArray< std::complex<float> >&, const hoNDArray< std::complex<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator/=< std::complex<float> > 
+        (hoNDArray< std::complex<float> >&, const std::complex<float>&);
+    // complext<float> arrays: same-type array and scalar operands
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator+=< complext<float> > 
+        (hoNDArray< complext<float> >&, const hoNDArray< complext<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator+=< complext<float> > 
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator-=< complext<float> > 
+        (hoNDArray< complext<float> >&, const hoNDArray< complext<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator-=< complext<float> > 
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator*=< complext<float> >
+        (hoNDArray< complext<float> >&, const hoNDArray< complext<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator*=< complext<float> >
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator/=< complext<float> > 
+        (hoNDArray< complext<float> >&, const hoNDArray< complext<float> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator/=< complext<float> > 
+        (hoNDArray< complext<float> >&, const complext<float>&);
+    // std::complex<float> arrays combined with a float array
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator+=<float>(hoNDArray< std::complex<float> >&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator-=<float>(hoNDArray< std::complex<float> >&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator*=<float>(hoNDArray< std::complex<float> >&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator/=<float>(hoNDArray< std::complex<float> >&, const hoNDArray<float>&);
+    // complext<float> arrays combined with a float array
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator+=<float>(hoNDArray< complext<float> >&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator-=<float>(hoNDArray< complext<float> >&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator*=<float>(hoNDArray< complext<float> >&, const hoNDArray<float>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator/=<float>(hoNDArray< complext<float> >&, const hoNDArray<float>&);
+    // std::complex<float> arrays combined with a float scalar
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator+=<float>(hoNDArray< std::complex<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator-=<float>(hoNDArray< std::complex<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator*=<float>(hoNDArray< std::complex<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<float> >& operator/=<float>(hoNDArray< std::complex<float> >&, const float&);
+    // complext<float> arrays combined with a float scalar
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator+=<float>(hoNDArray< complext<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator-=<float>(hoNDArray< complext<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator*=<float>(hoNDArray< complext<float> >&, const float&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<float> >& operator/=<float>(hoNDArray< complext<float> >&, const float&);
+    // std::complex<double> arrays: same-type array and scalar operands
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator+=< std::complex<double> > 
+        (hoNDArray< std::complex<double> >&, const hoNDArray< std::complex<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator+=< std::complex<double> > 
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator-=< std::complex<double> > 
+        (hoNDArray< std::complex<double> >&, const hoNDArray< std::complex<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator-=< std::complex<double> > 
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator*=< std::complex<double> >
+        (hoNDArray< std::complex<double> >&, const hoNDArray< std::complex<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator*=< std::complex<double> >
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator/=< std::complex<double> > 
+        (hoNDArray< std::complex<double> >&, const hoNDArray< std::complex<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator/=< std::complex<double> > 
+        (hoNDArray< std::complex<double> >&, const std::complex<double>&);
+    // complext<double> arrays: same-type array and scalar operands
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator+=< complext<double> > 
+        (hoNDArray< complext<double> >&, const hoNDArray< complext<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator+=< complext<double> > 
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator-=< complext<double> > 
+        (hoNDArray< complext<double> >&, const hoNDArray< complext<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator-=< complext<double> > 
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator*=< complext<double> >
+        (hoNDArray< complext<double> >&, const hoNDArray< complext<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator*=< complext<double> >
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator/=< complext<double> > 
+        (hoNDArray< complext<double> >&, const hoNDArray< complext<double> >&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator/=< complext<double> > 
+        (hoNDArray< complext<double> >&, const complext<double>&);
+    // std::complex<double> arrays combined with a double array
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator+=<double>(hoNDArray< std::complex<double> >&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator-=<double>(hoNDArray< std::complex<double> >&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator*=<double>(hoNDArray< std::complex<double> >&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator/=<double>(hoNDArray< std::complex<double> >&, const hoNDArray<double>&);
+    // complext<double> arrays combined with a double array
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator+=<double>(hoNDArray< complext<double> >&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator-=<double>(hoNDArray< complext<double> >&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator*=<double>(hoNDArray< complext<double> >&, const hoNDArray<double>&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator/=<double>(hoNDArray< complext<double> >&, const hoNDArray<double>&);
+    // std::complex<double> arrays combined with a double scalar
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator+=<double>(hoNDArray< std::complex<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator-=<double>(hoNDArray< std::complex<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator*=<double>(hoNDArray< std::complex<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< std::complex<double> >& operator/=<double>(hoNDArray< std::complex<double> >&, const double&);
+    // complext<double> arrays combined with a double scalar
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator+=<double>(hoNDArray< complext<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator-=<double>(hoNDArray< complext<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator*=<double>(hoNDArray< complext<double> >&, const double&);
+    template EXPORTCPUCOREMATH hoNDArray< complext<double> >& operator/=<double>(hoNDArray< complext<double> >&, const double&);
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_operators.h b/toolboxes/core/cpu/arma_math/hoNDArray_operators.h
new file mode 100644
index 0000000..f555128
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_operators.h
@@ -0,0 +1,239 @@
+/** \file hoNDArray_operators.h
+    \brief Common element-wise arithmetic operators on the hoNDArray class.
+    
+    hoNDArray_operators.h defines element-wise arithmetic array operations on the hoNDArray class.
+    We define the common operators +=, -=, *= and /= for both array-array and array-constant operations.
+    We have deliberately omitted to define operator+, operator- etc. since this would require returning an hoNDArray,
+    in turn invoking an explicit memcpy by the assignment operator.
+    Batch mode functionality is provided.
+    The implementation is based on Armadillo.
+    This code is purposely split into a header and underlying implementation (.cpp) 
+    as this allows specific instantiation of the supported template types.     
+    The supported types are float, double, std::complex<float>, std::complex<double>, 
+    Gadgetron::complext<float> and Gadgetron::complext<double>. 
+    Scalars can be applied to complex numbers of corresponding precision.
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "cpucore_math_export.h"
+
+namespace Gadgetron{
+
+  /**
+   * @brief Implementation of element-wise operator+= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator+= (hoNDArray<T> &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator+= (hoNDArray<T> &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator+= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator+= (hoNDArray< std::complex<T> >&x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator+= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator+= (hoNDArray< complext<T> >&x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator-= (hoNDArray<T> &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator-= (hoNDArray<T> &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator-= (hoNDArray< std::complex<T > > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator-= (hoNDArray< std::complex<T> >&x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator-= (hoNDArray< complext<T > > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator-= (hoNDArray< complext<T> >&x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator*= (hoNDArray<T> &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator*= (hoNDArray<T> &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator*= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator*= (hoNDArray< std::complex<T> > &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator*= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator*= (hoNDArray< complext<T> > &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator/= (hoNDArray<T> &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray<T>& operator/= (hoNDArray<T> &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator/= (hoNDArray< std::complex<T> > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< std::complex<T> >& operator/= (hoNDArray< std::complex<T> > &x, const T &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on two hoNDArrays.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+ 
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator/= (hoNDArray< complext<T> > &x, const hoNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on a hoNDArray with a scalar value.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTCPUCOREMATH hoNDArray< complext<T> >& operator/= (hoNDArray< complext<T> > &x, const T &y);
+}
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_reductions.cpp b/toolboxes/core/cpu/arma_math/hoNDArray_reductions.cpp
new file mode 100644
index 0000000..329710b
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_reductions.cpp
@@ -0,0 +1,41 @@
+#include "hoNDArray_reductions.h"
+#include "hoArmadillo.h"
+
+namespace Gadgetron{
+
+template<class REAL> REAL max(hoNDArray<REAL>* data){
+	return as_arma_col(data).max();
+}
+template<class REAL> REAL min(hoNDArray<REAL>* data){
+	return as_arma_col(data).min();
+}
+
+
+template<class T> T mean(hoNDArray<T>* data){
+	return (typename stdType<T>::Type) arma::mean(as_arma_col(data));
+}
+
+
+template<class T> T sum(hoNDArray<T>* data){
+	return (typename stdType<T>::Type) arma::sum(as_arma_col(data));
+}
+
+
+template EXPORTCPUCOREMATH float max(hoNDArray<float>*);
+template EXPORTCPUCOREMATH float min(hoNDArray<float>*);
+template EXPORTCPUCOREMATH float mean(hoNDArray<float>*);
+template EXPORTCPUCOREMATH float sum(hoNDArray<float>*);
+
+template EXPORTCPUCOREMATH double max(hoNDArray<double>*);
+template EXPORTCPUCOREMATH double min(hoNDArray<double>*);
+template EXPORTCPUCOREMATH double mean(hoNDArray<double>*);
+template EXPORTCPUCOREMATH double sum(hoNDArray<double>*);
+
+
+template EXPORTCPUCOREMATH complext<double> mean(hoNDArray<complext<double> >*);
+template EXPORTCPUCOREMATH complext<double> sum(hoNDArray<complext<double> >*);
+
+template EXPORTCPUCOREMATH complext<float> mean(hoNDArray<complext<float> >*);
+template EXPORTCPUCOREMATH complext<float> sum(hoNDArray<complext<float> >*);
+}
+
diff --git a/toolboxes/core/cpu/arma_math/hoNDArray_reductions.h b/toolboxes/core/cpu/arma_math/hoNDArray_reductions.h
new file mode 100644
index 0000000..6ec4b45
--- /dev/null
+++ b/toolboxes/core/cpu/arma_math/hoNDArray_reductions.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "cpucore_math_export.h"
+
+namespace Gadgetron{
+
+  template<class REAL> EXPORTCPUCOREMATH REAL max(hoNDArray<REAL>* data);
+  template<class REAL> EXPORTCPUCOREMATH REAL min(hoNDArray<REAL>* data);
+  template<class T> EXPORTCPUCOREMATH T mean(hoNDArray<T>* data);
+  template<class T> EXPORTCPUCOREMATH T sum(hoNDArray<T>* data);
+}
diff --git a/toolboxes/core/cpu/cpucore_export.h b/toolboxes/core/cpu/cpucore_export.h
new file mode 100644
index 0000000..21c3b4e
--- /dev/null
+++ b/toolboxes/core/cpu/cpucore_export.h
@@ -0,0 +1,22 @@
+/** \file cpucore_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef CPUCORE_EXPORT_H_
+#define CPUCORE_EXPORT_H_
+
+#if defined (WIN32)
+    #ifdef BUILD_TOOLBOX_STATIC
+        #define EXPORTCPUCORE
+    #else
+        #if defined (__BUILD_GADGETRON_CPUCORE__) || defined (cpucore_EXPORTS)
+            #define EXPORTCPUCORE __declspec(dllexport)
+        #else
+            #define EXPORTCPUCORE __declspec(dllimport)
+        #endif
+    #endif
+#else
+#define EXPORTCPUCORE
+#endif
+
+#endif /* CPUCORE_EXPORT_H_ */
diff --git a/toolboxes/core/cpu/ho2DArray.h b/toolboxes/core/cpu/ho2DArray.h
new file mode 100644
index 0000000..d913c60
--- /dev/null
+++ b/toolboxes/core/cpu/ho2DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho2DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho2DArray();
+    ho2DArray(size_t sx, size_t sy);
+    ho2DArray(std::vector<size_t> *dimensions);
+    ho2DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho2DArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+    ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho2DArray();
+
+    ho2DArray(const ho2DArray<T>& a);
+    ho2DArray<T>& operator=(const ho2DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy);
+    virtual bool createArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x , size_t y);
+    const T& operator()(size_t x , size_t y) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T** accesser_;
+};
+
+}
+
+#include <ho2DArray.hxx>
diff --git a/toolboxes/core/cpu/ho2DArray.hxx b/toolboxes/core/cpu/ho2DArray.hxx
new file mode 100644
index 0000000..abcb853
--- /dev/null
+++ b/toolboxes/core/cpu/ho2DArray.hxx
@@ -0,0 +1,261 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho2DArray<T>::ho2DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(size_t sx, size_t sy)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(2);
+    dim[0] = sx;
+    dim[1] = sy;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==2);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>::~ho2DArray()
+{
+    release_accesser(); // never throw from a destructor; release_accesser() logs its own failure and returns false
+}
+
+template <typename T> 
+ho2DArray<T>::ho2DArray(const ho2DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho2DArray<T>& ho2DArray<T>::operator=(const ho2DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho2DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho2DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho2DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho2DArray<T>::createArray(size_t sx, size_t sy)
+{
+    try
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("ho2DArray<T>::createArray(size_t sx, size_t sy) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho2DArray<T>::createArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(2);
+        dim[0] = sx;
+        dim[1] = sy;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("ho2DArray<T>::createArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho2DArray<T>::operator()(size_t x , size_t y)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1]);
+    return accesser_[y][x];
+}
+
+template <typename T> 
+inline const T& ho2DArray<T>::operator()(size_t x , size_t y) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1]);
+    return accesser_[y][x];
+}
+
+template <typename T> 
+bool ho2DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+
+            accesser_ = new T*[sy];
+            if( accesser_ == NULL) return false;
+
+            accesser_[0] = data_;
+            for (size_t y=1; y<sy; y++)
+            {
+                accesser_[y] = accesser_[y-1] + sx;
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho2DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho2DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho2DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho2DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y;
+    os << "-------------------------------------------" << std::endl;
+    for (y=0; y<(*dimensions_)[1]; y++) 
+    {
+        os << "y " << y << "\t";
+        for (x=0; x<(*dimensions_)[0]; x++)
+        {
+            os << (*this)(x,y) << "\t";
+        }
+        os << std::endl;
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho3DArray.h b/toolboxes/core/cpu/ho3DArray.h
new file mode 100644
index 0000000..7109eb3
--- /dev/null
+++ b/toolboxes/core/cpu/ho3DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho3DArray : public hoNDArray<T>
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho3DArray();
+    ho3DArray(size_t sx, size_t sy, size_t sz);
+    ho3DArray(std::vector<size_t> *dimensions);
+    ho3DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho3DArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+    ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho3DArray();
+
+    ho3DArray(const ho3DArray<T>& a);
+    ho3DArray<T>& operator=(const ho3DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z);
+    const T& operator()(size_t x, size_t y, size_t z) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser();
+    bool release_accesser();
+
+    T*** accesser_;
+};
+
+}
+
+#include <ho3DArray.hxx>
diff --git a/toolboxes/core/cpu/ho3DArray.hxx b/toolboxes/core/cpu/ho3DArray.hxx
new file mode 100644
index 0000000..ebac434
--- /dev/null
+++ b/toolboxes/core/cpu/ho3DArray.hxx
@@ -0,0 +1,287 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho3DArray<T>::ho3DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(size_t sx, size_t sy, size_t sz)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(3);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+
+    this->create(&dim);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==3);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>::~ho3DArray()
+{
+    release_accesser(); // never throw from a destructor; release_accesser() logs its own failure and returns false
+}
+
+template <typename T> 
+ho3DArray<T>::ho3DArray(const ho3DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho3DArray<T>& ho3DArray<T>::operator=(const ho3DArray& rhs)
+{
+    if ( &rhs == this ) return *this;
+
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) 
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+
+        GADGET_CHECK_THROW(init_accesser());
+    }
+
+    return *this;
+}
+
+template <typename T> 
+void ho3DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho3DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+void ho3DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+bool ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz)
+{
+    try // any exception below is converted into a false return value
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+
+        if ( !this->dimensions_equal(&dim) )
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_); // same shape: keep the buffer and zero it
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz) ..."); // log only, as ho2DArray::createArray does; do not rethrow
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct)
+{
+    try // any exception below is converted into a false return value
+    {
+        std::vector<size_t> dim(3);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("ho3DArray<T>::createArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct) ..."); // log only, as ho2DArray::createArray does; do not rethrow
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho3DArray<T>::operator()(size_t x , size_t y, size_t z)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2]);
+    return accesser_[z][y][x];
+}
+
+template <typename T> 
+inline const T& ho3DArray<T>::operator()(size_t x , size_t y, size_t z) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2]); // same bounds check as the non-const overload
+    return accesser_[z][y][x]; // lookup table is indexed [z][y][x]
+}
+
+template <typename T> 
+bool ho3DArray<T>::init_accesser()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser()); // drop any previous lookup tables first
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+
+            size_t y, z;
+
+            accesser_ = new T**[sz];
+            if( accesser_ == NULL) return false;
+            accesser_[0] = NULL; // keeps release_accesser() safe if the allocation below throws
+            accesser_[0] = new T*[sy*sz];
+            if( accesser_[0] == NULL)
+            {
+                delete [] accesser_; accesser_ = NULL; // do not leave a dangling pointer for release_accesser()
+                return false;
+            }
+            for (z = 1; z < sz; z++)
+            {
+                accesser_[z] = accesser_[z-1] + sy; // each z slice owns sy consecutive row pointers
+            }
+
+            accesser_[0][0] = data_;
+
+            for (z=0; z<sz; z++)
+            {
+                for (y=0; y<sy; y++)
+                {
+                    accesser_[z][y] = accesser_[0][0] + (z*sy+y)*sx; // start of row (y,z) inside data_
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho3DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho3DArray<T>::release_accesser()
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0];
+            delete [] accesser_;
+        }
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho3DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void ho3DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z;
+    os << "-------------------------------------------" << std::endl;
+    for (z=0; z<(*dimensions_)[2]; z++) 
+    {
+        os << "Array3D (:, :, " << z << ") = " << std::endl;
+        for (y=0; y<(*dimensions_)[1]; y++) 
+        {
+            os << "y " << y << "\t";
+            for (x=0; x<(*dimensions_)[0]; x++)
+            {
+                os << (*this)(x,y,z) << "\t";
+            }
+            os << std::endl;
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho4DArray.h b/toolboxes/core/cpu/ho4DArray.h
new file mode 100644
index 0000000..28c1225
--- /dev/null
+++ b/toolboxes/core/cpu/ho4DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho4DArray : public hoNDArray<T> // hoNDArray with a four-index (x, y, z, s) element accessor
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho4DArray(); // empty array, no accessor tables
+    ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss); // sizes along x, y, z, s
+    ho4DArray(std::vector<size_t> *dimensions); // dimensions is checked to hold exactly 4 entries
+    ho4DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false); // data / delete_data_on_destruct are forwarded to hoNDArray
+    ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct = false);
+    ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho4DArray(); // releases the accessor tables
+
+    ho4DArray(const ho4DArray<T>& a); // copies through hoNDArray's copy constructor, then builds its own tables
+    ho4DArray<T>& operator=(const ho4DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions); // each create() forwards to hoNDArray::create() and rebuilds the tables
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss); // bool-returning variants of create()
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s); // element (x, y, z, s); implemented as accesser_[s][z][y][x]
+    const T& operator()(size_t x , size_t y, size_t z, size_t s) const;
+
+    virtual void print(std::ostream& os) const; // textual dump, one table per (z, s) slice
+
+protected:
+
+    using BaseClass::dimensions_; // re-exported so the template members can name them unqualified
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser(); // (re)builds accesser_ from dimensions_ and data_; false on failure
+    bool release_accesser(); // frees accesser_ and resets it to NULL; false on failure
+
+    T**** accesser_; // nested pointer tables: accesser_[s][z][y] points at a row of data_; NULL when the array is empty
+};
+
+}
+
+#include <ho4DArray.hxx>
diff --git a/toolboxes/core/cpu/ho4DArray.hxx b/toolboxes/core/cpu/ho4DArray.hxx
new file mode 100644
index 0000000..de1e666
--- /dev/null
+++ b/toolboxes/core/cpu/ho4DArray.hxx
@@ -0,0 +1,313 @@
+
+namespace Gadgetron{
+
+template <typename T> // Default constructor: default-constructed base, no accessor tables.
+ho4DArray<T>::ho4DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> // Size constructor: builds an sx * sy * sz * ss array through create().
+ho4DArray<T>::ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(4);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+
+    this->create(&dim); // ho4DArray::create(std::vector<size_t>*) already calls init_accesser()
+    GADGET_CHECK_THROW(init_accesser()); // NOTE(review): this second rebuild of the same tables looks redundant — confirm
+}
+
+template <typename T> // From a dimension vector; GADGET_CHECK_THROW presumably throws when its condition is false (macro not visible here).
+ho4DArray<T>::ho4DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4); // checked only after the base class has been constructed from dimensions
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a dimension vector plus a buffer; data and the ownership flag go straight to hoNDArray.
+ho4DArray<T>::ho4DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From four sizes plus a buffer; everything is forwarded to the matching hoNDArray constructor.
+ho4DArray<T>::ho4DArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a shared dimension vector.
+ho4DArray<T>::ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a shared dimension vector plus a buffer.
+ho4DArray<T>::ho4DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==4);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Destructor: frees the accessor tables; the element buffer is left to the base class.
+ho4DArray<T>::~ho4DArray()
+{
+    GADGET_CHECK_THROW(release_accesser()); // NOTE(review): if this macro throws, it would throw out of a destructor — confirm that is acceptable
+}
+
+template <typename T> // Copy constructor: the base class copies the array, then fresh tables are built for this object.
+ho4DArray<T>::ho4DArray(const ho4DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Copy assignment: copies rhs's elements, reallocating when the dimensions differ.
+ho4DArray<T>& ho4DArray<T>::operator=(const ho4DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this; // self-assignment: nothing to do
+
+    if ( rhs.get_number_of_elements() == 0 ) // empty source: clear this array as well
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) // same shape: the existing buffer and tables are reused
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_; // NOTE(review): assigns the dimensions_ handle itself — confirm it does not end up shared with rhs
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) ); // raw byte copy — assumes T is safe to memcpy; TODO confirm
+
+        GADGET_CHECK_THROW(init_accesser()); // the buffer was reallocated above, so the row pointers are rebuilt
+    }
+
+    return *this;
+}
+
+template <typename T> // create() from a dimension vector passed by reference.
+void ho4DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser()); // tables are rebuilt after every base-class create()
+}
+
+template <typename T> // create() from a pointer to a dimension vector.
+void ho4DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // create() on top of a buffer; data and delete_data_on_destruct are forwarded to hoNDArray::create().
+void ho4DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Gives the array the shape sx * sy * sz * ss; returns true on success.
+bool ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss)
+{
+    try
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+
+        if ( !this->dimensions_equal(&dim) ) // shape differs: create() again
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser()); // NOTE(review): create() above already rebuilt the tables — confirm this repeat is intended
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_); // same shape: keep the buffer and zero its bytes
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss) ..."); // NOTE(review): presumably throws, making the return below unreachable — confirm
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Gives the array the shape sx * sy * sz * ss on top of a caller-supplied buffer; returns true on success.
+bool ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(4);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+
+        this->create(&dim, data, delete_data_on_destruct); // hands the buffer and the ownership flag to create()
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho4DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, T* data, bool delete_data_on_destruct) ..."); // NOTE(review): presumably throws, making the return below unreachable — confirm
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Mutable element access by (x, y, z, s) through the pointer tables.
+inline T& ho4DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3]); // the only bounds check; presumably active in debug builds only — confirm
+    return accesser_[s][z][y][x]; // tables are indexed slowest dimension first
+}
+
+template <typename T> // Read-only counterpart of the accessor above.
+inline const T& ho4DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3]);
+    return accesser_[s][z][y][x];
+}
+
+template <typename T> 
+bool ho4DArray<T>::init_accesser()
+{
+    // Rebuilds the pointer tables so that accesser_[s][z][y] is the address of row (y, z, s) inside data_.
+    // Returns false if the old tables cannot be released or an allocation throws (partial tables are freed first).
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+
+            size_t y, z, s;
+
+            accesser_ = new T***[ss];
+            accesser_[0] = NULL; // marks the next level as "not allocated yet" for the catch block
+
+            accesser_[0] = new T**[sz*ss];
+            accesser_[0][0] = NULL; // same marker one level down
+            for (s=1; s<ss; s++)
+            {
+                accesser_[s] = accesser_[s-1] + sz;
+            }
+
+            accesser_[0][0] = new T*[sy*sz*ss];
+
+            for (s=0; s<ss; s++)
+            {
+                for (z=0; z<sz; z++)
+                {
+                    accesser_[s][z] = accesser_[0][0] + s*sz*sy + z*sy;
+                }
+            }
+
+            accesser_[0][0][0] = data_;
+            for (s=0; s<ss; s++)
+            {
+                for (z=0; z<sz; z++)
+                {
+                    for (y=0; y<sy; y++)
+                    {
+                        accesser_[s][z][y] = accesser_[0][0][0] + s*sz*sy*sx + z*sy*sx + y*sx;
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        // Plain new reports failure by throwing, so this is where a failed allocation ends up.
+        // Free whatever was built so that release_accesser() never sees a half-initialised table.
+        T*** level1 = (accesser_ != NULL) ? accesser_[0] : NULL;
+        T**  level2 = (level1 != NULL) ? level1[0] : NULL;
+        delete [] level2;
+        delete [] level1;
+        delete [] accesser_;
+        accesser_ = NULL;
+        GADGET_ERROR_MSG("Errors in ho4DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Frees the three pointer tables built by init_accesser(); data_ itself is not touched here.
+bool ho4DArray<T>::release_accesser() // Returns true on success, false if anything was thrown while freeing.
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0]; // innermost block: the row pointers into data_
+            delete [] accesser_[0]; // middle block: one entry per (s, z)
+            delete [] accesser_; // outer table: one entry per s
+        }
+        accesser_ = NULL; // leaves the object in the "no tables" state
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho4DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Writes the base-class summary to os, then every (z, s) slice as a tab-separated table.
+void ho4DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s;
+    os << "-------------------------------------------" << std::endl;
+    for (s=0; s<(*dimensions_)[3]; s++) 
+    {
+        for (z=0; z<(*dimensions_)[2]; z++) // one table per (z, s) pair
+        {
+            os << "ho4DArray (:, :, " << z << ", " << s << ") = " << std::endl;
+            for (y=0; y<(*dimensions_)[1]; y++) // one output line per y
+            {
+                os << "y " << y << "\t"; // row label
+                for (x=0; x<(*dimensions_)[0]; x++)
+                {
+                    os << (*this)(x,y,z,s) << "\t";
+                }
+                os << std::endl;
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho5DArray.h b/toolboxes/core/cpu/ho5DArray.h
new file mode 100644
index 0000000..860a687
--- /dev/null
+++ b/toolboxes/core/cpu/ho5DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho5DArray : public hoNDArray<T> // hoNDArray with a five-index (x, y, z, s, p) element accessor
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho5DArray(); // empty array, no accessor tables
+    ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp); // sizes along x, y, z, s, p
+    ho5DArray(std::vector<size_t> *dimensions); // dimensions is checked to hold exactly 5 entries
+    ho5DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false); // data / delete_data_on_destruct are forwarded to hoNDArray
+    ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct = false);
+    ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho5DArray(); // releases the accessor tables
+
+    ho5DArray(const ho5DArray<T>& a); // copies through hoNDArray's copy constructor, then builds its own tables
+    ho5DArray<T>& operator=(const ho5DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions); // each create() forwards to hoNDArray::create() and rebuilds the tables
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp); // bool-returning variants of create()
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s, size_t p); // element (x, y, z, s, p); implemented as accesser_[p][s][z][y][x]
+    const T& operator()(size_t x , size_t y, size_t z, size_t s, size_t p) const;
+
+    virtual void print(std::ostream& os) const; // textual dump, one table per (z, s, p) slice
+
+protected:
+
+    using BaseClass::dimensions_; // re-exported so the template members can name them unqualified
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser(); // (re)builds accesser_ from dimensions_ and data_; false on failure
+    bool release_accesser(); // frees accesser_ and resets it to NULL; false on failure
+
+    T***** accesser_; // nested pointer tables: accesser_[p][s][z][y] points at a row of data_; NULL when the array is empty
+};
+
+}
+
+#include <ho5DArray.hxx>
diff --git a/toolboxes/core/cpu/ho5DArray.hxx b/toolboxes/core/cpu/ho5DArray.hxx
new file mode 100644
index 0000000..dd9cf54
--- /dev/null
+++ b/toolboxes/core/cpu/ho5DArray.hxx
@@ -0,0 +1,345 @@
+
+namespace Gadgetron{
+
+template <typename T> // Default constructor: default-constructed base, no accessor tables.
+ho5DArray<T>::ho5DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> // Size constructor: builds an sx * sy * sz * ss * sp array through create().
+ho5DArray<T>::ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(5);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+    dim[4] = sp;
+
+    this->create(&dim); // ho5DArray::create(std::vector<size_t>*) already calls init_accesser()
+    GADGET_CHECK_THROW(init_accesser()); // NOTE(review): this second rebuild of the same tables looks redundant — confirm
+}
+
+template <typename T> // From a dimension vector; GADGET_CHECK_THROW presumably throws when its condition is false (macro not visible here).
+ho5DArray<T>::ho5DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5); // checked only after the base class has been constructed from dimensions
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a dimension vector plus a buffer; data and the ownership flag go straight to hoNDArray.
+ho5DArray<T>::ho5DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From five sizes plus a buffer; everything is forwarded to the matching hoNDArray constructor.
+ho5DArray<T>::ho5DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, sp, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a shared dimension vector.
+ho5DArray<T>::ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a shared dimension vector plus a buffer.
+ho5DArray<T>::ho5DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==5);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Destructor: frees the accessor tables; the element buffer is left to the base class.
+ho5DArray<T>::~ho5DArray()
+{
+    GADGET_CHECK_THROW(release_accesser()); // NOTE(review): if this macro throws, it would throw out of a destructor — confirm that is acceptable
+}
+
+template <typename T> // Copy constructor: the base class copies the array, then fresh tables are built for this object.
+ho5DArray<T>::ho5DArray(const ho5DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Copy assignment: copies rhs's elements, reallocating when the dimensions differ.
+ho5DArray<T>& ho5DArray<T>::operator=(const ho5DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this; // self-assignment: nothing to do
+
+    if ( rhs.get_number_of_elements() == 0 ) // empty source: clear this array as well
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) // same shape: the existing buffer and tables are reused
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_; // NOTE(review): assigns the dimensions_ handle itself — confirm it does not end up shared with rhs
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) ); // raw byte copy — assumes T is safe to memcpy; TODO confirm
+
+        GADGET_CHECK_THROW(init_accesser()); // the buffer was reallocated above, so the row pointers are rebuilt
+    }
+
+    return *this;
+}
+
+template <typename T> // create() from a dimension vector passed by reference.
+void ho5DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser()); // tables are rebuilt after every base-class create()
+}
+
+template <typename T> // create() from a pointer to a dimension vector.
+void ho5DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // create() on top of a buffer; data and delete_data_on_destruct are forwarded to hoNDArray::create().
+void ho5DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Gives the array the shape sx * sy * sz * ss * sp; returns true on success.
+bool ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp)
+{
+    try
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+
+        if ( !this->dimensions_equal(&dim) ) // shape differs: create() again
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser()); // NOTE(review): create() above already rebuilt the tables — confirm this repeat is intended
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_); // same shape: keep the buffer and zero its bytes
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp) ..."); // NOTE(review): presumably throws, making the return below unreachable — confirm
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Gives the array the shape sx * sy * sz * ss * sp on top of a caller-supplied buffer; returns true on success.
+bool ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(5);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+
+        this->create(&dim, data, delete_data_on_destruct); // data and the ownership flag must reach create(); create(&dim) alone would ignore both
+        GADGET_CHECK_RETURN_FALSE(init_accesser()); // row tables are rebuilt over the current data_
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho5DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, T* data, bool delete_data_on_destruct) ..."); // NOTE(review): presumably throws, making the return below unreachable — confirm
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Mutable element access by (x, y, z, s, p) through the pointer tables.
+inline T& ho5DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4]); // the only bounds check; presumably active in debug builds only — confirm
+    return accesser_[p][s][z][y][x]; // tables are indexed slowest dimension first
+}
+
+template <typename T> // Read-only counterpart of the accessor above.
+inline const T& ho5DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4]);
+    return accesser_[p][s][z][y][x];
+}
+
+template <typename T> 
+bool ho5DArray<T>::init_accesser()
+{
+    // Rebuilds the pointer tables so that accesser_[p][s][z][y] is the address of row (y, z, s, p) inside data_.
+    // Returns false if the old tables cannot be released or an allocation throws (partial tables are freed first).
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+            size_t sp = (*dimensions_)[4];
+
+            size_t y, z, s, p;
+
+            // Outer table: one entry per p.
+            accesser_ = new T****[sp];
+            accesser_[0] = NULL; // marks the next level as "not allocated yet" for the catch block
+
+            // Second level: one entry per (p, s), stored as a single block.
+            accesser_[0] = new T***[ss*sp];
+            accesser_[0][0] = NULL;
+
+            for (p=1; p<sp; p++)
+            {
+                accesser_[p] = accesser_[p-1] + ss;
+            }
+
+            // Third level: one entry per (p, s, z).
+            accesser_[0][0] = new T**[sz*ss*sp];
+            accesser_[0][0][0] = NULL;
+
+            for (p=0; p<sp; p++)
+            {
+                for (s=0; s<ss; s++)
+                {
+                    accesser_[p][s] = accesser_[0][0] + p*ss*sz + s*sz;
+                }
+            }
+
+            // Innermost level: one row pointer per (p, s, z, y).
+            accesser_[0][0][0] = new T*[sy*sz*ss*sp];
+
+            for (p=0; p<sp; p++)
+            {
+                for (s=0; s<ss; s++)
+                {
+                    for (z=0; z<sz; z++)
+                    {
+                        accesser_[p][s][z] = accesser_[0][0][0] + p*ss*sz*sy + s*sz*sy + z*sy;
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0] = data_;
+            for (p=0; p<sp; p++)
+            {
+                for (s=0; s<ss; s++)
+                {
+                    for (z=0; z<sz; z++)
+                    {
+                        for (y=0; y<sy; y++)
+                        {
+                            accesser_[p][s][z][y] = accesser_[0][0][0][0] + p*ss*sz*sy*sx + s*sz*sy*sx + z*sy*sx+y*sx;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        // Plain new reports failure by throwing, so this is where a failed allocation ends up.
+        // Free whatever was built so that release_accesser() never sees a half-initialised table.
+        T**** level1 = (accesser_ != NULL) ? accesser_[0] : NULL;
+        T***  level2 = (level1 != NULL) ? level1[0] : NULL;
+        T**   level3 = (level2 != NULL) ? level2[0] : NULL;
+        delete [] level3;
+        delete [] level2;
+        delete [] level1;
+        delete [] accesser_;
+        accesser_ = NULL;
+        GADGET_ERROR_MSG("Errors in ho5DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Frees the four pointer tables built by init_accesser(); data_ itself is not touched here.
+bool ho5DArray<T>::release_accesser() // Returns true on success, false if anything was thrown while freeing.
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0][0]; // innermost block: the row pointers into data_
+            delete [] accesser_[0][0]; // one entry per (p, s, z)
+            delete [] accesser_[0]; // one entry per (p, s)
+            delete [] accesser_; // outer table: one entry per p
+        }
+        accesser_ = NULL; // leaves the object in the "no tables" state
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho5DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Writes the base-class summary to os, then every (z, s, p) slice as a tab-separated table.
+void ho5DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s, p;
+    os << "-------------------------------------------" << std::endl;
+    for (p=0; p<(*dimensions_)[4]; p++) 
+    {
+        for (s=0; s<(*dimensions_)[3]; s++) 
+        {
+            for (z=0; z<(*dimensions_)[2]; z++) // one table per (z, s, p) triple
+            {
+                os << "ho5DArray (:, :, " << z << ", " << s << ", " << p << ") = " << std::endl;
+                for (y=0; y<(*dimensions_)[1]; y++) // one output line per y
+                {
+                    os << "y " << y << "\t"; // row label
+                    for (x=0; x<(*dimensions_)[0]; x++)
+                    {
+                        os << (*this)(x,y,z,s,p) << "\t";
+                    }
+                    os << std::endl;
+                }
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho6DArray.h b/toolboxes/core/cpu/ho6DArray.h
new file mode 100644
index 0000000..6fc6a90
--- /dev/null
+++ b/toolboxes/core/cpu/ho6DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho6DArray : public hoNDArray<T> // hoNDArray with a six-index (x, y, z, s, p, r) element accessor
+{
+public:
+
+    typedef hoNDArray<T> BaseClass;
+
+    ho6DArray(); // empty array, no accessor tables
+    ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr); // sizes along x, y, z, s, p, r
+    ho6DArray(std::vector<size_t> *dimensions); // dimensions is checked to hold exactly 6 entries
+    ho6DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false); // data / delete_data_on_destruct are forwarded to hoNDArray
+    ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct = false);
+    ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~ho6DArray(); // releases the accessor tables
+
+    ho6DArray(const ho6DArray<T>& a); // copies through hoNDArray's copy constructor, then builds its own tables
+    ho6DArray<T>& operator=(const ho6DArray<T>& rhs);
+
+    virtual void create(std::vector<size_t>& dimensions); // each create() forwards to hoNDArray::create() and rebuilds the tables
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr); // bool-returning variants of create()
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct = false);
+
+    T& operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r); // element (x, y, z, s, p, r); implemented as accesser_[r][p][s][z][y][x]
+    const T& operator()(size_t x , size_t y, size_t z, size_t s, size_t p, size_t r) const;
+
+    virtual void print(std::ostream& os) const; // textual dump, one table per (z, s, p, r) slice
+
+protected:
+
+    using BaseClass::dimensions_; // re-exported so the template members can name them unqualified
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    bool init_accesser(); // (re)builds accesser_ from dimensions_ and data_; false on failure
+    bool release_accesser(); // frees accesser_ and resets it to NULL; false on failure
+
+    T****** accesser_; // nested pointer tables: accesser_[r][p][s][z][y] points at a row of data_; NULL when the array is empty
+};
+
+}
+
+#include <ho6DArray.hxx>
diff --git a/toolboxes/core/cpu/ho6DArray.hxx b/toolboxes/core/cpu/ho6DArray.hxx
new file mode 100644
index 0000000..61fe9da
--- /dev/null
+++ b/toolboxes/core/cpu/ho6DArray.hxx
@@ -0,0 +1,392 @@
+
+namespace Gadgetron{
+
+template <typename T> // Default constructor: default-constructed base, no accessor tables.
+ho6DArray<T>::ho6DArray()
+: BaseClass(), accesser_(NULL)
+{
+}
+
+template <typename T> // Size constructor: builds an sx * sy * sz * ss * sp * sr array through create().
+ho6DArray<T>::ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr)
+: accesser_(NULL)
+{
+    std::vector<size_t> dim(6);
+    dim[0] = sx;
+    dim[1] = sy;
+    dim[2] = sz;
+    dim[3] = ss;
+    dim[4] = sp;
+    dim[5] = sr;
+
+    this->create(&dim); // ho6DArray::create(std::vector<size_t>*) already calls init_accesser()
+    GADGET_CHECK_THROW(init_accesser()); // NOTE(review): this second rebuild of the same tables looks redundant — confirm
+}
+
+template <typename T> // From a dimension vector; GADGET_CHECK_THROW presumably throws when its condition is false (macro not visible here).
+ho6DArray<T>::ho6DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6); // checked only after the base class has been constructed from dimensions
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a dimension vector plus a buffer; data and the ownership flag go straight to hoNDArray.
+ho6DArray<T>::ho6DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From six sizes plus a buffer; everything is forwarded to the matching hoNDArray constructor.
+ho6DArray<T>::ho6DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, sp, sr, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions_->size()==6); // checks the dimensions the base class ended up with
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a shared dimension vector.
+ho6DArray<T>::ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // From a shared dimension vector plus a buffer.
+ho6DArray<T>::ho6DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==6);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Destructor: frees the accessor tables; the element buffer is left to the base class.
+ho6DArray<T>::~ho6DArray()
+{
+    GADGET_CHECK_THROW(release_accesser()); // NOTE(review): if this macro throws, it would throw out of a destructor — confirm that is acceptable
+}
+
+template <typename T> // Copy constructor: the base class copies the array, then fresh tables are built for this object.
+ho6DArray<T>::ho6DArray(const ho6DArray<T>& a)
+: BaseClass(a), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Copy assignment: copies rhs's elements, reallocating when the dimensions differ.
+ho6DArray<T>& ho6DArray<T>::operator=(const ho6DArray<T>& rhs)
+{
+    if ( &rhs == this ) return *this; // self-assignment: nothing to do
+
+    if ( rhs.get_number_of_elements() == 0 ) // empty source: clear this array as well
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if (this->dimensions_equal(&rhs)) // same shape: the existing buffer and tables are reused
+    {
+        memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    }
+    else
+    {
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_; // NOTE(review): assigns the dimensions_ handle itself — confirm it does not end up shared with rhs
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) ); // raw byte copy — assumes T is safe to memcpy; TODO confirm
+
+        GADGET_CHECK_THROW(init_accesser()); // the buffer was reallocated above, so the row pointers are rebuilt
+    }
+
+    return *this;
+}
+
+template <typename T> // create() from a dimension vector passed by reference.
+void ho6DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser()); // tables are rebuilt after every base-class create()
+}
+
+template <typename T> // create() from a pointer to a dimension vector.
+void ho6DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // create() on top of a buffer; data and delete_data_on_destruct are forwarded to hoNDArray::create().
+void ho6DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> // Gives the array the shape sx * sy * sz * ss * sp * sr; returns true on success.
+bool ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr)
+{
+    try
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+
+        if ( !this->dimensions_equal(&dim) ) // shape differs: create() again
+        {
+            this->create(&dim);
+            GADGET_CHECK_RETURN_FALSE(init_accesser()); // NOTE(review): create() above already rebuilt the tables — confirm this repeat is intended
+        }
+        else
+        {
+            memset(this->data_, 0, sizeof(T)*this->elements_); // same shape: keep the buffer and zero its bytes
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr) ..."); // NOTE(review): presumably throws, making the return below unreachable — confirm
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Gives the array the shape sx * sy * sz * ss * sp * sr on top of a caller-supplied buffer; returns true on success.
+bool ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(6);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+
+        this->create(&dim, data, delete_data_on_destruct); // data and the ownership flag must reach create(); create(&dim) alone would ignore both
+        GADGET_CHECK_RETURN_FALSE(init_accesser()); // row tables are rebuilt over the current data_
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho6DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, T* data, bool delete_data_on_destruct) ..."); // NOTE(review): presumably throws, making the return below unreachable — confirm
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Mutable element access by (x, y, z, s, p, r) through the pointer tables.
+inline T& ho6DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5]); // the only bounds check; presumably active in debug builds only — confirm
+    return accesser_[r][p][s][z][y][x]; // tables are indexed slowest dimension first
+}
+
+template <typename T> // Read-only counterpart of the accessor above.
+inline const T& ho6DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5]);
+    return accesser_[r][p][s][z][y][x];
+}
+
+template <typename T> 
+bool ho6DArray<T>::init_accesser()
+{
+    // Rebuilds the pointer tables so that accesser_[r][p][s][z][y] is the address of
+    // row (y, z, s, p, r) inside data_. Each level is one contiguous block.
+    // Returns false if the old tables cannot be released or an allocation throws;
+    // in the latter case every partially built table is freed and accesser_ is NULL.
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+            size_t sp = (*dimensions_)[4];
+            size_t sr = (*dimensions_)[5];
+
+            size_t y, z, s, p, r;
+
+            // Outer table: one entry per r.
+            accesser_ = new T*****[sr];
+            accesser_[0] = NULL; // marks the next level as "not allocated yet" for the catch block
+
+            // Second level: one entry per (r, p).
+            accesser_[0] = new T****[sp*sr];
+            accesser_[0][0] = NULL;
+            for (r=1; r<sr; r++)
+            {
+                accesser_[r] = accesser_[r-1] + sp;
+            }
+
+            // Third level: one entry per (r, p, s).
+            accesser_[0][0] = new T***[ss*sp*sr];
+            accesser_[0][0][0] = NULL;
+
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    accesser_[r][p] = accesser_[0][0] + r*sp*ss + p*ss;
+                }
+            }
+
+            // Fourth level: one entry per (r, p, s, z).
+            accesser_[0][0][0] = new T**[sz*ss*sp*sr];
+            accesser_[0][0][0][0] = NULL;
+
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    for (s=0; s<ss; s++)
+                    {
+                        accesser_[r][p][s] = accesser_[0][0][0] 
+                                                + r*sp*ss*sz 
+                                                + p*ss*sz 
+                                                + s*sz;
+                    }
+                }
+            }
+
+            // Innermost level: one row pointer per (r, p, s, z, y).
+            accesser_[0][0][0][0] = new T*[sy*sz*ss*sp*sr];
+
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    for (s=0; s<ss; s++)
+                    {
+                        for (z=0; z<sz; z++)
+                        {
+                            accesser_[r][p][s][z] = accesser_[0][0][0][0] 
+                                                        + r*sp*ss*sz*sy 
+                                                        + p*ss*sz*sy 
+                                                        + s*sz*sy 
+                                                        + z*sy;
+                        }
+                    }
+                }
+            }
+
+            // Point every row entry at the start of its row in data_.
+            accesser_[0][0][0][0][0] = data_;
+            for (r=0; r<sr; r++)
+            {
+                for (p=0; p<sp; p++)
+                {
+                    for (s=0; s<ss; s++)
+                    {
+                        for (z=0; z<sz; z++)
+                        {
+                            for (y=0; y<sy; y++)
+                            {
+                                accesser_[r][p][s][z][y] = accesser_[0][0][0][0][0] 
+                                                                + r*sp*ss*sz*sy*sx 
+                                                                + p*ss*sz*sy*sx 
+                                                                + s*sz*sy*sx 
+                                                                + z*sy*sx 
+                                                                + y*sx;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        // Plain new reports failure by throwing, so this is where a failed allocation ends up.
+        // Free whatever was built so that release_accesser() never sees a half-initialised table.
+        // A level that was never allocated reads back as NULL, and delete [] NULL is a no-op.
+        T***** level1 = (accesser_ != NULL) ? accesser_[0] : NULL;
+        T****  level2 = (level1 != NULL) ? level1[0] : NULL;
+        T***   level3 = (level2 != NULL) ? level2[0] : NULL;
+        T**    level4 = (level3 != NULL) ? level3[0] : NULL;
+        delete [] level4;
+        delete [] level3;
+        delete [] level2;
+        delete [] level1;
+        delete [] accesser_;
+        accesser_ = NULL;
+        GADGET_ERROR_MSG("Errors in ho6DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Frees the five pointer tables built by init_accesser(); data_ itself is not touched here.
+bool ho6DArray<T>::release_accesser() // Returns true on success, false if anything was thrown while freeing.
+{
+    try
+    {
+        if (accesser_ != NULL)
+        {
+            delete [] accesser_[0][0][0][0]; // innermost block: the row pointers into data_
+            delete [] accesser_[0][0][0]; // one entry per (r, p, s, z)
+            delete [] accesser_[0][0]; // one entry per (r, p, s)
+            delete [] accesser_[0]; // one entry per (r, p)
+            delete [] accesser_; // outer table: one entry per r
+        }
+        accesser_ = NULL; // leaves the object in the "no tables" state
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho6DArray<T>::release_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Writes the base-class summary to os, then every (z, s, p, r) slice as a tab-separated table.
+void ho6DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    size_t x, y, z, s, p, r;
+    os << "-------------------------------------------" << std::endl;
+    for (r=0; r<(*dimensions_)[5]; r++) 
+    {
+        for (p=0; p<(*dimensions_)[4]; p++) 
+        {
+            for (s=0; s<(*dimensions_)[3]; s++) 
+            {
+                for (z=0; z<(*dimensions_)[2]; z++) // one table per (z, s, p, r) combination
+                {
+                    os << "ho6DArray (:, :, " << z << ", " << s << ", " << p << ", " << r << ") = " << std::endl;
+                    for (y=0; y<(*dimensions_)[1]; y++) // one output line per y
+                    {
+                        os << "y " << y << "\t"; // row label
+                        for (x=0; x<(*dimensions_)[0]; x++)
+                        {
+                            os << (*this)(x,y,z,s,p,r) << "\t";
+                        }
+                        os << std::endl;
+                    }
+                }
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/ho7DArray.h b/toolboxes/core/cpu/ho7DArray.h
new file mode 100644
index 0000000..1e41b99
--- /dev/null
+++ b/toolboxes/core/cpu/ho7DArray.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "hoNDArray.h"
+
+namespace Gadgetron{
+
+template <class T> class ho7DArray : public hoNDArray<T>
+{
+public:
+    // Seven-dimensional hoNDArray that keeps nested pointer tables (accesser_) for direct (x,y,z,s,p,r,a) indexing.
+    typedef hoNDArray<T> BaseClass;
+    // Constructors: every non-default overload checks init_accesser() with GADGET_CHECK_THROW after setting up the base array.
+    ho7DArray();
+    ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa);
+    ho7DArray(std::vector<size_t> *dimensions);
+    ho7DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct = false);
+    ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+    ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+    // The destructor releases the pointer tables through release_accesser().
+    virtual ~ho7DArray();
+    // The copy constructor builds fresh tables; assignment rebuilds them when the shape changes or rhs is empty.
+    ho7DArray(const ho7DArray<T>& a);
+    ho7DArray<T>& operator=(const ho7DArray<T>& rhs);
+    // create() overloads forward to hoNDArray<T>::create and then rebuild the pointer tables.
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    // createArray() takes the seven extents individually (x first, a last) and returns a bool status.
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa);
+    virtual bool createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct = false);
+    // Element access through the pointer tables; x is the innermost (contiguous) index, a the outermost.
+    T& operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a);
+    const T& operator()(size_t x , size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const;
+    // Writes the base-class output followed by every x-y slice.
+    virtual void print(std::ostream& os) const;
+
+protected:
+    // Base-class members used directly by the implementation.
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+    // Build / free the nested pointer tables that back operator(); both return false on failure.
+    bool init_accesser();
+    bool release_accesser();
+    // accesser_[a][r][p][s][z][y] points at the first x element of that line in data_; NULL while the array has no elements.
+    T******* accesser_;
+};
+
+}
+
+#include <ho7DArray.hxx>
diff --git a/toolboxes/core/cpu/ho7DArray.hxx b/toolboxes/core/cpu/ho7DArray.hxx
new file mode 100644
index 0000000..4c77b2e
--- /dev/null
+++ b/toolboxes/core/cpu/ho7DArray.hxx
@@ -0,0 +1,427 @@
+
+namespace Gadgetron{
+
+template <typename T> 
+ho7DArray<T>::ho7DArray()
+: BaseClass(), accesser_(NULL) // default-constructed base; no pointer tables are built here
+{
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa)
+: accesser_(NULL)
+{
+    // Pack the extents in index order: position 0 is x, position 6 is a.
+    std::vector<size_t> extents;
+    extents.push_back(sx);
+    extents.push_back(sy);
+    extents.push_back(sz);
+    extents.push_back(ss);
+    extents.push_back(sp);
+    extents.push_back(sr);
+    extents.push_back(sa);
+    this->create(&extents);
+    GADGET_CHECK_THROW(init_accesser());
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(std::vector<size_t> *dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7); // exactly seven extents are required
+    GADGET_CHECK_THROW(init_accesser()); // build the pointer tables used by operator()
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7); // exactly seven extents are required
+    GADGET_CHECK_THROW(init_accesser()); // build the pointer tables over data_
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions)
+: BaseClass(dimensions), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7); // exactly seven extents are required
+    GADGET_CHECK_THROW(init_accesser()); // build the pointer tables used by operator()
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+: BaseClass(dimensions, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions->size()==7); // exactly seven extents are required
+    GADGET_CHECK_THROW(init_accesser()); // build the pointer tables over data_
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct)
+: BaseClass(sx, sy, sz, ss, sp, sr, sa, data, delete_data_on_destruct), accesser_(NULL)
+{
+    GADGET_CHECK_THROW(dimensions_->size()==7); // verify the base class recorded seven extents
+    GADGET_CHECK_THROW(init_accesser()); // build the pointer tables over data_
+}
+
+template <typename T> 
+ho7DArray<T>::~ho7DArray()
+{
+    release_accesser(); // status ignored on purpose: a destructor must never raise, and release_accesser() already logs its own failures
+}
+
+template <typename T> 
+ho7DArray<T>::ho7DArray(const ho7DArray<T>& a)
+: BaseClass(a), accesser_(NULL) // a's pointer tables are not copied; this object starts without any
+{
+    GADGET_CHECK_THROW(init_accesser()); // build tables that point into this object's data_
+}
+
+template <typename T> 
+ho7DArray<T>& ho7DArray<T>::operator=(const ho7DArray<T>& rhs)
+{
+    if ( this == &rhs ) return *this;
+
+    // An empty source clears this array; the tables are then rebuilt for the cleared state.
+    if ( rhs.get_number_of_elements() == 0 )
+    {
+        this->clear();
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    if ( !this->dimensions_equal(&rhs) )
+    {
+        // Shape differs: reallocate, copy the elements, then rebuild the pointer tables.
+        this->deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_ = rhs.dimensions_;
+        this->allocate_memory();
+        memcpy( this->data_, rhs.data_, this->elements_*sizeof(T) );
+        GADGET_CHECK_THROW(init_accesser());
+        return *this;
+    }
+
+    // Same shape: the existing buffer and pointer tables are reused as they are.
+    memcpy(this->data_, rhs.data_, this->elements_*sizeof(T));
+    return *this;
+}
+
+template <typename T> 
+void ho7DArray<T>::create(std::vector<size_t>& dimensions)
+{
+    BaseClass::create(dimensions); // delegate to hoNDArray<T>::create
+    GADGET_CHECK_THROW(init_accesser()); // the old tables are released and rebuilt for the new state
+}
+
+template <typename T> 
+void ho7DArray<T>::create(std::vector<size_t> *dimensions)
+{
+    BaseClass::create(dimensions); // delegate to hoNDArray<T>::create
+    GADGET_CHECK_THROW(init_accesser()); // the old tables are released and rebuilt for the new state
+}
+
+template <typename T> 
+void ho7DArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct)
+{
+    BaseClass::create(dimensions, data, delete_data_on_destruct); // delegate to hoNDArray<T>::create, passing the buffer and flag through
+    GADGET_CHECK_THROW(init_accesser()); // the old tables are released and rebuilt over data_
+}
+
+template <typename T> 
+bool ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa)
+{
+    try
+    {
+        std::vector<size_t> extents;
+        extents.push_back(sx);
+        extents.push_back(sy);
+        extents.push_back(sz);
+        extents.push_back(ss);
+        extents.push_back(sp);
+        extents.push_back(sr);
+        extents.push_back(sa);
+        if ( this->dimensions_equal(&extents) )
+        {
+            // Shape already matches: keep the current buffer and only zero its bytes.
+            memset(this->data_, 0, sizeof(T)*this->elements_);
+        }
+        else
+        {
+            this->create(&extents);
+            GADGET_CHECK_RETURN_FALSE(init_accesser());
+        }
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct)
+{
+    try
+    {
+        std::vector<size_t> dim(7);
+        dim[0] = sx;
+        dim[1] = sy;
+        dim[2] = sz;
+        dim[3] = ss;
+        dim[4] = sp;
+        dim[5] = sr;
+        dim[6] = sa;
+        // Pass the caller's buffer and ownership flag on; calling create(&dim) alone ignored both arguments.
+        this->create(&dim, data, delete_data_on_destruct);
+        GADGET_CHECK_RETURN_FALSE(init_accesser());
+    }
+    catch(...)
+    {
+        GADGET_THROW("ho7DArray<T>::createArray(size_t sx, size_t sy, size_t sz, size_t ss, size_t sp, size_t sr, size_t sa, T* data, bool delete_data_on_destruct) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+inline T& ho7DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a)
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5] && a<(*dimensions_)[6]); // every index must be below its extent
+    return accesser_[a][r][p][s][z][y][x]; // tables are indexed outermost dimension (a) first, x last
+}
+
+template <typename T> 
+inline const T& ho7DArray<T>::operator()(size_t x, size_t y, size_t z, size_t s, size_t p, size_t r, size_t a) const
+{
+    GADGET_DEBUG_CHECK_THROW(x<(*dimensions_)[0] && y<(*dimensions_)[1] && z<(*dimensions_)[2] && s<(*dimensions_)[3] && p<(*dimensions_)[4] && r<(*dimensions_)[5] && a<(*dimensions_)[6]); // every index must be below its extent
+    return accesser_[a][r][p][s][z][y][x]; // tables are indexed outermost dimension (a) first, x last
+}
+
+// Builds the nested pointer tables behind operator(). One contiguous table is
+// allocated per level (a, r, p, s, z, y); entry [a][r][p][s][z][y] ends up
+// pointing at the first x element of that line inside data_. Previously built
+// tables are released first. Returns false, with accesser_ left NULL, when the
+// array does not have exactly seven dimensions or an allocation fails.
+template <typename T> 
+bool ho7DArray<T>::init_accesser()
+{
+    // The tables are owned by these locals until they are handed to accesser_,
+    // so a failed allocation can be undone without leaving half-built state.
+    T******* tabA = NULL; T****** tabR = NULL; T***** tabP = NULL;
+    T**** tabS = NULL; T*** tabZ = NULL; T** tabY = NULL;
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(release_accesser());
+
+        if ( elements_ > 0 )
+        {
+            // the extents are read by index below, so all seven must exist
+            GADGET_CHECK_RETURN_FALSE(dimensions_->size()==7);
+            size_t sx = (*dimensions_)[0];
+            size_t sy = (*dimensions_)[1];
+            size_t sz = (*dimensions_)[2];
+            size_t ss = (*dimensions_)[3];
+            size_t sp = (*dimensions_)[4];
+            size_t sr = (*dimensions_)[5];
+            size_t sa = (*dimensions_)[6];
+
+            size_t y, z, s, p, r, a;
+
+            // plain new[] reports failure by throwing, so no NULL checks are needed
+            tabA = new T******[sa];
+            tabR = new T*****[sr*sa];
+            tabP = new T****[sp*sr*sa];
+            tabS = new T***[ss*sp*sr*sa];
+            tabZ = new T**[sz*ss*sp*sr*sa];
+            tabY = new T*[sy*sz*ss*sp*sr*sa];
+
+            // from here on only pointers are wired together; nothing below can throw
+            accesser_ = tabA;
+            accesser_[0] = tabR;
+            for (a=1; a<sa; a++)
+            {
+                accesser_[a] = accesser_[a-1] + sr;
+            }
+
+            accesser_[0][0] = tabP;
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    accesser_[a][r] = accesser_[0][0] + a*sr*sp + r*sp;
+                }
+            }
+
+            accesser_[0][0][0] = tabS;
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        accesser_[a][r][p] = accesser_[0][0][0] 
+                                                + a*sr*sp*ss 
+                                                + r*sp*ss 
+                                                + p*ss;
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0] = tabZ;
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        for (s=0; s<ss; s++)
+                        {
+                            accesser_[a][r][p][s] = accesser_[0][0][0][0] 
+                                                        + a*sr*sp*ss*sz 
+                                                        + r*sp*ss*sz 
+                                                        + p*ss*sz 
+                                                        + s*sz;
+                        }
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0][0] = tabY;
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        for (s=0; s<ss; s++)
+                        {
+                            for (z=0; z<sz; z++)
+                            {
+                                accesser_[a][r][p][s][z] = accesser_[0][0][0][0][0] 
+                                                                + a*sr*sp*ss*sz*sy 
+                                                                + r*sp*ss*sz*sy 
+                                                                + p*ss*sz*sy 
+                                                                + s*sz*sy 
+                                                                + z*sy;
+                            }
+                        }
+                    }
+                }
+            }
+
+            accesser_[0][0][0][0][0][0] = data_;
+            for (a=0; a<sa; a++)
+            {
+                for (r=0; r<sr; r++)
+                {
+                    for (p=0; p<sp; p++)
+                    {
+                        for (s=0; s<ss; s++)
+                        {
+                            for (z=0; z<sz; z++)
+                            {
+                                for (y=0; y<sy; y++)
+                                {
+                                    accesser_[a][r][p][s][z][y] = accesser_[0][0][0][0][0][0] 
+                                                                    + a*sr*sp*ss*sz*sy*sx 
+                                                                    + r*sp*ss*sz*sy*sx 
+                                                                    + p*ss*sz*sy*sx 
+                                                                    + s*sz*sy*sx 
+                                                                    + z*sy*sx
+                                                                    + y*sx;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            accesser_ = NULL;
+        }
+    }
+    catch(...)
+    {
+        // Only the new[] calls can throw, i.e. before the tables were handed over;
+        // the guard makes sure tables already owned by accesser_ are never freed twice.
+        if ( accesser_ == NULL )
+        {
+            delete [] tabY;
+            delete [] tabZ;
+            delete [] tabS;
+            delete [] tabP;
+            delete [] tabR;
+            delete [] tabA;
+        }
+        GADGET_ERROR_MSG("Errors in ho7DArray<T>::init_accesser() ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool ho7DArray<T>::release_accesser()
+{
+    // No lookup tables to free; accesser_ is already NULL.
+    if (accesser_ == NULL) return true;
+    try
+    {
+        // Each level is a single array reachable through index 0; free the
+        // deepest level first so every pointer is still valid when it is read.
+        delete [] accesser_[0][0][0][0][0];
+        delete [] accesser_[0][0][0][0];
+        delete [] accesser_[0][0][0];
+        delete [] accesser_[0][0];
+        delete [] accesser_[0];
+        delete [] accesser_;
+        accesser_ = NULL;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in ho7DArray<T>::release_accesser() ...");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+void ho7DArray<T>::print(std::ostream& os) const
+{
+    BaseClass::print(os);
+    // After the base-class output, dump one x-y slice per (z, s, p, r, a) index.
+    os << "-------------------------------------------" << std::endl;
+    for (size_t a=0; a<(*dimensions_)[6]; ++a)
+    {
+        for (size_t r=0; r<(*dimensions_)[5]; ++r)
+        {
+            for (size_t p=0; p<(*dimensions_)[4]; ++p)
+            {
+                for (size_t s=0; s<(*dimensions_)[3]; ++s)
+                {
+                    for (size_t z=0; z<(*dimensions_)[2]; ++z)
+                    {
+                        os << "ho7DArray (:, :, " << z << ", " << s << ", " << p << ", " << r << ", " << a << ") = " << std::endl;
+                        for (size_t y=0; y<(*dimensions_)[1]; ++y)
+                        {
+                            os << "y " << y << "\t";
+                            for (size_t x=0; x<(*dimensions_)[0]; ++x)
+                            {
+                                os << (*this)(x,y,z,s,p,r,a) << "\t";
+                            }
+                            os << std::endl;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    os << "-------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/core/cpu/hoMatrix.cpp b/toolboxes/core/cpu/hoMatrix.cpp
new file mode 100644
index 0000000..b19ed8c
--- /dev/null
+++ b/toolboxes/core/cpu/hoMatrix.cpp
@@ -0,0 +1,1309 @@
+
+namespace Gadgetron
+{
+
+template <typename T> 
+hoMatrix<T>::hoMatrix() : BaseClass(1, 1) // default matrix is 1 x 1; no explicit fill, unlike the (rows, cols) constructor
+{
+}
+
+template <typename T> 
+hoMatrix<T>::hoMatrix(size_t rows, size_t cols) : BaseClass(rows, cols)
+{
+    this->fill(T(0)); // a freshly sized matrix starts with every element set to zero
+}
+
+template <typename T> 
+hoMatrix<T>::hoMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct)
+{
+    std::vector<size_t> dim(2); // dimension 0 holds the row count, dimension 1 the column count
+    dim[0] = rows;
+    dim[1] = cols;
+    this->create(&dim,data,delete_data_on_destruct); // data and the ownership flag are passed through to create()
+    GADGET_CHECK_THROW(this->init_accesser()); // build the accessor used by operator()
+}
+
+template <typename T> 
+hoMatrix<T>::~hoMatrix()
+{
+    // Intentionally empty: hoMatrix itself releases nothing here.
+}
+
+template <typename T> 
+hoMatrix<T>::hoMatrix(const hoMatrix<T>& a) : BaseClass(a) // copying is delegated entirely to the base class
+{
+}
+
+template <typename T> 
+hoMatrix<T>& hoMatrix<T>::operator=(const hoMatrix& rhs)
+{
+    if ( this == &rhs ) return *this; // self-assignment is a no-op
+    BaseClass::operator=(rhs); // the base class performs the actual assignment
+    return *this;
+}
+
+template <typename T> 
+bool hoMatrix<T>::createMatrix(size_t rows, size_t cols)
+{
+    return this->createArray(rows, cols); // thin wrapper: returns createArray's status unchanged
+}
+
+template <typename T> 
+bool hoMatrix<T>::createMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct)
+{
+    return this->createArray(rows, cols, data, delete_data_on_destruct); // thin wrapper: all arguments and the status pass straight through
+}
+
+template <typename T> 
+inline T& hoMatrix<T>::operator()(size_t r, size_t c)
+{
+    GADGET_DEBUG_CHECK_THROW(c>=0 && r>=0 && r<(*dimensions_)[0] && c<(*dimensions_)[1]); // r is checked against dimension 0, c against dimension 1; the >=0 tests are always true for size_t
+    return accesser_[c][r]; // the accessor is indexed column first, then row
+}
+
+template <typename T> 
+inline const T& hoMatrix<T>::operator()(size_t r, size_t c) const
+{
+    GADGET_DEBUG_CHECK_THROW(c>=0 && r>=0 && r<(*dimensions_)[0] && c<(*dimensions_)[1]); // r is bounded by dimension 0 (rows) and c by dimension 1 (cols), matching the non-const overload; the two bounds used to be swapped
+    return accesser_[c][r]; // the accessor is indexed column first, then row
+}
+
+template <typename T> 
+inline size_t hoMatrix<T>::rows() const
+{
+    // The row count is dimension 0; a matrix without dimensions reports 0.
+    return dimensions_->empty() ? 0 : (*dimensions_)[0];
+}
+
+template <typename T> 
+inline size_t hoMatrix<T>::cols() const
+{
+    // The column count is dimension 1; a matrix without dimensions reports 0.
+    return dimensions_->empty() ? 0 : (*dimensions_)[1];
+}
+
+template <typename T> 
+bool hoMatrix<T>::upperTri(const T& v)
+{
+    try
+    {
+        // Visit the matrix column by column and overwrite entries above the diagonal with v.
+        for (size_t col=0; col<(*dimensions_)[1]; ++col)
+        {
+            for (size_t row=0; row<(*dimensions_)[0]; ++row)
+            {
+                if ( row < col )
+                {
+                    (*this)(row, col) = v;
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::upperTri(const T& v) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::lowerTri(const T& v)
+{
+    try
+    {
+        // Visit the matrix column by column and overwrite entries below the diagonal with v.
+        for (size_t col=0; col<(*dimensions_)[1]; ++col)
+        {
+            for (size_t row=0; row<(*dimensions_)[0]; ++row)
+            {
+                if ( col < row )
+                {
+                    (*this)(row, col) = v;
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::lowerTri(const T& v) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::sumOverRow(hoNDArray<T>& res) const
+{
+    try
+    {
+        size_t nRows = rows();
+        size_t nCols = cols();
+
+        if ( res.get_number_of_elements() != nRows )
+        {
+            res.create(nRows);
+        }
+
+        T* sums = res.begin();
+
+        // sums[r] receives the total of row r taken over all columns.
+        for ( size_t r=0; r<nRows; ++r )
+        {
+            sums[r] = 0;
+        }
+
+        for ( size_t c=0; c<nCols; ++c )
+        {
+            // element (r, c) sits at data_[r + c*nRows], so a column is one contiguous run
+            const T* column = this->data_ + c*nRows;
+            for ( size_t r=0; r<nRows; ++r )
+            {
+                sums[r] += column[r];
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::sumOverRow(hoNDArray<T>& r) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::sumOverCol(hoNDArray<T>& res) const
+{
+    try
+    {
+        size_t ROW = rows();
+        size_t COL = cols();
+        // res is re-created only when its element count differs from the number of columns
+        if ( res.get_number_of_elements() != COL )
+        {
+            res.create(COL);
+        }
+
+        T* pRes = res.begin();
+
+        size_t r;
+        long long c; // signed loop index, shared with the OpenMP loop below
+        // zero the output first; every entry is overwritten again by the summation loop
+        for ( c=0; c<(long long)COL; c++ )
+        {
+            pRes[c] = 0;
+        }
+        // each column is summed independently; the if-clause enables the parallel loop only when COL > 16
+        #pragma omp parallel for default(none) private(c, r) shared(COL, ROW, pRes) if ( COL > 16 )
+        for ( c=0; c<(long long)COL; c++ )
+        {
+            T v(0); // per-column accumulator
+            for ( r=0; r<ROW; r++ )
+            {
+                v += this->data_[r+c*ROW]; // element (r, c) in the column-contiguous buffer
+            }
+            pRes[c] = v;
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::sumOverCol(hoNDArray<T>& r) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::subMatrix(Self& res, size_t startR, size_t endR, size_t startC, size_t endC) const
+{
+    try
+    {
+        size_t ROW = rows();
+        size_t COL = cols();
+        // both corners are inclusive and must lie inside the matrix, with end >= start (the >=0 tests are always true for size_t)
+        GADGET_CHECK_RETURN_FALSE(startR>=0&&startR<ROW);
+        GADGET_CHECK_RETURN_FALSE(startC>=0&&startC<COL);
+        GADGET_CHECK_RETURN_FALSE(endR>=0&&endR<ROW);
+        GADGET_CHECK_RETURN_FALSE(endC>=0&&endC<COL);
+        GADGET_CHECK_RETURN_FALSE(endR>=startR);
+        GADGET_CHECK_RETURN_FALSE(endC>=startC);
+        // res is sized to the inclusive range before the copy
+        GADGET_CHECK_RETURN_FALSE(res.createMatrix(endR-startR+1, endC-startC+1));
+
+        size_t r, c;
+        for ( r=startR; r<=endR; r++ )
+        {
+            for ( c=startC; c<=endC; c++ )
+            {
+                res(r-startR, c-startC) = (*this)(r, c); // shift so that (startR, startC) lands on (0, 0)
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::subMatrix(Self& res, size_t startR, size_t endR, size_t startC, size_t endC) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::operator == (const Self& m) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->dimensions_equal(&m));
+    // Element-wise comparison: any absolute difference above DBL_EPSILON means "not equal".
+    const T* lhs = data_;
+    const T* rhs = m.data_;
+    for ( size_t n=0; n<elements_; ++n )
+    {
+        if (std::abs(lhs[n]-rhs[n])>DBL_EPSILON) return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool hoMatrix<T>::operator != (const Self& m) const
+{
+    return !(*this==m); // exact negation of operator==, including its DBL_EPSILON tolerance
+}
+
+template <typename T> 
+void hoMatrix<T>::print(std::ostream& os) const
+{
+    using namespace std;
+    os.unsetf(std::ios::scientific); // clears the scientific-notation flag on the caller's stream
+
+    os << "hoMatrix (row X col): " << this->rows() << " X " << this->cols() << " : " << std::string(typeid(T).name()) << endl; // header: shape and typeid name of T
+    size_t r, c;
+    for (r=0; r<(*dimensions_)[0]; r++) 
+    {
+        os << "r " << r << ":\t"; // one output line per matrix row
+        for (c=0; c<(*dimensions_)[1]; c++)
+        {
+            os << setprecision(10) << (*this)(r,c) << "\t"; // precision 10 is set on os and is not restored afterwards
+        }
+        os << endl; 
+    }
+}
+
+template <typename T> 
+bool copyL2U(hoMatrix<T>& A)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t R = A.rows();
+        // Mirror the strictly lower triangle of the square matrix onto the upper one; the diagonal is untouched.
+
+        size_t row, col;
+        for(row=0; row<R; row++) 
+        {
+            for(col=0; col<row; col++ )
+            {
+                A(col, row) = A(row, col);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyL2U(hoMatrix<T>& A) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool copyL2U(hoMatrix<T>& A, bool conj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t order = A.rows();
+        // Mirror the strictly lower triangle onto the upper one, conjugating each value when conj is set.
+
+        if ( !conj )
+        {
+            for(size_t row=0; row<order; ++row)
+            {
+                for(size_t col=0; col<row; ++col)
+                {
+                    A(col, row) = A(row, col);
+                }
+            }
+        }
+        else
+        {
+            for(size_t row=0; row<order; ++row)
+            {
+                for(size_t col=0; col<row; ++col)
+                {
+                    A(col, row) = std::conj(A(row, col));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyL2U(hoMatrix<T>& A, bool conj) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool copyU2L(hoMatrix<T>& A)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t nRows = A.rows();
+        size_t nCols = A.cols();
+
+        // Mirror the strictly upper triangle of the square matrix onto the lower one.
+        for(size_t row=0; row<nRows; ++row)
+        {
+            for(size_t col=row+1; col<nCols; ++col)
+            {
+                A(col, row) = A(row, col);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyU2L(hoMatrix<T>& A) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool copyU2L(hoMatrix<T>& A, bool conj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        size_t nRows = A.rows();
+        size_t nCols = A.cols();
+
+        // Mirror the strictly upper triangle onto the lower one, conjugating each value when conj is set.
+
+        if ( !conj )
+        {
+            for(size_t row=0; row<nRows; ++row)
+            {
+                for(size_t col=row+1; col<nCols; ++col)
+                {
+                    A(col, row) = A(row, col);
+                }
+            }
+        }
+        else
+        {
+            for(size_t row=0; row<nRows; ++row)
+            {
+                for(size_t col=row+1; col<nCols; ++col)
+                {
+                    A(col, row) = std::conj(A(row, col));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyU2L(hoMatrix<T>& A, bool conj) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool trans(const hoMatrix<T>& A, hoMatrix<T>& AT)
+{
+    try
+    {
+        if ( A.get_number_of_elements() == 0 ) return true; // empty input: AT is left untouched and success is reported
+        // AT is re-created only when its shape is not already cols(A) x rows(A)
+        if ( AT.rows()!=A.cols() || AT.cols()!=A.rows() )
+        {
+            AT.createMatrix(A.cols(), A.rows()); // NOTE(review): the bool result of createMatrix is not checked
+        }
+        // the loop over columns of A carries the OpenMP pragma, so its indices are signed
+        long long r, c;
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(r, c)
+        #else
+            #pragma omp parallel for default(none) private(r, c) shared(A, AT)
+        #endif
+        for ( c=0; c<(long long)A.cols(); c++ )
+        {
+            for ( r=0; r<(long long)A.rows(); r++ )
+            {
+                AT(c,r) = A(r,c); // plain transpose, no conjugation
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in trans(const hoMatrix<T>& A, hoMatrix<T>& AT) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool conjugatetrans(const hoMatrix<T>& A, hoMatrix<T>& AH)
+{
+    try
+    {
+        if ( A.get_number_of_elements() == 0 ) return true; // empty input: AH is left untouched and success is reported
+        // AH is re-created only when its shape is not already cols(A) x rows(A)
+        if ( AH.rows()!=A.cols() || AH.cols()!=A.rows() )
+        {
+            AH.createMatrix(A.cols(), A.rows()); // NOTE(review): the bool result of createMatrix is not checked
+        }
+        // the loop over columns of A carries the OpenMP pragma, so its indices are signed
+        long long r, c;
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(r, c)
+        #else
+            #pragma omp parallel for default(none) private(r, c) shared(A, AH)
+        #endif
+        for ( c=0; c<(long long)A.cols(); c++ )
+        {
+            for ( r=0; r<(long long)A.rows(); r++ )
+            {
+                AH(c,r) = std::conj(A(r,c)); // transpose and conjugate in one step
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in conjugatetrans(const hoMatrix<T>& A, hoMatrix<T>& AH) ... ");
+        return false;
+    }
+    return true;
+}
+
+inline bool conjugatetrans(const hoMatrix<float>& A, hoMatrix<float>& AH)
+{
+    return trans(A, AH); // real-valued float matrix: forwards to the plain transpose
+}
+
+inline bool conjugatetrans(const hoMatrix<double>& A, hoMatrix<double>& AH)
+{
+    return trans(A, AH); // real-valued double matrix: forwards to the plain transpose
+}
+
+// C = op(A)*op(B) for column-major matrices; op(X) is X itself, or the
+// conjugate transpose of X when the matching trans flag is set. C is
+// re-created as M x N when its first two sizes differ. Returns false when
+// the inner dimensions disagree or an exception is caught.
+template<typename T> 
+bool GeneralMatrixProduct(hoNDArray<T>& C, const hoNDArray<T>& A, bool transA, const hoNDArray<T>& B, bool transB)
+{
+    try
+    {
+        // op(A) is M x K
+        size_t M = A.get_size(0);
+        size_t K = A.get_size(1);
+        if ( transA )
+        { 
+            M = A.get_size(1);
+            K = A.get_size(0);
+        }
+
+        // op(B) is K2 x N
+        size_t K2 = B.get_size(0);
+        size_t N = B.get_size(1);
+        if ( transB )
+        {
+            K2 = B.get_size(1);
+            N = B.get_size(0);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        const T* pA = A.begin();
+        const T* pB = B.begin();
+        T* pC = C.begin();
+
+        // Steps that address op(A)(m,k) and op(B)(k,n) inside the stored arrays:
+        // a transposed operand swaps its unit step and its leading-dimension step.
+        const size_t stepAm = transA ? K : 1;
+        const size_t stepAk = transA ? 1 : M;
+        const size_t stepBk = transB ? N : 1;
+        const size_t stepBn = transB ? 1 : K;
+
+        size_t m, n, k;
+        for ( m=0; m<M; m++ )
+        {
+            for ( n=0; n<N; n++ )
+            {
+                pC[m+n*M] = 0;
+                for ( k=0; k<K; k++ )
+                {
+                    const T& a = pA[m*stepAm + k*stepAk];
+                    const T& b = pB[k*stepBk + n*stepBn];
+                    // the flags are loop-invariant; all four combinations are now
+                    // computed (both transB cases used to be skipped silently)
+                    if ( transA && transB )  pC[m+n*M] += std::conj(a)*std::conj(b);
+                    else if ( transA )       pC[m+n*M] += std::conj(a)*b;
+                    else if ( transB )       pC[m+n*M] += a*std::conj(b);
+                    else                     pC[m+n*M] += a*b;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GeneralMatrixProduct(hoNDArray<T>& C, const hoNDArray<T>& A, bool transA, const hoNDArray<T>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+// following matrix computation calls MKL functions
+#ifdef USE_MKL
+
+template<typename T> 
+bool GeneralMatrixProduct_gemm(hoNDArray<T>& C, 
+                            const hoNDArray<T>& A, bool transA, 
+                            const hoNDArray<T>& B, bool transB)
+{
+    try
+    {
+        char TA, TB;
+
+        MKL_INT lda = A.get_size(0);
+        MKL_INT ldb = B.get_size(0);
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        MKL_INT M = A.get_size(0);
+        MKL_INT K = A.get_size(1);
+        if ( transA )
+        { 
+            M = A.get_size(1);
+            K = A.get_size(0);
+        }
+
+        MKL_INT K2 = B.get_size(0);
+        MKL_INT N = B.get_size(1);
+        if ( transB )
+        {
+            K2 = B.get_size(1);
+            N = B.get_size(0);
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.get_size(0)!=M) || (C.get_size(1)!=N) )
+        {
+            C.create(M, N);
+        }
+
+        T* pC = C.begin();
+        MKL_INT ldc = C.get_size(0);
+
+        if ( typeid(T)==typeid(float) )
+        {
+            float alpha(1), beta(0);
+
+            if ( transA )
+            {
+                TA = 'T';
+            }
+            else
+            {
+                TA = 'N';
+            }
+
+            if ( transB )
+            {
+                TB = 'T';
+            }
+            else
+            {
+                TB = 'N';
+            }
+
+            if ( &A != &C )
+            {
+                sgemm(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const float*>(pA), &lda, reinterpret_cast<const float*>(pB), &ldb, &beta, reinterpret_cast<float*>(pC), &ldc);
+            }
+            else
+            {
+                hoNDArray<T> aTmp(A);
+                T* pATmp = aTmp.begin();
+                sgemm(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const float*>(pATmp), &lda, reinterpret_cast<const float*>(pB), &ldb, &beta, reinterpret_cast<float*>(pC), &ldc);
+            }
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            double alpha(1), beta(0);
+
+            if ( transA )
+            {
+                TA = 'T';
+            }
+            else
+            {
+                TA = 'N';
+            }
+
+            if ( transB )
+            {
+                TB = 'T';
+            }
+            else
+            {
+                TB = 'N';
+            }
+
+            if ( &A != &C )
+            {
+                dgemm(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const double*>(pA), &lda, reinterpret_cast<const double*>(pB), &ldb, &beta, reinterpret_cast<double*>(pC), &ldc);
+            }
+            else
+            {
+                hoNDArray<T> aTmp(A);
+                T* pATmp = aTmp.begin();
+                dgemm(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const double*>(pATmp), &lda, reinterpret_cast<const double*>(pB), &ldb, &beta, reinterpret_cast<double*>(pC), &ldc);
+            }
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            GT_Complex8 alpha(1), beta(0);
+
+            if ( transA )
+            {
+                TA = 'C';
+            }
+            else
+            {
+                TA = 'N';
+            }
+
+            if ( transB )
+            {
+                TB = 'C';
+            }
+            else
+            {
+                TB = 'N';
+            }
+
+            if ( &A != &C )
+            {
+                cgemm(&TA, &TB, &M, &N, &K, reinterpret_cast<MKL_Complex8*>(&alpha), reinterpret_cast<const MKL_Complex8*>(pA), &lda, reinterpret_cast<const MKL_Complex8*>(pB), &ldb, reinterpret_cast<MKL_Complex8*>(&beta), reinterpret_cast<MKL_Complex8*>(pC), &ldc);
+            }
+            else
+            {
+                hoNDArray<T> aTmp(A);
+                T* pATmp = aTmp.begin();
+                cgemm(&TA, &TB, &M, &N, &K, reinterpret_cast<MKL_Complex8*>(&alpha), reinterpret_cast<MKL_Complex8*>(pATmp), &lda, reinterpret_cast<const MKL_Complex8*>(pB), &ldb, reinterpret_cast<MKL_Complex8*>(&beta), reinterpret_cast<MKL_Complex8*>(pC), &ldc);
+            }
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            GT_Complex16 alpha(1), beta(0);
+
+            if ( transA )
+            {
+                TA = 'C';
+            }
+            else
+            {
+                TA = 'N';
+            }
+
+            if ( transB )
+            {
+                TB = 'C';
+            }
+            else
+            {
+                TB = 'N';
+            }
+
+            if ( &A != &C )
+            {
+                zgemm(&TA, &TB, &M, &N, &K, reinterpret_cast<MKL_Complex16*>(&alpha), reinterpret_cast<const MKL_Complex16*>(pA), &lda, reinterpret_cast<const MKL_Complex16*>(pB), &ldb, reinterpret_cast<MKL_Complex16*>(&beta), reinterpret_cast<MKL_Complex16*>(pC), &ldc);
+            }
+            else
+            {
+                hoNDArray<T> aTmp(A);
+                T* pATmp = aTmp.begin();
+                zgemm(&TA, &TB, &M, &N, &K, reinterpret_cast<MKL_Complex16*>(&alpha), reinterpret_cast<MKL_Complex16*>(pATmp), &lda, reinterpret_cast<const MKL_Complex16*>(pB), &ldb, reinterpret_cast<MKL_Complex16*>(&beta), reinterpret_cast<MKL_Complex16*>(pC), &ldc);
+            }
+        }
+        else
+        {
+            GADGET_ERROR_MSG("GeneralMatrixProduct_gemm : unsupported type " << typeid(T).name() );
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GeneralMatrixProduct_gemm(hoNDArray<T>& C, const hoNDArray<T>& A, bool transA, const hoNDArray<T>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+// Compute C = op(A) * op(B) with the MKL ?gemm routine that matches T.
+//
+// op(X) is X when the corresponding trans flag is false. When the flag is true,
+// op(X) is the transpose ('T') for float/double and the conjugate transpose ('C')
+// for GT_Complex8/GT_Complex16.
+//
+// C      : result; re-created as M x N through createMatrix() when its size differs
+// A      : left operand, leading dimension taken as A.rows()
+// transA : apply op() to A
+// B      : right operand, leading dimension taken as B.rows()
+// transB : apply op() to B
+//
+// Returns true on success. Returns false when the inner dimensions disagree,
+// C cannot be created, T is not one of the four supported types, or an exception is caught.
+//
+// C may be the same object as A and/or B: the aliased operand is snapshotted
+// before C is resized or written.
+template<typename T> 
+bool GeneralMatrixProduct_gemm(hoMatrix<T>& C, 
+                            const hoMatrix<T>& A, bool transA, 
+                            const hoMatrix<T>& B, bool transB)
+{
+    try
+    {
+        // gemm must not read an operand it is overwriting, and createMatrix() below may
+        // re-create C. Take the snapshot first whenever C doubles as an input.
+        if ( (&A == &C) || (&B == &C) )
+        {
+            const hoMatrix<T> snapshot(C);
+            const hoMatrix<T>& srcA = (&A == &C) ? snapshot : A;
+            const hoMatrix<T>& srcB = (&B == &C) ? snapshot : B;
+            return GeneralMatrixProduct_gemm(C, srcA, transA, srcB, transB);
+        }
+
+        MKL_INT lda = A.rows();
+        MKL_INT ldb = B.rows();
+
+        // op(A) is M x K and op(B) is K2 x N
+        MKL_INT M = A.rows();
+        MKL_INT K = A.cols();
+        if ( transA )
+        {
+            M = A.cols();
+            K = A.rows();
+        }
+
+        MKL_INT K2 = B.rows();
+        MKL_INT N = B.cols();
+        if ( transB )
+        {
+            K2 = B.cols();
+            N = B.rows();
+        }
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( (C.rows()!=M) || (C.cols()!=N) )
+        {
+            GADGET_CHECK_RETURN_FALSE(C.createMatrix(M, N));
+        }
+
+        // data pointers are fetched only after C has its final storage
+        const T* pA = A.begin();
+        const T* pB = B.begin();
+        T* pC = C.begin();
+        MKL_INT ldc = C.rows();
+
+        // real types transpose with 'T', complex types with 'C' (conjugate transpose)
+        const bool isComplex = ( typeid(T)==typeid(GT_Complex8) ) || ( typeid(T)==typeid(GT_Complex16) );
+        const char transFlag = isComplex ? 'C' : 'T';
+        char TA = transA ? transFlag : 'N';
+        char TB = transB ? transFlag : 'N';
+
+        if ( typeid(T)==typeid(float) )
+        {
+            float alpha(1), beta(0);
+            sgemm(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const float*>(pA), &lda, reinterpret_cast<const float*>(pB), &ldb, &beta, reinterpret_cast<float*>(pC), &ldc);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            double alpha(1), beta(0);
+            dgemm(&TA, &TB, &M, &N, &K, &alpha, reinterpret_cast<const double*>(pA), &lda, reinterpret_cast<const double*>(pB), &ldb, &beta, reinterpret_cast<double*>(pC), &ldc);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            GT_Complex8 alpha(1), beta(0);
+            cgemm(&TA, &TB, &M, &N, &K, reinterpret_cast<MKL_Complex8*>(&alpha), reinterpret_cast<const MKL_Complex8*>(pA), &lda, reinterpret_cast<const MKL_Complex8*>(pB), &ldb, reinterpret_cast<MKL_Complex8*>(&beta), reinterpret_cast<MKL_Complex8*>(pC), &ldc);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            GT_Complex16 alpha(1), beta(0);
+            zgemm(&TA, &TB, &M, &N, &K, reinterpret_cast<MKL_Complex16*>(&alpha), reinterpret_cast<const MKL_Complex16*>(pA), &lda, reinterpret_cast<const MKL_Complex16*>(pB), &ldb, reinterpret_cast<MKL_Complex16*>(&beta), reinterpret_cast<MKL_Complex16*>(pC), &ldc);
+        }
+        else
+        {
+            GADGET_ERROR_MSG("GeneralMatrixProduct_gemm : unsupported type " << typeid(T).name() );
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GeneralMatrixProduct_gemm(hoMatrix<T>& C, const hoMatrix<T>& A, bool transA, const hoMatrix<T>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+// In-place Cholesky factorization of a symmetric/Hermitian positive definite matrix
+// using the MKL ?potrf routine that matches T.
+//
+// A    : square matrix; overwritten with the factor in the triangle selected by uplo,
+//        the opposite strict triangle is then set to 0
+// uplo : 'U'/'u' for the upper triangle, anything else is treated as lower when
+//        cleaning up (an invalid value is rejected by ?potrf through info)
+//
+// Returns true for an empty matrix or a successful factorization. Returns false when
+// A is not square, T is unsupported, ?potrf reports info != 0, or an exception is caught.
+template<typename T> 
+bool CholeskyHermitianPositiveDefinite_potrf(hoMatrix<T>& A, char uplo)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        MKL_INT info = 0;
+        lapack_int n = (lapack_int)(A.rows());
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)(A.rows());
+
+        if ( typeid(T)==typeid(float) )
+        {
+            spotrf(&uplo, &n, reinterpret_cast<float*>(pA), &lda, &info);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            dpotrf(&uplo, &n, reinterpret_cast<double*>(pA), &lda, &info);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            cpotrf(&uplo, &n, reinterpret_cast<MKL_Complex8*>(pA), &lda, &info);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            zpotrf(&uplo, &n, reinterpret_cast<MKL_Complex16*>(pA), &lda, &info);
+        }
+        else
+        {
+            GADGET_ERROR_MSG("CholeskyHermitianPositiveDefinite_potrf : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+
+        // LAPACK treats uplo case-insensitively, so 'u' also means the factor is in the
+        // upper triangle; clearing the upper triangle in that case would destroy the result.
+        const bool factorIsUpper = ( uplo == 'U' ) || ( uplo == 'u' );
+        if ( factorIsUpper )
+        {
+            GADGET_CHECK_RETURN_FALSE(A.lowerTri(0));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(A.upperTri(0));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in CholeskyHermitianPositiveDefinite_potrf(hoMatrix<T>& A, char uplo) ...");
+        return false;
+    }
+    return true;
+}
+
+// Eigen-decomposition of a symmetric (real T) or Hermitian (complex T) matrix through
+// LAPACKE_?syev / LAPACKE_?heev with jobz='V', uplo='L' and column-major layout.
+//
+// A          : square input; passed to LAPACKE as the in/out matrix argument
+// eigenValue : re-created as M x 1 when its shape differs; passed as the eigenvalue output
+//
+// Returns false when A is not square, eigenValue cannot be created, T is unsupported,
+// LAPACKE reports info != 0, or an exception is caught.
+template<typename T> 
+bool EigenAnalysis_syev_heev(hoMatrix<T>& A, hoMatrix<typename realType<T>::Type>& eigenValue)
+{
+    try
+    {
+        long long M = (long long)A.rows();
+        GADGET_CHECK_RETURN_FALSE(A.cols() == M);
+
+        const bool shapeMatches = (eigenValue.rows()==M) && (eigenValue.cols()==1);
+        if ( !shapeMatches )
+        {
+            GADGET_CHECK_RETURN_FALSE(eigenValue.createMatrix(M, 1));
+        }
+
+        const char jobz = 'V';
+        const char uplo = 'L';
+        T* pA = A.begin();
+        typename realType<T>::Type* pEV = eigenValue.begin();
+
+        // pick the LAPACKE driver from the element type
+        const std::type_info& elementType = typeid(T);
+        MKL_INT info;
+
+        if ( elementType==typeid(float) )
+        {
+            info = LAPACKE_ssyev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<float*>(pA), M, reinterpret_cast<float*>(pEV));
+        }
+        else if ( elementType==typeid(double) )
+        {
+            info = LAPACKE_dsyev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<double*>(pA), M, reinterpret_cast<double*>(pEV));
+        }
+        else if ( elementType==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_cheev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<MKL_Complex8*>(pA), M, reinterpret_cast<float*>(pEV));
+        }
+        else if ( elementType==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_zheev(LAPACK_COL_MAJOR, jobz, uplo, M, reinterpret_cast<MKL_Complex16*>(pA), M, reinterpret_cast<double*>(pEV));
+        }
+        else
+        {
+            GADGET_ERROR_MSG("EigenAnalysis_syev_heev : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in EigenAnalysis_syev_heev(hoMatrix<T>& A, hoMatrix<typename realType<T>::Type>& eigenValue) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Variant of EigenAnalysis_syev_heev whose eigenvalue output has element type T
+// instead of the real type of T: the eigenvalues are computed into a temporary
+// real-typed M x 1 matrix and then handed to eigenValue through copyFrom().
+//
+// Returns false when A is not square, eigenValue cannot be created, the underlying
+// eigen analysis fails, or an exception is caught.
+template<typename T> 
+bool EigenAnalysis_syev_heev2(hoMatrix<T>& A, hoMatrix<T>& eigenValue)
+{
+    try
+    {
+        long long M = (long long)A.rows();
+        GADGET_CHECK_RETURN_FALSE(A.cols() == M);
+
+        const bool shapeMatches = (eigenValue.rows()==M) && (eigenValue.cols()==1);
+        if ( !shapeMatches )
+        {
+            GADGET_CHECK_RETURN_FALSE(eigenValue.createMatrix(M, 1));
+        }
+
+        // real-typed scratch column that receives the eigenvalues
+        hoMatrix<typename realType<T>::Type> D(M, 1);
+        GADGET_CHECK_RETURN_FALSE(EigenAnalysis_syev_heev(A, D));
+
+        // the result of copyFrom() is not checked here
+        eigenValue.copyFrom(D);
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in EigenAnalysis_syev_heev2(hoMatrix<T>& A, hoMatrix<T>& eigenValue) ... ");
+        return false;
+    }
+    return true;
+}
+
+// In-place inverse of a symmetric/Hermitian positive definite matrix: LAPACKE_?potrf
+// followed by LAPACKE_?potri, both with uplo='L' and column-major layout.
+//
+// A : square matrix, overwritten by the two LAPACKE calls
+//
+// Returns true for an empty matrix or when both calls report info == 0. Returns false
+// when A is not square, T is unsupported, either call fails, or an exception is caught.
+template<typename T> 
+bool SymmetricHermitianPositiveDefiniteInverse_potri(hoMatrix<T>& A)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        const char uplo = 'L';
+        const lapack_int n = (lapack_int)A.rows();
+        const lapack_int lda = (lapack_int)A.rows();
+        T* pA = A.begin();
+
+        const std::type_info& elementType = typeid(T);
+        MKL_INT info;
+
+        // each branch: factorize first, then invert from the factor
+        if ( elementType==typeid(float) )
+        {
+            float* a = reinterpret_cast<float*>(pA);
+
+            info = LAPACKE_spotrf(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_spotri(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else if ( elementType==typeid(double) )
+        {
+            double* a = reinterpret_cast<double*>(pA);
+
+            info = LAPACKE_dpotrf(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_dpotri(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else if ( elementType==typeid(GT_Complex8) )
+        {
+            MKL_Complex8* a = reinterpret_cast<MKL_Complex8*>(pA);
+
+            info = LAPACKE_cpotrf(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_cpotri(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else if ( elementType==typeid(GT_Complex16) )
+        {
+            MKL_Complex16* a = reinterpret_cast<MKL_Complex16*>(pA);
+
+            info = LAPACKE_zpotrf(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_zpotri(LAPACK_COL_MAJOR, uplo, n, a, lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else
+        {
+            GADGET_ERROR_MSG("SymmetricHermitianPositiveDefiniteInverse_potri : unsupported type " << typeid(T).name());
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in SymmetricHermitianPositiveDefiniteInverse_potri(hoMatrix<T>& A) ...");
+        return false;
+    }
+    return true;
+}
+
+// In-place inverse of a triangular matrix through LAPACKE_?trtri with diag='N'
+// (non-unit diagonal) and column-major layout.
+//
+// A    : square matrix, overwritten by LAPACKE
+// uplo : forwarded unchanged to LAPACKE to select the triangle
+//
+// Returns true for an empty matrix or when LAPACKE reports info == 0. Returns false
+// when A is not square, T is unsupported, LAPACKE fails, or an exception is caught.
+template<typename T> 
+bool TriangularInverse_trtri(hoMatrix<T>& A, char uplo)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        MKL_INT info = 0;
+        char diag = 'N';
+        lapack_int n = (lapack_int)A.rows();
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.rows();
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_strtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<float*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dtrtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<double*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_ctrtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<MKL_Complex8*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_ztrtri(LAPACK_COL_MAJOR, uplo, diag, n, reinterpret_cast<MKL_Complex16*>(pA), lda);
+        }
+        else
+        {
+            GADGET_ERROR_MSG("TriangularInverse_trtri : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+    }
+    catch(...)
+    {
+        // the message names the template signature, not a float-only overload
+        GADGET_ERROR_MSG("Errors in TriangularInverse_trtri(hoMatrix<T>& A, char uplo) ...");
+        return false;
+    }
+    return true;
+}
+
+// Solve A * X = b for a symmetric/Hermitian positive definite A through LAPACKE_?posv
+// with uplo='L' and column-major layout.
+//
+// A : square coefficient matrix, passed to LAPACKE as the in/out matrix argument
+// b : right-hand sides (one per column), passed to LAPACKE as the in/out solution argument
+//
+// Returns true when A or b is empty or when LAPACKE reports info == 0. Returns false
+// when A is not square, the row counts of A and b differ, T is unsupported, LAPACKE
+// fails, or an exception is caught.
+template<typename T> 
+bool SymmetricHermitianPositiveDefiniteLinearSystem_posv(hoMatrix<T>& A, hoMatrix<T>& b)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        if( b.get_number_of_elements()==0 ) return true;
+
+        // n and lda are both taken from A.rows(), so LAPACKE reads an n x n block of A;
+        // a matrix with fewer columns than rows would be read past its end.
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+        GADGET_CHECK_RETURN_FALSE(A.rows()==b.rows());
+
+        MKL_INT info = 0;
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.rows();
+        lapack_int nrhs = (lapack_int)b.cols();
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.rows();
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.rows();
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_sposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<float*>(pA), lda, reinterpret_cast<float*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<double*>(pA), lda, reinterpret_cast<double*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_cposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<MKL_Complex8*>(pA), lda, reinterpret_cast<MKL_Complex8*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_zposv(LAPACK_COL_MAJOR, uplo, n, nrhs, reinterpret_cast<MKL_Complex16*>(pA), lda, reinterpret_cast<MKL_Complex16*>(pB), ldb);
+        }
+        else
+        {
+            GADGET_ERROR_MSG("SymmetricHermitianPositiveDefiniteLinearSystem_posv : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+    }
+    catch(...)
+    {
+        // the message names the template signature, not a float-only overload
+        GADGET_ERROR_MSG("Errors in SymmetricHermitianPositiveDefiniteLinearSystem_posv(hoMatrix<T>& A, hoMatrix<T>& b) ...");
+        return false;
+    }
+    return true;
+}
+
+// Solve the Tikhonov-regularized normal equations (A'A + value*I) x = A'b, where A' is
+// the (conjugate) transpose produced by GeneralMatrixProduct_gemm and
+// value = lamda * trace(A'A) / number of columns.
+//
+// A     : system matrix
+// b     : right-hand sides; must have as many rows as A
+// x     : output; created as A.cols() x b.cols()
+// lamda : regularization strength relative to the mean diagonal magnitude of A'A
+//
+// Returns false when the row counts differ or when any matrix product, allocation or
+// the final posv solve fails. Returns true without solving when A has no columns.
+template<typename T> 
+bool SolveLinearSystem_Tikhonov(hoMatrix<T>& A, hoMatrix<T>& b, hoMatrix<T>& x, double lamda)
+{
+    GADGET_CHECK_RETURN_FALSE(b.rows()==A.rows());
+
+    hoMatrix<T> AHA(A.cols(), A.cols());
+    GADGET_CHECK_RETURN_FALSE(GeneralMatrixProduct_gemm(AHA, A, true, A, false));
+
+    GADGET_CHECK_RETURN_FALSE(x.createMatrix(A.cols(), b.cols()));
+    GADGET_CHECK_RETURN_FALSE(GeneralMatrixProduct_gemm(x, A, true, b, false));
+
+    // Apply the Tikhonov regularization.
+    // Ideally the regularization term would be lamda*maxEigenValue.
+    // However, computing the maximal eigenvalue is computationally intensive.
+    // A natural alternative is the trace of the AHA matrix, which is the sum of all eigenvalues.
+    // Since all eigenvalues are positive, lamda*maxEigenValue is only ~10-20% different from lamda*sum(all eigenvalues).
+    // For more information, refer to:
+    // Tikhonov A.N., Goncharsky A.V., Stepanov V.V., Yagola A.G., 1995, 
+    // Numerical Methods for the Solution of Ill-Posed Problems, Kluwer Academic Publishers.
+
+    const size_t col = AHA.cols();
+
+    // no unknowns: nothing to regularize or solve, and the diagonal below does not exist
+    if ( col == 0 ) return true;
+
+    double trA = 0;
+    for ( size_t c=0; c<col; c++ )
+    {
+        trA += std::abs(AHA(c, c));
+    }
+
+    const double value = trA*lamda/col;
+    for ( size_t c=0; c<col; c++ )
+    {
+        AHA(c,c) = std::abs(AHA(c, c)) + value;
+    }
+
+    GADGET_CHECK_RETURN_FALSE(SymmetricHermitianPositiveDefiniteLinearSystem_posv(AHA, x));
+
+    return true;
+}
+
+#endif // USE_MKL
+
+}
diff --git a/toolboxes/core/cpu/hoMatrix.h b/toolboxes/core/cpu/hoMatrix.h
new file mode 100644
index 0000000..98ec73a
--- /dev/null
+++ b/toolboxes/core/cpu/hoMatrix.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "ho2DArray.h"
+#include "complext.h"
+
+#ifdef USE_MKL
+    #include "mkl.h"
+#endif // USE_MKL
+
+#ifdef GT_Complex8
+    #undef GT_Complex8
+#endif // GT_Complex8
+typedef std::complex<float> GT_Complex8; // single-precision complex; any earlier GT_Complex8 macro is removed above
+
+#ifdef GT_Complex16
+    #undef GT_Complex16
+#endif // GT_Complex16
+typedef std::complex<double> GT_Complex16; // double-precision complex; any earlier GT_Complex16 macro is removed above
+
+namespace Gadgetron{
+
+// hoMatrix is a 2D matrix layered on ho2DArray<T>.
+// Each column is stored along the first array dimension, i.e. the storage is column-wise.
+template <class T> class  hoMatrix : public ho2DArray<T>
+{
+public:
+
+    typedef hoMatrix<T> Self;
+    typedef ho2DArray<T> BaseClass;
+
+    hoMatrix();
+    hoMatrix(size_t rows, size_t cols);
+    hoMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct = false); // builds on caller-supplied data; NOTE(review): flag presumably decides whether data is freed on destruction - confirm in the base class
+
+    virtual ~hoMatrix();
+
+    hoMatrix(const hoMatrix<T>& a);
+    hoMatrix<T>& operator=(const hoMatrix& rhs);
+
+    // (re)size the matrix to rows x cols; the bool result is checked by callers as success/failure
+    virtual bool createMatrix(size_t rows, size_t cols);
+    virtual bool createMatrix(size_t rows, size_t cols, T* data, bool delete_data_on_destruct = false);
+
+    // element access by (row, column)
+    T& operator()(size_t r , size_t c);
+    const T& operator()(size_t r , size_t c) const;
+
+    size_t rows() const;
+    size_t cols() const;
+
+    // assign the upper/lower triangle matrix as a fixed value
+    bool upperTri(const T& v);
+    bool lowerTri(const T& v);
+
+    // sum along row or col
+    bool sumOverRow(hoNDArray<T>& res) const;
+    bool sumOverCol(hoNDArray<T>& res) const;
+
+    // get the sub matrix
+    bool subMatrix(Self& res, size_t startR, size_t endR, size_t startC, size_t endC) const; // NOTE(review): whether endR/endC are inclusive is not visible here - confirm in the implementation
+
+    bool operator == (const Self& m) const;
+    bool operator != (const Self& m) const;
+
+    virtual void print(std::ostream& os) const;
+
+protected:
+    // base-class members re-exported so they can be named unqualified inside this template
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+    using BaseClass::accesser_;
+};
+
+}
+
+#include <hoMatrix.cpp>
diff --git a/toolboxes/core/cpu/hoMatrix.hxx b/toolboxes/core/cpu/hoMatrix.hxx
new file mode 100644
index 0000000..ddd0c85
--- /dev/null
+++ b/toolboxes/core/cpu/hoMatrix.hxx
@@ -0,0 +1,791 @@
+namespace Gadgetron
+{
+
+// Default constructor: only runs the default BaseClass constructor.
+template <typename T> 
+hoMatrix<T>::hoMatrix()
+    : BaseClass()
+{
+}
+
+// Construct a rows x cols matrix: BaseClass is built with (cols, rows) and every
+// element is then set to T(0).
+// NOTE(review): parameters are unsigned int here while hoMatrix.h declares size_t - confirm which is authoritative.
+template <typename T> 
+hoMatrix<T>::hoMatrix(unsigned int rows, unsigned int cols)
+    : BaseClass(cols, rows)
+{
+    const T zero(0);
+    this->fill(zero);
+}
+
+// Construct a rows x cols matrix on top of caller-supplied data.
+//
+// rows, cols              : matrix shape; stored as dimension 0 = cols, dimension 1 = rows,
+//                           matching the (cols, rows) order the other constructor passes to BaseClass
+// data                    : element buffer handed to create()
+// delete_data_on_destruct : forwarded unchanged to create()
+//
+// Throws through GADGET_CHECK_THROW when init_accesser() fails.
+template <typename T> 
+hoMatrix<T>::hoMatrix(unsigned int rows, unsigned int cols, T* data, bool delete_data_on_destruct)
+{
+    // the previous body referred to undeclared names (sx, sy, dimensions) and never used dim
+    std::vector<unsigned int> dim(2);
+    dim[0] = cols;
+    dim[1] = rows;
+
+    // NOTE(review): create() is declared outside this file - confirm whether it expects the
+    // vector itself or its address, and its element type.
+    this->create(dim,data,delete_data_on_destruct);
+
+    // init_accesser() lives in the dependent base class, so it must be reached through this->
+    GADGET_CHECK_THROW(this->init_accesser());
+}
+
+// Destructor: performs no work of its own.
+template <typename T> 
+hoMatrix<T>::~hoMatrix()
+{
+}
+
+// Create the matrix as rows x cols by calling createArray(cols, rows) and
+// returning its result.
+template <typename T> 
+bool hoMatrix<T>::createMatrix(unsigned int rows, unsigned int cols)
+{
+    const bool created = this->createArray(cols, rows);
+    return created;
+}
+
+// Mutable access to element (r, c). The index check against the stored dimensions is
+// performed through GADGET_DEBUG_CHECK_THROW.
+template <typename T> 
+inline T& hoMatrix<T>::operator()(size_t r, size_t c)
+{
+    GADGET_DEBUG_CHECK_THROW(c<(*dimensions_)[0] && r<(*dimensions_)[1]);
+    T& element = accesser_[r][c];
+    return element;
+}
+
+// Read-only access to element (r, c). The index check against the stored dimensions is
+// performed through GADGET_DEBUG_CHECK_THROW.
+template <typename T> 
+inline const T& hoMatrix<T>::operator()(size_t r, size_t c) const
+{
+    GADGET_DEBUG_CHECK_THROW(c<(*dimensions_)[0] && r<(*dimensions_)[1]);
+    const T& element = accesser_[r][c];
+    return element;
+}
+
+// Number of rows, read from dimension index 1.
+template <typename T> 
+inline unsigned int hoMatrix<T>::rows() const
+{
+    return (*(this->dimensions_))[1];
+}
+
+// Number of columns, read from dimension index 0.
+template <typename T> 
+inline unsigned int hoMatrix<T>::cols() const
+{
+    return (*(this->dimensions_))[0];
+}
+
+// Assign v to every element strictly above the diagonal (column index > row index).
+// Returns false when an exception is caught, true otherwise.
+template <typename T> 
+bool hoMatrix<T>::upperTri(const T& v)
+{
+    try
+    {
+        for (unsigned int r=0; r<(*dimensions_)[1]; r++)
+        {
+            // columns left of or on the diagonal are untouched, so start right after it
+            unsigned int c = r+1;
+            while ( c<(*dimensions_)[0] )
+            {
+                (*this)(r, c) = v;
+                c++;
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::upperTri(const T& v) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Assign v to every element strictly below the diagonal (row index > column index).
+// Returns false when an exception is caught, true otherwise.
+template <typename T> 
+bool hoMatrix<T>::lowerTri(const T& v)
+{
+    try
+    {
+        for (unsigned int r=0; r<(*dimensions_)[1]; r++)
+        {
+            // only columns left of the diagonal qualify; stop at the diagonal or the last column
+            unsigned int c = 0;
+            while ( c<(*dimensions_)[0] && c<r )
+            {
+                (*this)(r, c) = v;
+                c++;
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in hoMatrix<T>::lowerTri(const T& v) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Element-wise comparison with tolerance: the matrices are equal when their dimensions
+// match and no pair of elements differs by more than DBL_EPSILON in magnitude.
+template <typename T> 
+bool hoMatrix<T>::operator == (const Self& m) const
+{
+    GADGET_CHECK_RETURN_FALSE(this->dimensions_equal(&m));
+
+    size_t i = 0;
+    while ( i<elements_ )
+    {
+        const bool differs = std::abs(data_[i]-m.data_[i])>DBL_EPSILON;
+        if ( differs )
+        {
+            return false;
+        }
+        i++;
+    }
+    return true;
+}
+
+// Negation of operator==.
+template <typename T> 
+bool hoMatrix<T>::operator != (const Self& m) const
+{
+    const bool same = (*this==m);
+    return !same;
+}
+
+// Write the matrix to os: a header line with rows, columns and the element type name,
+// then one line per row with tab-separated elements at precision 16.
+// The scientific flag of os is cleared first.
+template <typename T> 
+void hoMatrix<T>::print(std::ostream& os) const
+{
+    using namespace std;
+    os.unsetf(std::ios::scientific);
+
+    os << "hoMatrix : " << (*dimensions_)[1] << " " << (*dimensions_)[0] << " : " << std::string(typeid(T).name()) << endl;
+
+    for (unsigned int r=0; r<(*dimensions_)[1]; r++)
+    {
+        os << "r " << r << ":\t";
+
+        unsigned int c = 0;
+        while ( c<(*dimensions_)[0] )
+        {
+            os << setprecision(16) << (*this)(r,c) << "\t";
+            c++;
+        }
+
+        os << endl;
+    }
+}
+
+// Mirror the strictly lower triangle of a square matrix into its upper triangle:
+// A(col, row) = A(row, col) for every col < row.
+// Returns false when A is not square or an exception is caught.
+template <typename T> 
+bool copyL2U(hoMatrix<T>& A)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        // only the row count is needed; the matrix is square at this point
+        const unsigned int R = A.rows();
+
+        for ( unsigned int row=0; row<R; row++ )
+        {
+            for ( unsigned int col=0; col<row; col++ )
+            {
+                A(col, row) = A(row, col);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyL2U(hoMatrix<T>& A) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Mirror the strictly lower triangle of a square matrix into its upper triangle.
+// With conj == true the copied value is std::conj of the source element, otherwise
+// the element is copied unchanged.
+// Returns false when A is not square or an exception is caught.
+template <typename T> 
+bool copyL2U(hoMatrix<T>& A, bool conj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        const unsigned int R = A.rows();
+
+        for ( unsigned int row=0; row<R; row++ )
+        {
+            unsigned int col = 0;
+            while ( col<row )
+            {
+                if ( conj )
+                {
+                    A(col, row) = std::conj(A(row, col));
+                }
+                else
+                {
+                    A(col, row) = A(row, col);
+                }
+                col++;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyL2U(hoMatrix<T>& A, bool conj) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Mirror the strictly upper triangle of a square matrix into its lower triangle:
+// A(col, row) = A(row, col) for every col > row.
+// Returns false when A is not square or an exception is caught.
+template <typename T> 
+bool copyU2L(hoMatrix<T>& A)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        const unsigned int R = A.rows();
+        const unsigned int C = A.cols();
+
+        for ( unsigned int row=0; row<R; row++ )
+        {
+            unsigned int col = row+1;
+            while ( col<C )
+            {
+                A(col, row) = A(row, col);
+                col++;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyU2L(hoMatrix<T>& A) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Mirror the strictly upper triangle of a square matrix into its lower triangle.
+// With conj == true the copied value is std::conj of the source element, otherwise
+// the element is copied unchanged.
+// Returns false when A is not square or an exception is caught.
+template <typename T> 
+bool copyU2L(hoMatrix<T>& A, bool conj)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        const unsigned int R = A.rows();
+        const unsigned int C = A.cols();
+
+        for ( unsigned int row=0; row<R; row++ )
+        {
+            unsigned int col = row+1;
+            while ( col<C )
+            {
+                if ( conj )
+                {
+                    A(col, row) = std::conj(A(row, col));
+                }
+                else
+                {
+                    A(col, row) = A(row, col);
+                }
+                col++;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in copyU2L(hoMatrix<T>& A, bool conj) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Transpose: AT(c, r) = A(r, c).
+//
+// A  : input matrix; an empty input returns true without touching AT
+// AT : output; re-created as A.cols() x A.rows() when its shape differs
+//
+// Returns false when AT cannot be created or an exception is caught.
+// AT may be the same object as A; the input is then copied first.
+template <typename T> 
+bool trans(const hoMatrix<T>& A, hoMatrix<T>& AT)
+{
+    try
+    {
+        if ( A.get_number_of_elements() == 0 ) return true;
+
+        // writing into the matrix that is still being read would corrupt the result
+        if ( &A == &AT )
+        {
+            const hoMatrix<T> source(A);
+            return trans(source, AT);
+        }
+
+        // the output has the transposed shape, not the shape of A
+        if ( (AT.rows()!=A.cols()) || (AT.cols()!=A.rows()) )
+        {
+            GADGET_CHECK_RETURN_FALSE(AT.createMatrix(A.cols(), A.rows()));
+        }
+
+        int r, c;
+        #pragma omp parallel for default(none) private(r, c) shared(A, AT)
+        for ( c=0; c<(int)A.cols(); c++ )
+        {
+            for ( r=0; r<(int)A.rows(); r++ )
+            {
+                AT(c,r) = A(r,c);
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in trans(const hoMatrix<T>& A, hoMatrix<T>& AT) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Conjugate transpose: AH(c, r) = std::conj(A(r, c)).
+//
+// A  : input matrix; an empty input returns true without touching AH
+// AH : output; re-created as A.cols() x A.rows() when its shape differs
+//
+// Returns false when AH cannot be created or an exception is caught.
+// AH may be the same object as A; the input is then copied first.
+template <typename T> 
+bool conjugatetrans(const hoMatrix<T>& A, hoMatrix<T>& AH)
+{
+    try
+    {
+        if ( A.get_number_of_elements() == 0 ) return true;
+
+        // writing into the matrix that is still being read would corrupt the result
+        if ( &A == &AH )
+        {
+            const hoMatrix<T> source(A);
+            return conjugatetrans(source, AH);
+        }
+
+        // the output has the transposed shape, not the shape of A
+        if ( (AH.rows()!=A.cols()) || (AH.cols()!=A.rows()) )
+        {
+            GADGET_CHECK_RETURN_FALSE(AH.createMatrix(A.cols(), A.rows()));
+        }
+
+        int r, c;
+        #pragma omp parallel for default(none) private(r, c) shared(A, AH)
+        for ( c=0; c<(int)A.cols(); c++ )
+        {
+            for ( r=0; r<(int)A.rows(); r++ )
+            {
+                AH(c,r) = std::conj(A(r,c));
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in conjugatetrans(const hoMatrix<T>& A, hoMatrix<T>& AH) ... ");
+        return false;
+    }
+    return true;
+}
+
+// following matrix computation calls MKL functions
+#ifdef USE_MKL
+
+// Computes the matrix product C = op(A) * op(B) through the CBLAS ?gemm routines.
+//
+// op(X) is X when the corresponding flag is false. When the flag is true, op(X)
+// is the transpose for float/double and the conjugate transpose for
+// GT_Complex8/GT_Complex16. The matrices are handed to CBLAS as row-major
+// storage with their column count as leading dimension.
+//
+// C is (re)created as M x N when its shape does not match. C may be the same
+// object as A and/or B: an aliased input is copied before C is touched.
+//
+// Returns false if the inner dimensions disagree, C cannot be created, T is not
+// one of the four supported types, or an exception is caught.
+template<typename T> 
+bool GeneralMatrixProduct_gemm(hoMatrix<T>& C, 
+                            const hoMatrix<T>& A, bool transA, 
+                            const hoMatrix<T>& B, bool transB)
+{
+    try
+    {
+        // gemm must not read from the buffer it writes to, and re-creating C below
+        // would drop the contents of an aliased input. Snapshot the aliased input
+        // first and run the product on the copy.
+        if ( &A == &C )
+        {
+            hoMatrix<T> aTmp(A);
+            if ( &B == &C )
+            {
+                return GeneralMatrixProduct_gemm(C, aTmp, transA, aTmp, transB);
+            }
+            return GeneralMatrixProduct_gemm(C, aTmp, transA, B, transB);
+        }
+
+        if ( &B == &C )
+        {
+            hoMatrix<T> bTmp(B);
+            return GeneralMatrixProduct_gemm(C, A, transA, bTmp, transB);
+        }
+
+        // Row-major leading dimensions are the stored column counts, whether or
+        // not the operand is transposed.
+        MKL_INT lda = (MKL_INT)A.cols();
+        MKL_INT ldb = (MKL_INT)B.cols();
+        const T* pA = A.begin(); 
+        const T* pB = B.begin(); 
+
+        // op(A) is M x K, op(B) is K2 x N
+        MKL_INT M  = (MKL_INT)( transA ? A.cols() : A.rows() );
+        MKL_INT K  = (MKL_INT)( transA ? A.rows() : A.cols() );
+        MKL_INT N  = (MKL_INT)( transB ? B.rows() : B.cols() );
+        MKL_INT K2 = (MKL_INT)( transB ? B.cols() : B.rows() );
+
+        GADGET_CHECK_RETURN_FALSE(K==K2);
+        if ( ((MKL_INT)C.rows()!=M) || ((MKL_INT)C.cols()!=N) )
+        {
+            GADGET_CHECK_RETURN_FALSE(C.createMatrix(M, N));
+        }
+
+        T* pC = C.begin();
+        MKL_INT ldc = (MKL_INT)C.cols();
+
+        // Scaling factors for the complex variants: C = 1*op(A)*op(B) + 0*C
+        T alpha(1), beta(0);
+
+        // "Transposed" means plain transpose for real types and conjugate
+        // transpose for complex types.
+        const bool isReal = (typeid(T)==typeid(float)) || (typeid(T)==typeid(double));
+        const CBLAS_TRANSPOSE opTrans = isReal ? CblasTrans : CblasConjTrans;
+        const CBLAS_TRANSPOSE TA = transA ? opTrans : CblasNoTrans;
+        const CBLAS_TRANSPOSE TB = transB ? opTrans : CblasNoTrans;
+
+        if ( typeid(T)==typeid(float) )
+        {
+            cblas_sgemm(CblasRowMajor, TA, TB, M, N, K, 1, reinterpret_cast<const float*>(pA), lda, reinterpret_cast<const float*>(pB), ldb, 0, reinterpret_cast<float*>(pC), ldc);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            cblas_dgemm(CblasRowMajor, TA, TB, M, N, K, 1, reinterpret_cast<const double*>(pA), lda, reinterpret_cast<const double*>(pB), ldb, 0, reinterpret_cast<double*>(pC), ldc);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            cblas_cgemm(CblasRowMajor, TA, TB, M, N, K, &alpha, pA, lda, pB, ldb, &beta, pC, ldc);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            cblas_zgemm(CblasRowMajor, TA, TB, M, N, K, &alpha, pA, lda, pB, ldb, &beta, pC, ldc);
+        }
+        else
+        {
+            // std::type_info is not streamable; log its name instead
+            GADGET_ERROR_MSG("GeneralMatrixProduct_gemm : unsupported type " << typeid(T).name());
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in GeneralMatrixProduct_gemm(hoMatrix<T>& C, const hoMatrix<T>& A, bool transA, const hoMatrix<T>& B, bool transB) ...");
+        return false;
+    }
+    return true;
+}
+
+// Cholesky-factorizes the square matrix A in place with LAPACKE ?potrf
+// (row-major layout, leading dimension A.cols()).
+//
+// uplo is forwarded to LAPACK and selects the triangle that is used and
+// overwritten ('U' or 'L'). After a successful factorization the opposite
+// triangle is passed through lowerTri(0) / upperTri(0).
+// NOTE(review): presumably this zeroes the triangle potrf does not write -
+// confirm against hoMatrix.
+//
+// An empty A is a no-op. Returns false if A is not square, T is unsupported,
+// LAPACK reports info != 0, or an exception is caught.
+template<typename T> 
+bool CholeskyHermitianPositiveDefinite_potrf(hoMatrix<T>& A, char uplo)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        lapack_int info;
+        lapack_int n = (lapack_int)(A.rows());
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)(A.cols());
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_spotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<float*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<double*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_cpotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<MKL_Complex8*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_zpotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<MKL_Complex16*>(pA), lda);
+        }
+        else
+        {
+            // std::type_info is not streamable; log its name instead
+            GADGET_ERROR_MSG("CholeskyHermitianPositiveDefinite_potrf : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+
+        // Any uplo other than 'U' is treated as the lower-triangular case here.
+        if ( uplo == 'U' )
+        {
+            GADGET_CHECK_RETURN_FALSE(A.lowerTri(0));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(A.upperTri(0));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in CholeskyHermitianPositiveDefinite_potrf(hoMatrix<T>& A, char uplo) ...");
+        return false;
+    }
+    return true;
+}
+
+// Runs LAPACKE ?syev (float/double) or ?heev (GT_Complex8/GT_Complex16) on the
+// square matrix A, using row-major layout with leading dimension M.
+//
+// The call is made with jobz = 'V' and uplo = 'L', so LAPACK works on A in
+// place and writes the eigenvalues into eigenValue, which is (re)created as an
+// M x 1 matrix when its shape does not match.
+//
+// Returns false if A is not square, eigenValue cannot be created, T is
+// unsupported, LAPACK reports info != 0, or an exception is caught.
+template<typename T> 
+bool EigenAnalysis_syev_heev(hoMatrix<T>& A, hoMatrix<typename realType<T>::Type>& eigenValue)
+{
+    try
+    {
+        const int M = (int)A.rows();
+        GADGET_CHECK_RETURN_FALSE((int)A.cols() == M);
+
+        if ( ((int)eigenValue.rows()!=M) || ((int)eigenValue.cols()!=1) )
+        {
+            GADGET_CHECK_RETURN_FALSE(eigenValue.createMatrix(M, 1));
+        }
+
+        lapack_int info;
+        char jobz = 'V';
+        char uplo = 'L';
+        T* pA = A.begin();
+        typename realType<T>::Type* pEV = eigenValue.begin();
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_ssyev(LAPACK_ROW_MAJOR, jobz, uplo, M, reinterpret_cast<float*>(pA), M, reinterpret_cast<float*>(pEV));
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dsyev(LAPACK_ROW_MAJOR, jobz, uplo, M, reinterpret_cast<double*>(pA), M, reinterpret_cast<double*>(pEV));
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_cheev(LAPACK_ROW_MAJOR, jobz, uplo, M, reinterpret_cast<MKL_Complex8*>(pA), M, reinterpret_cast<float*>(pEV));
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_zheev(LAPACK_ROW_MAJOR, jobz, uplo, M, reinterpret_cast<MKL_Complex16*>(pA), M, reinterpret_cast<double*>(pEV));
+        }
+        else
+        {
+            // std::type_info is not streamable; log its name instead
+            GADGET_ERROR_MSG("EigenAnalysis_syev_heev : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in EigenAnalysis_syev_heev(hoMatrix<T>& A, hoMatrix<typename realType<T>::Type>& eigenValue) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Inverts the square matrix A in place in two LAPACKE steps: ?potrf (Cholesky
+// factorization) followed by ?potri (inverse from the factor). Both calls use
+// row-major layout, leading dimension A.cols() and uplo = 'L'.
+//
+// NOTE(review): per the LAPACK ?potri contract only the triangle selected by
+// uplo holds the inverse afterwards - confirm that callers mirror it if they
+// need the full matrix.
+//
+// An empty A is a no-op. Returns false if A is not square, T is unsupported,
+// either LAPACK step reports info != 0, or an exception is caught.
+template<typename T> 
+bool SymmetricHermitianPositiveDefiniteInverse_potri(hoMatrix<T>& A)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        lapack_int info;
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.rows();
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.cols();
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_spotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<float*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_spotri(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<float*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<double*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_dpotri(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<double*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_cpotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<MKL_Complex8*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_cpotri(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<MKL_Complex8*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_zpotrf(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<MKL_Complex16*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+
+            info = LAPACKE_zpotri(LAPACK_ROW_MAJOR, uplo, n, reinterpret_cast<MKL_Complex16*>(pA), lda);
+            GADGET_CHECK_RETURN_FALSE(info==0);
+        }
+        else
+        {
+            // std::type_info is not streamable; log its name instead
+            GADGET_ERROR_MSG("SymmetricHermitianPositiveDefiniteInverse_potri : unsupported type " << typeid(T).name());
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in SymmetricHermitianPositiveDefiniteInverse_potri(hoMatrix<T>& A) ...");
+        return false;
+    }
+    return true;
+}
+
+// Inverts the triangular square matrix A in place with LAPACKE ?trtri
+// (row-major layout, leading dimension A.cols()).
+//
+// uplo is forwarded to LAPACK and selects the triangle that holds the matrix.
+// diag is fixed to 'N', i.e. the diagonal is read from A rather than assumed
+// to be all ones.
+//
+// An empty A is a no-op. Returns false if A is not square, T is unsupported,
+// LAPACK reports info != 0, or an exception is caught.
+template<typename T> 
+bool TriangularInverse_trtri(hoMatrix<T>& A, char uplo)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+
+        lapack_int info;
+        char diag = 'N';
+        lapack_int n = (lapack_int)A.rows();
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.cols();
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_strtri(LAPACK_ROW_MAJOR, uplo, diag, n, reinterpret_cast<float*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dtrtri(LAPACK_ROW_MAJOR, uplo, diag, n, reinterpret_cast<double*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_ctrtri(LAPACK_ROW_MAJOR, uplo, diag, n, reinterpret_cast<MKL_Complex8*>(pA), lda);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_ztrtri(LAPACK_ROW_MAJOR, uplo, diag, n, reinterpret_cast<MKL_Complex16*>(pA), lda);
+        }
+        else
+        {
+            // std::type_info is not streamable; log its name instead
+            GADGET_ERROR_MSG("TriangularInverse_trtri : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in TriangularInverse_trtri(hoMatrix<T>& A, char uplo) ...");
+        return false;
+    }
+    return true;
+}
+
+// Solves the linear system A * x = b with LAPACKE ?posv, using row-major
+// layout, uplo = 'L' and the column counts of A and b as leading dimensions.
+// Each column of b is one right-hand side.
+//
+// Both A and b are handed to LAPACK as writable buffers: per the ?posv
+// contract A is replaced by its Cholesky factor and b by the solution.
+//
+// An empty A or b is a no-op. Returns false if A is not square, the row counts
+// of A and b differ, T is unsupported, LAPACK reports info != 0, or an
+// exception is caught.
+template<typename T> 
+bool SymmetricHermitianPositiveDefiniteLinearSystem_posv(hoMatrix<T>& A, hoMatrix<T>& b)
+{
+    try
+    {
+        if( A.get_number_of_elements()==0 ) return true;
+        if( b.get_number_of_elements()==0 ) return true;
+        // ?posv works on an n x n system; reject non-square A up front, like the
+        // sibling potrf/potri/trtri wrappers do.
+        GADGET_CHECK_RETURN_FALSE(A.rows()==A.cols());
+        GADGET_CHECK_RETURN_FALSE(A.rows()==b.rows());
+
+        lapack_int info;
+        char uplo = 'L';
+        lapack_int n = (lapack_int)A.rows();
+        lapack_int nrhs = (lapack_int)b.cols();
+        T* pA = A.begin();
+        lapack_int lda = (lapack_int)A.cols();
+        T* pB = b.begin();
+        lapack_int ldb = (lapack_int)b.cols();
+
+        if ( typeid(T)==typeid(float) )
+        {
+            info = LAPACKE_sposv(LAPACK_ROW_MAJOR, uplo, n, nrhs, reinterpret_cast<float*>(pA), lda, reinterpret_cast<float*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(double) )
+        {
+            info = LAPACKE_dposv(LAPACK_ROW_MAJOR, uplo, n, nrhs, reinterpret_cast<double*>(pA), lda, reinterpret_cast<double*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(GT_Complex8) )
+        {
+            info = LAPACKE_cposv(LAPACK_ROW_MAJOR, uplo, n, nrhs, reinterpret_cast<MKL_Complex8*>(pA), lda, reinterpret_cast<MKL_Complex8*>(pB), ldb);
+        }
+        else if ( typeid(T)==typeid(GT_Complex16) )
+        {
+            info = LAPACKE_zposv(LAPACK_ROW_MAJOR, uplo, n, nrhs, reinterpret_cast<MKL_Complex16*>(pA), lda, reinterpret_cast<MKL_Complex16*>(pB), ldb);
+        }
+        else
+        {
+            // std::type_info is not streamable; log its name instead
+            GADGET_ERROR_MSG("SymmetricHermitianPositiveDefiniteLinearSystem_posv : unsupported type " << typeid(T).name());
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(info==0);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in SymmetricHermitianPositiveDefiniteLinearSystem_posv(hoMatrix<T>& A, hoMatrix<T>& b) ...");
+        return false;
+    }
+    return true;
+}
+
+#endif // USE_MKL
+
+}
diff --git a/toolboxes/core/cpu/hoNDArray.h b/toolboxes/core/cpu/hoNDArray.h
new file mode 100644
index 0000000..8c033f5
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray.h
@@ -0,0 +1,193 @@
+/** \file hoNDArray.h
+    \brief CPU-based N-dimensional array (data container)
+*/
+
+#pragma once
+
+#include "NDArray.h"
+#include "complext.h"
+#include "vector_td.h"
+#include "GadgetronCommon.h"
+#include "SerializableObject.h"
+
+#include "cpucore_export.h"
+
+#include <string.h>
+#include <float.h>
+#include <boost/shared_ptr.hpp>
+#include <stdexcept>
+
+#ifdef USE_MKL
+#include "mkl.h"
+#endif
+
+namespace Gadgetron{
+
+  // N-dimensional array of T held in CPU memory. Builds on NDArray<T> (shape and
+  // data bookkeeping) and SerializableObject (serialize / deserialize).
+  template <typename T> class hoNDArray : public NDArray<T>, public SerializableObject
+  {
+  public:
+
+    typedef NDArray<T> BaseClass;
+
+    hoNDArray();
+
+    // Construct from a dimension vector; the shape is handed to create().
+    hoNDArray(std::vector<size_t> &dimensions);
+    hoNDArray(std::vector<size_t> *dimensions);
+    hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+    // Construct around an existing buffer. delete_data_on_destruct tells the
+    // array whether it should release 'data' when it is destroyed.
+    hoNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    // Convenience constructors for 1 to 8 dimensions given as individual extents.
+    hoNDArray(size_t len);
+    hoNDArray(size_t sx, size_t sy);
+    hoNDArray(size_t sx, size_t sy, size_t sz);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+
+    // Same, but wrapping an existing buffer instead of creating one.
+    hoNDArray(size_t len, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct = false);
+    hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct = false);
+
+    virtual ~hoNDArray();
+
+    // Copy constructors
+    hoNDArray(const hoNDArray<T> &a);
+    hoNDArray(const hoNDArray<T> *a);
+
+    // Assignment operator
+    hoNDArray& operator=(const hoNDArray& rhs);
+
+    // (Re)shape the array. The overloads mirror the constructors above; note
+    // that create() additionally offers 9-dimensional variants.
+    virtual void create(std::vector<size_t>& dimensions);
+    virtual void create(std::vector<size_t> *dimensions);
+    virtual void create(boost::shared_ptr< std::vector<size_t> > dimensions);
+
+    virtual void create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false);
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false);
+    virtual void create(boost::shared_ptr<std::vector<size_t>  > dimensions, T* data, bool delete_data_on_destruct = false);
+
+    virtual void create(size_t len);
+    virtual void create(size_t sx, size_t sy);
+    virtual void create(size_t sx, size_t sy, size_t sz);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su);
+
+    virtual void create(size_t len, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct = false);
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, T* data, bool delete_data_on_destruct = false);
+
+    void fill(T value);
+
+    // Pointer-style access to the element range
+    T* begin();
+    const T* begin() const;
+
+    T* end();
+    const T* end() const;
+
+    // Element access by linear index
+    T& at( size_t idx );
+    const T& at( size_t idx ) const;
+
+    T& operator[]( size_t idx );
+
+    // NOTE(review): the operator() overloads are disabled here although
+    // copyFrom() below calls aArray(i); presumably NDArray<T> provides them -
+    // confirm.
+    //T& operator()( size_t idx );
+    //const T& operator()( size_t idx ) const;
+
+    //T& operator()( const std::vector<size_t>& ind );
+    //const T& operator()( const std::vector<size_t>& ind ) const;
+
+    // Element-wise converting copy from an array of another element type.
+    // Takes over the shape of aArray first when the dimensions differ, then
+    // static_casts every element to T. Always returns true.
+    template<typename T2> 
+      bool copyFrom(const hoNDArray<T2>& aArray) // Should be a void function
+    {
+      if ( !this->dimensions_equal(&aArray) ){
+        this->create(aArray.get_dimensions());
+      }      
+      for ( size_t i=0; i<elements_; i++ ){
+        data_[i] = static_cast<T>(aArray(i));
+      }
+      return true;
+    }
+  
+    void get_sub_array(const std::vector<size_t>& start, std::vector<size_t>& size, hoNDArray<T>& out);
+
+    virtual void print(std::ostream& os) const;
+    virtual void printContent(std::ostream& os) const;
+
+    virtual bool serialize(char*& buf, size_t& len) const;
+    virtual bool deserialize(char* buf, size_t& len);
+
+  protected:
+
+    // Make the members of the dependent base class usable without 'this->'
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    virtual void allocate_memory();
+    virtual void deallocate_memory();
+
+    // Generic allocator / deallocator
+    //
+
+    // Uses array new, so the elements are default-constructed. With
+    // std::nothrow a failed allocation yields a null pointer in *data instead
+    // of an exception.
+    template<class X> void _allocate_memory( size_t size, X** data )
+    {
+      *data = new (std::nothrow) X[size];
+    }
+
+    template<class X> void _deallocate_memory( X* data )
+    {
+      delete [] data;
+    }
+
+    // Overload these instances to avoid invoking the element class constructor/destructor
+    //
+
+    virtual void _allocate_memory( size_t size, float** data );
+    virtual void _deallocate_memory( float* data );
+
+    virtual void _allocate_memory( size_t size, double** data );
+    virtual void _deallocate_memory( double* data );
+
+    virtual void _allocate_memory( size_t size, std::complex<float>** data );
+    virtual void _deallocate_memory( std::complex<float>* data );
+
+    virtual void _allocate_memory( size_t size, std::complex<double>** data );
+    virtual void _deallocate_memory( std::complex<double>* data );
+
+    virtual void _allocate_memory( size_t size, float_complext** data );
+    virtual void _deallocate_memory( float_complext* data );
+
+    virtual void _allocate_memory( size_t size, double_complext** data );
+    virtual void _deallocate_memory( double_complext* data );
+
+    // vector_td elements get raw malloc/free storage: no constructors or
+    // destructors are run and the memory is left uninitialized.
+    template<class TYPE, unsigned int D> void _allocate_memory( size_t size, vector_td<TYPE,D>** data )
+    {
+      *data = (vector_td<TYPE,D>*) malloc( size*sizeof(vector_td<TYPE,D>) );
+    }
+
+    template<class TYPE, unsigned int D>  void _deallocate_memory( vector_td<TYPE,D>* data )
+    {
+      free( data );
+    }
+  };
+}
+
+#include "hoNDArray.hxx"
diff --git a/toolboxes/core/cpu/hoNDArray.hxx b/toolboxes/core/cpu/hoNDArray.hxx
new file mode 100644
index 0000000..51a525e
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray.hxx
@@ -0,0 +1,980 @@
+// This file must not be included by anything other than hoNDArray.h.
+// It contains the "private" implementation of the container.
+//
+
+namespace Gadgetron
+{
+  // Default constructor: only default-constructs the NDArray<T> base.
+  template <typename T> 
+  hoNDArray<T>::hoNDArray() : BaseClass() {}
+
+  // Constructs the array by handing the dimension vector pointer to create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(std::vector<size_t> *dimensions) : BaseClass()
+  {
+    this->create(dimensions);
+  }
+
+  // Constructs the array by handing the dimension vector to create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(std::vector<size_t> &dimensions) : BaseClass()
+  {
+    this->create(dimensions);
+  }
+
+  // Constructs the array by handing the shared dimension vector to create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions) : BaseClass()
+  {
+    this->create(dimensions);
+  }
+
+  // Constructs a 1-dimensional array with extent (len) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t len) : BaseClass()
+  {
+    std::vector<size_t> dim(1, len);
+    this->create(dim);
+  }
+
+  // Constructs a 2-dimensional array with extents (sx, sy) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy };
+    std::vector<size_t> dim(extents, extents + 2);
+    this->create(dim);
+  }
+
+  // Constructs a 3-dimensional array with extents (sx, sy, sz) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz };
+    std::vector<size_t> dim(extents, extents + 3);
+    this->create(dim);
+  }
+
+  // Constructs a 4-dimensional array with extents (sx, sy, sz, st) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st };
+    std::vector<size_t> dim(extents, extents + 4);
+    this->create(dim);
+  }
+
+  // Constructs a 5-dimensional array with extents (sx, sy, sz, st, sp) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp };
+    std::vector<size_t> dim(extents, extents + 5);
+    this->create(dim);
+  }
+
+  // Constructs a 6-dimensional array with extents (sx, sy, sz, st, sp, sq) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq };
+    std::vector<size_t> dim(extents, extents + 6);
+    this->create(dim);
+  }
+
+  // Constructs a 7-dimensional array with extents (sx, sy, sz, st, sp, sq, sr) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr };
+    std::vector<size_t> dim(extents, extents + 7);
+    this->create(dim);
+  }
+
+  // Constructs an 8-dimensional array with extents (sx, sy, sz, st, sp, sq, sr, ss) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss };
+    std::vector<size_t> dim(extents, extents + 8);
+    this->create(dim);
+  }
+
+  // Constructs the array around an existing buffer: dimensions, data pointer and
+  // the delete-on-destruct flag are handed unchanged to create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    this->create(dimensions, data, delete_data_on_destruct);
+  }
+
+  // Constructs the array around an existing buffer: dimensions, data pointer and
+  // the delete-on-destruct flag are handed unchanged to create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    this->create(dimensions, data, delete_data_on_destruct);
+  }
+
+  // Constructs the array around an existing buffer: the shared dimension vector,
+  // data pointer and delete-on-destruct flag are handed unchanged to create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    this->create(dimensions, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 1-dimensional array with extent (len) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t len, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    std::vector<size_t> dim(1, len);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 2-dimensional array with extents (sx, sy) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy };
+    std::vector<size_t> dim(extents, extents + 2);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 3-dimensional array with extents (sx, sy, sz) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz };
+    std::vector<size_t> dim(extents, extents + 3);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 4-dimensional array with extents (sx, sy, sz, st) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st };
+    std::vector<size_t> dim(extents, extents + 4);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 5-dimensional array with extents (sx, sy, sz, st, sp) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp };
+    std::vector<size_t> dim(extents, extents + 5);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 6-dimensional array with extents (sx, sy, sz, st, sp, sq) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq };
+    std::vector<size_t> dim(extents, extents + 6);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as a 7-dimensional array with extents (sx, sy, sz, st, sp, sq, sr) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr };
+    std::vector<size_t> dim(extents, extents + 7);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Wraps 'data' as an 8-dimensional array with extents (sx, sy, sz, st, sp, sq, sr, ss) via create().
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct) : BaseClass()
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss };
+    std::vector<size_t> dim(extents, extents + 8);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  // Destructor: calls deallocate_memory(), but only when
+  // delete_data_on_destruct_ is set.
+  template <typename T> 
+  hoNDArray<T>::~hoNDArray()
+  {
+    if ( delete_data_on_destruct_ ) deallocate_memory();
+  }
+
+  // Copy constructor from a pointer. Throws std::runtime_error for a null
+  // pointer; otherwise takes over the dimensions and offset factors of *a,
+  // calls allocate_memory() and copies the elements with memcpy.
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(const hoNDArray<T>  *a)
+  {
+    if ( a == 0 ){
+      throw std::runtime_error("hoNDArray<T>::hoNDArray(): 0x0 pointer provided");
+    }
+
+    data_ = 0;
+    dimensions_ = a->dimensions_;
+    offsetFactors_ = a->offsetFactors_;
+    allocate_memory();
+
+    // elements_ is read only after allocate_memory() has run
+    const size_t numBytes = elements_*sizeof(T);
+    memcpy( data_, a->data_, numBytes );
+  }
+
+  // Copy constructor: takes over the dimensions and offset factors of a, calls
+  // allocate_memory() and copies the elements with memcpy.
+  template <typename T> 
+  hoNDArray<T>::hoNDArray(const hoNDArray<T> &a)
+  {
+    data_ = 0;
+    dimensions_ = a.dimensions_;
+    offsetFactors_ = a.offsetFactors_;
+    allocate_memory();
+
+    // elements_ is read only after allocate_memory() has run
+    const size_t numBytes = elements_*sizeof(T);
+    memcpy( data_, a.data_, numBytes );
+  }
+
+  // Copy assignment.
+  //  - Self-assignment is a no-op.
+  //  - An rhs without elements clears this array.
+  //  - With equal dimensions the elements are copied into the existing buffer.
+  //  - Otherwise the buffer is released and re-created with the shape of rhs;
+  //    this throws std::runtime_error when delete_data_on_destruct_ is not set.
+  template <typename T> 
+  hoNDArray<T>& hoNDArray<T>::operator=(const hoNDArray<T>& rhs)
+  {
+    if ( this != &rhs ){
+      if ( rhs.get_number_of_elements() == 0 ){
+        this->clear();
+      }
+      else{
+        if ( !this->dimensions_equal(&rhs) ){
+          if ( !delete_data_on_destruct_ ){
+            throw std::runtime_error("Array dimensions mismatch in hoNDArray::operator=. Cannot change dimensions of non-destructable array.");        
+          }
+
+          // Take over the shape of rhs and get a matching buffer
+          deallocate_memory();
+          data_ = 0;
+          dimensions_ = rhs.dimensions_;
+          offsetFactors_ = rhs.offsetFactors_;
+          allocate_memory();
+        }
+
+        // Shapes agree at this point, so a flat copy is enough
+        memcpy( data_, rhs.data_, elements_*sizeof(T) );
+      }
+    }
+    return *this;
+  }
+
+  // (Re)creates the array with the given dimensions. Nothing happens when the
+  // array already has exactly these dimensions; otherwise it is cleared and
+  // re-created through the base class.
+  template <typename T> 
+  void hoNDArray<T>::create(std::vector<size_t>& dimensions)
+  {
+    if ( !this->dimensions_equal(&dimensions) )
+      {
+        this->clear();
+        BaseClass::create(dimensions);
+      }
+  }
+
+  // (Re)creates the array with the given dimensions. Nothing happens when the
+  // array already has exactly these dimensions; otherwise it is cleared and
+  // re-created through the base class.
+  template <typename T> 
+  void hoNDArray<T>::create(std::vector<size_t> *dimensions)
+  {
+    if ( !this->dimensions_equal(dimensions) )
+      {
+        this->clear();
+        BaseClass::create(dimensions);
+      }
+  }
+
+  // (Re)creates the array with the given dimensions. Nothing happens when the
+  // array already has exactly these dimensions; otherwise it is cleared and
+  // re-created through the base class.
+  template <typename T> 
+  void hoNDArray<T>::create(boost::shared_ptr< std::vector<size_t> > dimensions)
+  {
+    if ( !this->dimensions_equal(dimensions.get()) )
+      {
+        this->clear();
+        BaseClass::create(dimensions);
+      }
+  }
+
+  // Attaches the array to an existing buffer 'data' with the given dimensions.
+  //
+  // A buffer this array currently holds with delete_data_on_destruct_ set is
+  // released first - unless 'data' is that very buffer, in which case it is
+  // kept so the array does not end up pointing at freed memory.
+  // With unchanged dimensions only the data pointer and the flag are replaced;
+  // otherwise the base class performs the re-creation.
+  //
+  // Throws std::runtime_error when 'dimensions' or 'data' is a null pointer.
+  template <typename T> 
+  void hoNDArray<T>::create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct) 
+  {
+    if(!dimensions) throw std::runtime_error("hoNDArray<T>::create(): 0x0 pointer provided");
+    if(!data) throw std::runtime_error("hoNDArray<T>::create(): 0x0 pointer provided");
+
+    // Re-attaching the buffer that is already held must not free it
+    const bool releaseOld = this->delete_data_on_destruct_ && (this->data_ != data);
+
+    if ( this->dimensions_equal(dimensions) )
+      {
+        if ( releaseOld ){
+          this->deallocate_memory();
+        }
+        
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+      }
+    else
+      {
+        if ( releaseOld ){
+          this->deallocate_memory();
+          this->data_ = NULL;
+        }
+        
+        BaseClass::create(dimensions, data, delete_data_on_destruct);
+      }
+  }
+
+  // Attaches the array to an existing buffer 'data' with the given dimensions.
+  //
+  // A buffer this array currently holds with delete_data_on_destruct_ set is
+  // released first - unless 'data' is that very buffer, in which case it is
+  // kept so the array does not end up pointing at freed memory.
+  // With unchanged dimensions only the data pointer and the flag are replaced;
+  // otherwise the base class performs the re-creation.
+  //
+  // Throws std::runtime_error when 'data' is a null pointer.
+  template <typename T> 
+  void hoNDArray<T>::create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct) 
+  {
+    if(!data) throw std::runtime_error("hoNDArray<T>::create(): 0x0 pointer provided");
+
+    // Re-attaching the buffer that is already held must not free it
+    const bool releaseOld = this->delete_data_on_destruct_ && (this->data_ != data);
+
+    if ( this->dimensions_equal(&dimensions) )
+      {
+        if ( releaseOld ){
+          this->deallocate_memory();
+        }
+        
+        this->data_ = data;
+        this->delete_data_on_destruct_ = delete_data_on_destruct;
+      }
+    else
+      {
+        if ( releaseOld ){
+          this->deallocate_memory();
+          this->data_ = NULL;
+        }
+        
+        BaseClass::create(dimensions, data, delete_data_on_destruct);
+      }
+  }
+
+  /**
+   * Shared-pointer flavour of the buffer-adopting create(); unwraps the pointer
+   * and delegates to the raw-pointer overload.
+   */
+  template <typename T> 
+  inline void hoNDArray<T>::create(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct)
+  {
+    std::vector<size_t> *raw_dimensions = dimensions.get();
+    this->create(raw_dimensions, data, delete_data_on_destruct);
+  }
+
+  /**
+   * Convenience overloads of create() for 1 to 9 dimensions.
+   * Each one packs its size arguments, in argument order, into a dimension vector
+   * and delegates to create(std::vector<size_t>&).
+   */
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t len)
+  {
+    std::vector<size_t> dim(1, len);
+    this->create(dim);
+  }
+
+  /// 2D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy)
+  {
+    const size_t extents[] = { sx, sy };
+    std::vector<size_t> dim(extents, extents+2);
+    this->create(dim);
+  }
+
+  /// 3D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz)
+  {
+    const size_t extents[] = { sx, sy, sz };
+    std::vector<size_t> dim(extents, extents+3);
+    this->create(dim);
+  }
+
+  /// 4D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st)
+  {
+    const size_t extents[] = { sx, sy, sz, st };
+    std::vector<size_t> dim(extents, extents+4);
+    this->create(dim);
+  }
+
+  /// 5D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp };
+    std::vector<size_t> dim(extents, extents+5);
+    this->create(dim);
+  }
+
+  /// 6D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq };
+    std::vector<size_t> dim(extents, extents+6);
+    this->create(dim);
+  }
+
+  /// 7D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr };
+    std::vector<size_t> dim(extents, extents+7);
+    this->create(dim);
+  }
+
+  /// 8D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss };
+    std::vector<size_t> dim(extents, extents+8);
+    this->create(dim);
+  }
+
+  /// 9D convenience overload, see create(size_t).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss, su };
+    std::vector<size_t> dim(extents, extents+9);
+    this->create(dim);
+  }
+
+  /**
+   * Convenience overloads of the buffer-adopting create() for 1 to 9 dimensions.
+   * Each one packs its size arguments, in argument order, into a dimension vector
+   * and delegates to create(std::vector<size_t>*, T*, bool).
+   */
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t len, T* data, bool delete_data_on_destruct)
+  {
+    std::vector<size_t> dim(1, len);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 2D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy };
+    std::vector<size_t> dim(extents, extents+2);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 3D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz };
+    std::vector<size_t> dim(extents, extents+3);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 4D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz, st };
+    std::vector<size_t> dim(extents, extents+4);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 5D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp };
+    std::vector<size_t> dim(extents, extents+5);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 6D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq };
+    std::vector<size_t> dim(extents, extents+6);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 7D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr };
+    std::vector<size_t> dim(extents, extents+7);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 8D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss };
+    std::vector<size_t> dim(extents, extents+8);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /// 9D convenience overload, see create(size_t, T*, bool).
+  template <typename T> 
+  inline void hoNDArray<T>::create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, size_t su, T* data, bool delete_data_on_destruct)
+  {
+    const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss, su };
+    std::vector<size_t> dim(extents, extents+9);
+    this->create(&dim, data, delete_data_on_destruct);
+  }
+
+  /**
+   * Assigns `value` to every element of the array.
+   * @param value Value copied into each of the get_number_of_elements() entries.
+   */
+  template <typename T> 
+  void hoNDArray<T>::fill(T value)
+  {
+    T* const first = this->get_data_ptr();
+    T* const last = first + this->get_number_of_elements();
+    std::fill(first, last, value);
+  }
+
+  /// Returns a pointer to the first element of the data buffer.
+  template <typename T> 
+  inline T* hoNDArray<T>::begin()
+  {
+    T* const first = this->data_;
+    return first;
+  }
+
+  /// Returns a read-only pointer to the first element of the data buffer.
+  template <typename T> 
+  inline const T* hoNDArray<T>::begin() const
+  {
+    const T* const first = this->data_;
+    return first;
+  }
+
+  /// Returns a pointer one past the last element (data_ advanced by elements_).
+  template <typename T> 
+  inline T* hoNDArray<T>::end()
+  {
+    T* const first = this->data_;
+    return first + this->elements_;
+  }
+
+  /// Returns a read-only pointer one past the last element (data_ advanced by elements_).
+  template <typename T> 
+  inline const T* hoNDArray<T>::end() const
+  {
+    const T* const first = this->data_;
+    return first + this->elements_;
+  }
+
+  /**
+   * Returns a reference to the element at linear index `idx`.
+   * The index is only verified through GADGET_DEBUG_CHECK_THROW.
+   */
+  template <typename T> 
+  inline T& hoNDArray<T>::at( size_t idx )
+  {
+    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    return *(this->get_data_ptr() + idx);
+  }
+
+  /**
+   * Returns a read-only reference to the element at linear index `idx`.
+   * The index is only verified through GADGET_DEBUG_CHECK_THROW.
+   */
+  template <typename T> 
+  inline const T& hoNDArray<T>::at( size_t idx ) const
+  {
+    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    return *(this->get_data_ptr() + idx);
+  }
+
+  /**
+   * Returns a reference to the element at linear index `idx`.
+   * The index is only verified through GADGET_DEBUG_CHECK_THROW.
+   */
+  template <typename T> 
+  inline T& hoNDArray<T>::operator[]( size_t idx )
+  {
+    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+    return *(this->get_data_ptr() + idx);
+  }
+
+  //template <typename T> 
+  //inline T& hoNDArray<T>::operator()( size_t idx )
+  //{
+  //    /*if( idx >= this->get_number_of_elements() )
+  //    {
+  //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+  //    }*/
+  //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+  //    return this->get_data_ptr()[idx];
+  //}
+
+  //template <typename T> 
+  //inline const T& hoNDArray<T>::operator()( size_t idx ) const
+  //{
+  //    /*if( idx >= this->get_number_of_elements() )
+  //    {
+  //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+  //    }*/
+  //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+  //    return this->get_data_ptr()[idx];
+  //}
+
+  //template <typename T> 
+  //inline T& hoNDArray<T>::operator()( const std::vector<size_t>& ind )
+  //{
+  //    size_t idx = this->calculate_offset(ind);
+  //    /*if( idx >= this->get_number_of_elements() )
+  //    {
+  //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+  //    }*/
+  //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+  //    return this->get_data_ptr()[idx];
+  //}
+
+  //template <typename T> 
+  //inline const T& hoNDArray<T>::operator()( const std::vector<size_t>& ind ) const
+  //{
+  //    size_t idx = this->calculate_offset(ind);
+  //    /*if( idx >= this->get_number_of_elements() )
+  //    {
+  //    BOOST_THROW_EXCEPTION( runtime_error("hoNDArray::operator(): index out of range."));
+  //    }*/
+  //    GADGET_DEBUG_CHECK_THROW(idx < this->get_number_of_elements());
+  //    return this->get_data_ptr()[idx];
+  //}
+
+  /**
+   * Copies the rectangular region starting at index `start` and spanning `size`
+   * elements along every dimension into `out`.
+   *
+   * The region is validated before `out` is touched. `out` is (re)created with the
+   * dimensions in `size`; when the region covers the whole array `out` becomes a plain
+   * copy of this array. Elements are addressed with the first dimension varying fastest.
+   *
+   * @param start First index of the region in each dimension.
+   * @param size Extent of the region in each dimension; every entry must be at least 1.
+   * @param out Receives the copied region.
+   * @throws runtime_error (via BOOST_THROW_EXCEPTION) if `start`, `size` and the array
+   *         disagree on the number of dimensions, or if the region is empty or exceeds the array.
+   */
+  template <typename T> 
+  void hoNDArray<T>::get_sub_array(const std::vector<size_t>& start, std::vector<size_t>& size, hoNDArray<T>& out)
+  {
+    if ( start.size() != size.size() ){
+      BOOST_THROW_EXCEPTION( runtime_error("hoNDArray<>::get_sub_array failed"));
+    }
+
+    if ( start.size() != (*dimensions_).size() ){
+      BOOST_THROW_EXCEPTION( runtime_error("hoNDArray<>::get_sub_array failed"));
+    }
+
+    const size_t NDim = start.size();
+    size_t ii;
+
+    // Validate first so that a bad request leaves `out` unmodified. The comparison is
+    // written without start+size so it cannot wrap around.
+    for ( ii=0; ii<NDim; ii++ ){
+      if ( size[ii] == 0 || start[ii] >= (*dimensions_)[ii] || size[ii] > (*dimensions_)[ii]-start[ii] ){
+        BOOST_THROW_EXCEPTION( runtime_error("hoNDArray<>::get_sub_array failed"));
+      }
+    }
+
+    out.create(&size);
+
+    if ( out.get_number_of_elements() == this->get_number_of_elements() ){
+      out = *this;
+      return;
+    }
+
+    if ( NDim == 0 ){
+      return;
+    }
+
+    // Linear stride (in elements) of every dimension of this array.
+    std::vector<size_t> stride(NDim, 1);
+    for ( ii=1; ii<NDim; ii++ ){
+      stride[ii] = stride[ii-1] * (*dimensions_)[ii-1];
+    }
+
+    // Runs along the first dimension are contiguous in both arrays, copy them as a whole.
+    const size_t rowLen = size[0];
+    const size_t numRows = out.get_number_of_elements()/rowLen;
+    std::vector<size_t> cur(NDim, 0);   // position of the current row inside the region
+    T* pOut = out.begin();
+
+    for ( size_t row=0; row<numRows; row++ ){
+      size_t offset = start[0];
+      for ( ii=1; ii<NDim; ii++ ){
+        offset += (start[ii]+cur[ii])*stride[ii];
+      }
+      memcpy(pOut+row*rowLen, this->data_+offset, sizeof(T)*rowLen);
+
+      // Step to the next row: increment dimension 1 and carry into the higher ones.
+      for ( ii=1; ii<NDim; ii++ ){
+        if ( ++cur[ii] < size[ii] ) break;
+        cur[ii] = 0;
+      }
+    }
+  }
+
+  /**
+   * Writes a textual summary of the array (number of dimensions, size of each
+   * dimension, element type name, bytes per element, total bytes) to `os`.
+   * The stream is switched from scientific to fixed notation and left that way.
+   * @param os Stream receiving the summary.
+   */
+  template <typename T> 
+  void hoNDArray<T>::print(std::ostream& os) const
+  {
+    os.unsetf(std::ios::scientific);
+    os.setf(std::ios::fixed);
+
+    const size_t numDims = dimensions_->size();
+
+    os << "--------------Gagdgetron ND Array -------------" << std::endl;
+    os << "Array dimension is : " << numDims << std::endl;
+
+    os << "Array size is : ";
+    for ( size_t d=0; d<numDims; d++ )
+      {
+        os << (*dimensions_)[d] << " ";
+      }
+    os << std::endl;
+
+    const int bytesPerElement = sizeof(T);
+    const std::string typeName(typeid(T).name());
+
+    os << "Array data type is : " << typeName << std::endl;
+    os << "Byte number for each element is : " << bytesPerElement << std::endl;
+    os << "Number of array size in bytes is : " << elements_*bytesPerElement << std::endl;
+
+    os << std::endl;
+  }
+
+  /**
+   * Switches `os` to fixed notation and writes the summary produced by print().
+   * No element values are written.
+   * @param os Stream receiving the output.
+   */
+  template <typename T> 
+  void hoNDArray<T>::printContent(std::ostream& os) const
+  {
+    os.unsetf(std::ios::scientific);
+    os.setf(std::ios::fixed);
+
+    this->print(os);
+  }
+
+  /**
+   * Releases any current buffer and allocates a new one sized for dimensions_.
+   * elements_ is set to the product of all dimension sizes; an array without
+   * dimensions gets zero elements and no buffer. On a successful allocation the
+   * array takes ownership of the buffer (delete_data_on_destruct_ is set).
+   * The new buffer is not initialised.
+   * @throws bad_alloc (via BOOST_THROW_EXCEPTION) if the allocation returns NULL.
+   */
+  template <typename T> 
+  void hoNDArray<T>::allocate_memory()
+  {
+    deallocate_memory();
+
+    // Without this guard an empty dimension vector would be indexed at [0].
+    if ( !this->dimensions_ || this->dimensions_->empty() )
+      {
+        this->elements_ = 0;
+        return;
+      }
+
+    this->elements_ = (*this->dimensions_)[0];
+    for (size_t i = 1; i < this->dimensions_->size(); i++)
+      {
+        this->elements_ *= (*this->dimensions_)[i];
+      }
+
+    if ( this->elements_ > 0 )
+      {
+        this->_allocate_memory(this->elements_, &this->data_);
+
+        if( this->data_ == 0x0 )
+          {
+            BOOST_THROW_EXCEPTION( bad_alloc("hoNDArray<>::allocate memory failed"));
+          }
+
+        this->delete_data_on_destruct_ = true;
+      }
+  }
+
+  /**
+   * Frees the data buffer through _deallocate_memory() and resets data_ to null.
+   * Does nothing when data_ is already null.
+   */
+  template <typename T> 
+  void hoNDArray<T>::deallocate_memory()
+  {
+    if( !this->data_ ){
+      return;
+    }
+
+    this->_deallocate_memory( this->data_ );
+    this->data_ = 0x0;
+  }
+
+  /**
+   * Low-level allocation helpers, one pair per supported element type.
+   * _allocate_memory() stores a buffer of `size` elements in *data (NULL on failure, the
+   * caller checks); _deallocate_memory() releases a buffer obtained from its counterpart.
+   * With USE_MKL the float/double/std::complex flavours use mkl_malloc/mkl_free; buffers
+   * are requested on a 64-byte boundary because the previously requested alignment of 4
+   * bytes is smaller than the natural alignment of double and std::complex<double>.
+   */
+  template <typename T> 
+  inline void hoNDArray<T>::_allocate_memory( size_t size, float** data )
+  {
+#ifdef USE_MKL
+    *data = (float*) mkl_malloc(size*sizeof(float), 64);
+#else
+    *data = (float*) malloc( size*sizeof(float) );
+#endif
+  }
+
+  /// Releases a float buffer obtained from _allocate_memory().
+  template <typename T> 
+  inline void hoNDArray<T>::_deallocate_memory( float* data )
+  {
+#ifdef USE_MKL
+    mkl_free(data);
+#else
+    free(data);
+#endif
+  }
+
+  /// Allocates `size` doubles, see the float flavour for details.
+  template <typename T> 
+  inline void hoNDArray<T>::_allocate_memory( size_t size, double** data )
+  {
+#ifdef USE_MKL
+    *data = (double*) mkl_malloc(size*sizeof(double), 64);
+#else
+    *data = (double*) malloc( size*sizeof(double) );
+#endif
+  }
+
+  /// Releases a double buffer obtained from _allocate_memory().
+  template <typename T> 
+  inline void hoNDArray<T>::_deallocate_memory( double* data )
+  {
+#ifdef USE_MKL
+    mkl_free(data);
+#else
+    free(data);
+#endif
+  }
+
+  /// Allocates `size` std::complex<float> values, see the float flavour for details.
+  template <typename T> 
+  inline void hoNDArray<T>::_allocate_memory( size_t size, std::complex<float>** data )
+  {
+#ifdef USE_MKL
+    *data = (std::complex<float>*) mkl_malloc(size*sizeof(std::complex<float>), 64);
+#else
+    *data = (std::complex<float>*) malloc( size*sizeof(std::complex<float>) );
+#endif
+  }
+
+  /// Releases a std::complex<float> buffer obtained from _allocate_memory().
+  template <typename T> 
+  inline void hoNDArray<T>::_deallocate_memory( std::complex<float>* data )
+  {
+#ifdef USE_MKL
+    mkl_free(data);
+#else
+    free(data);
+#endif
+  }
+
+  /// Allocates `size` std::complex<double> values, see the float flavour for details.
+  template <typename T> 
+  inline void hoNDArray<T>::_allocate_memory( size_t size, std::complex<double>** data )
+  {
+#ifdef USE_MKL
+    *data = (std::complex<double>*) mkl_malloc(size*sizeof(std::complex<double>), 64);
+#else
+    *data = (std::complex<double>*) malloc( size*sizeof(std::complex<double>) );
+#endif
+  }
+
+  /// Releases a std::complex<double> buffer obtained from _allocate_memory().
+  template <typename T> 
+  inline void hoNDArray<T>::_deallocate_memory( std::complex<double>* data )
+  {
+#ifdef USE_MKL
+    mkl_free(data);
+#else
+    free(data);
+#endif
+  }
+
+  /// Allocates `size` float_complext values with malloc (also when USE_MKL is defined).
+  template <typename T> 
+  inline void hoNDArray<T>::_allocate_memory( size_t size, float_complext** data )
+  {
+    *data = (float_complext*) malloc( size*sizeof(float_complext) );
+  }
+
+  /// Releases a float_complext buffer obtained from _allocate_memory().
+  template <typename T> 
+  inline void hoNDArray<T>::_deallocate_memory( float_complext* data )
+  {
+    free( data );
+  }
+
+  /// Allocates `size` double_complext values with malloc (also when USE_MKL is defined).
+  template <typename T> 
+  inline void hoNDArray<T>::_allocate_memory( size_t size, double_complext** data )
+  {
+    *data = (double_complext*) malloc( size*sizeof(double_complext) );
+  }
+
+  /// Releases a double_complext buffer obtained from _allocate_memory().
+  template <typename T> 
+  inline void hoNDArray<T>::_deallocate_memory( double_complext* data )
+  {
+    free( data );
+  }
+
+  /**
+   * Serialises the array into a newly allocated byte buffer.
+   * Layout: number of dimensions (size_t), the dimension sizes (size_t each), then the
+   * raw element data. Dimension sizes and data are only written for NDim > 0.
+   * @param buf In: buffer released with delete[] (may be NULL). Out: buffer allocated with new[].
+   * @param len Out: size of the new buffer in bytes.
+   * @return Always true.
+   */
+  template <typename T> 
+  bool hoNDArray<T>::serialize(char*& buf, size_t& len) const
+  {
+    // delete[] on a null pointer is a no-op, so no explicit test is required.
+    delete[] buf;
+
+    const size_t NDim = dimensions_->size();
+    const size_t dimBytes = sizeof(size_t)*NDim;
+    const size_t dataBytes = sizeof(T)*elements_;
+
+    // number of dimensions + dimension vector + contents
+    len = sizeof(size_t) + dimBytes + dataBytes;
+    buf = new char[len];
+
+    char* cursor = buf;
+    memcpy(cursor, &NDim, sizeof(size_t));
+    cursor += sizeof(size_t);
+
+    if ( NDim > 0 )
+      {
+        memcpy(cursor, &((*dimensions_)[0]), dimBytes);
+        cursor += dimBytes;
+        memcpy(cursor, this->data_, dataBytes);
+      }
+
+    return true; // Temporary. Should not be a boolean function.
+  }
+
+  /**
+   * Rebuilds the array from a buffer laid out as written by serialize(): number of
+   * dimensions, dimension sizes, raw element data. A dimension count of zero clears the array.
+   * The buffer length is not checked against its contents.
+   * @param buf Buffer to read from.
+   * @param len Out: number of bytes the serialised array occupies.
+   * @return Always true.
+   */
+  template <typename T> 
+  bool hoNDArray<T>::deserialize(char* buf, size_t& len)
+  {
+    size_t NDim;
+    memcpy(&NDim, buf, sizeof(size_t));
+
+    const size_t dimBytes = sizeof(size_t)*NDim;
+    const char* cursor = buf + sizeof(size_t);
+
+    if ( NDim == 0 )
+      {
+        this->clear();
+      }
+    else
+      {
+        std::vector<size_t> dimensions(NDim);
+        memcpy(&dimensions[0], cursor, dimBytes);
+        cursor += dimBytes;
+
+        // allocate memory, then copy the content
+        this->create(&dimensions);
+        memcpy(this->data_, cursor, sizeof(T)*elements_);
+      }
+
+    len = sizeof(size_t)+dimBytes+sizeof(T)*elements_;
+    return true; // Temporary. Should not be a boolean function.
+  }  
+}
diff --git a/toolboxes/core/cpu/hoNDArray_fileio.h b/toolboxes/core/cpu/hoNDArray_fileio.h
new file mode 100644
index 0000000..6a8e1af
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray_fileio.h
@@ -0,0 +1,65 @@
+#ifndef HONDARRAY_FILEIO_H
+#define HONDARRAY_FILEIO_H
+#pragma once
+
+#include "hoNDArray.h"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string.h>
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+/**
+ * Writes an array to a binary file.
+ * File layout: the number of dimensions followed by the size of each dimension (all as
+ * int), then the raw element data.
+ * @param a Array to write.
+ * @param filename Path of the file to create.
+ * @return 0 on success, -1 if the file could not be opened.
+ */
+template<class T> int write_nd_array(hoNDArray<T> *a, const char* filename)
+{
+  const size_t ndims = a->get_number_of_dimensions();
+
+  // A vector releases the header on every return path; the former new[] buffer
+  // leaked when the file could not be opened.
+  std::vector<int> header(ndims+1);
+
+  header[0] = static_cast<int>(ndims);
+  for (int i = 0; i < header[0]; i++)
+  {
+    header[i+1] = static_cast<int>(a->get_size(i));
+  }
+
+  std::fstream f(filename,std::ios::out | std::ios::binary);
+
+  if( !f.is_open() ){
+    std::cout << "ERROR: Cannot write file " << filename << std::endl;
+    return -1;
+  }
+
+  f.write(reinterpret_cast<char*>(&header[0]),sizeof(int)*header.size());
+  f.write(reinterpret_cast<char*>(a->get_data_ptr()),sizeof(T)*a->get_number_of_elements());
+  
+  f.close();
+  
+  return 0;
+}
+
+/**
+ * Reads an array from a binary file laid out as produced by write_nd_array(): the
+ * number of dimensions and the size of each dimension (all as int), then the raw
+ * element data.
+ * @param filename Path of the file to read.
+ * @return The array, or an empty pointer if the file cannot be opened or its header
+ *         is truncated or contains negative values.
+ */
+template <class T> boost::shared_ptr< hoNDArray<T> > read_nd_array(const char* filename)
+{
+  int dimensions = 0, tmp = 0;
+  std::vector<size_t> dim_array;
+  std::fstream f(filename,std::ios::in | std::ios::binary);
+
+  if( !f.is_open() ){
+    std::cout << "ERROR: Cannot open file " << filename << std::endl;
+    return boost::shared_ptr< hoNDArray<T> >();
+  }
+
+  // A failed read would leave the loop bound undefined, and a negative value would
+  // turn into a huge size_t, so the header is validated as it is read.
+  f.read(reinterpret_cast<char*>(&dimensions),sizeof(int));
+  if( !f || dimensions < 0 ){
+    std::cout << "ERROR: Invalid header in file " << filename << std::endl;
+    return boost::shared_ptr< hoNDArray<T> >();
+  }
+
+  for (int i = 0; i < dimensions; i++)
+  {
+    f.read(reinterpret_cast<char*>(&tmp),sizeof(int));
+    if( !f || tmp < 0 ){
+      std::cout << "ERROR: Invalid header in file " << filename << std::endl;
+      return boost::shared_ptr< hoNDArray<T> >();
+    }
+    dim_array.push_back(static_cast<size_t>(tmp));
+  }
+
+  boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>(&dim_array) );
+  f.read(reinterpret_cast<char*>(out->get_data_ptr()),sizeof(T)*out->get_number_of_elements());
+  
+  return out;
+}
+}
+#endif
diff --git a/toolboxes/core/cpu/hoNDArray_utils.h b/toolboxes/core/cpu/hoNDArray_utils.h
new file mode 100644
index 0000000..a362439
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDArray_utils.h
@@ -0,0 +1,485 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron {
+
+  /**
+   * Steps through the linear element offsets of an N-dimensional array while
+   * incrementing its dimensions in a caller-defined order.
+   * Offsets are computed with the first dimension varying fastest.
+   */
+  class ArrayIterator
+  {
+  public:
+
+    /**
+     * @param dimensions Sizes of the array dimensions; entries [0, order->size()) are read.
+     * @param order Dimension indices in the order they are incremented by advance();
+     *              order[0] is incremented first.
+     */
+    ArrayIterator(std::vector<size_t> *dimensions, std::vector<size_t> *order)
+    {
+      dimensions_  = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+      order_       = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+      current_     = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+      block_sizes_ = boost::shared_ptr< std::vector<size_t> > (new std::vector<size_t>);
+
+      // block_sizes_[i] is the linear stride of dimension i (product of the lower dimensions).
+      block_sizes_->push_back(1);
+      for (size_t i = 0; i < order->size(); i++) {
+        dimensions_->push_back((*dimensions)[i]);
+        order_->push_back((*order)[i]);
+        current_->push_back(0);
+        if (i > 0) {
+          block_sizes_->push_back((*block_sizes_)[i-1]*(*dimensions_)[i-1]);
+        }
+      }
+      current_idx_ = 0;
+    }
+
+    /**
+     * Moves to the next position: increments the first dimension in the iteration
+     * order and carries into the following ones on overflow.
+     * @return Linear offset of the new position.
+     */
+    inline size_t advance()
+    {
+      const size_t ndims = dimensions_->size();
+      if (ndims == 0) {
+        // Nothing to step through (and the modulo below would divide by zero).
+        return current_idx_;
+      }
+
+      size_t order_index = 0;
+      size_t carries = 0;
+      (*current_)[(*order_)[order_index]]++;
+      while ((*current_)[(*order_)[order_index]] >= (*dimensions_)[(*order_)[order_index]]) {
+        (*current_)[(*order_)[order_index]] = 0;
+        // A dimension larger than 1 carries at most once per call, so 2*ndims carries
+        // can only be reached when every dimension has size <= 1. The loop used to spin
+        // forever in that case; stop with all indices at zero instead.
+        if (++carries >= 2*ndims) {
+          break;
+        }
+        order_index = (order_index+1)%ndims;
+        (*current_)[(*order_)[order_index]]++;
+      }
+
+      current_idx_ = 0;
+      for (size_t i = 0; i < ndims; i++) {
+        current_idx_ += (*current_)[i]*(*block_sizes_)[i];
+      }	
+      return current_idx_;
+    }
+
+    /// Linear offset of the current position.
+    inline size_t get_current_idx() {
+      return current_idx_;
+    }
+
+    /// Per-dimension indices of the current position (shared with the iterator, not a copy).
+    boost::shared_ptr< std::vector<size_t> > get_current_sub() {
+      return current_;
+    }
+
+  protected:
+    boost::shared_ptr< std::vector<size_t> > dimensions_;   // size of each dimension
+    boost::shared_ptr< std::vector<size_t> > order_;        // order in which dimensions are incremented
+    boost::shared_ptr< std::vector<size_t> > current_;      // current index in each dimension
+    boost::shared_ptr< std::vector<size_t> > block_sizes_;  // linear stride of each dimension
+    size_t current_idx_;                                    // linear offset matching current_
+  };
+
+  /**
+   * Returns a copy of `in` with its dimensions cyclically shifted: output dimension i
+   * is input dimension (i+shift) modulo the number of dimensions.
+   * @param in Input array; must not be null.
+   * @param shift Number of positions to shift; may be negative.
+   * @throws std::runtime_error if `in` is null.
+   */
+  template<class T> boost::shared_ptr< hoNDArray<T> > shift_dim( hoNDArray<T> *in, int shift )  
+  {
+    if( in == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid input pointer provided");
+    }    
+
+    const size_t ndims = in->get_number_of_dimensions();
+
+    // Reduce the shift to [0, ndims) in signed arithmetic. Adding a negative int to a
+    // size_t wraps around 2^64, which is not congruent modulo ndims unless ndims is a
+    // power of two.
+    size_t offset = 0;
+    if( ndims > 0 ) {
+      const long long n = static_cast<long long>(ndims);
+      const long long s = static_cast<long long>(shift) % n;
+      offset = static_cast<size_t>( s < 0 ? s+n : s );
+    }
+
+    std::vector<size_t> order;
+    for (size_t i = 0; i < ndims; i++) {
+      order.push_back((i+offset)%ndims);
+    }
+    return permute(in,&order);
+  }
+
+  /**
+   * Writes `in` into `out` with its dimensions cyclically shifted: output dimension i
+   * is input dimension (i+shift) modulo the number of dimensions.
+   * @param in Input array; must not be null.
+   * @param out Output array; must not be null. Passed on to permute(in, out, order).
+   * @param shift Number of positions to shift; may be negative.
+   * @throws std::runtime_error if `in` or `out` is null.
+   */
+  template<class T> void shift_dim( hoNDArray<T> *in, hoNDArray<T> *out, int shift )
+  {
+    if( in == 0x0 || out == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid pointer provided");
+    }    
+
+    const size_t ndims = in->get_number_of_dimensions();
+
+    // Reduce the shift to [0, ndims) in signed arithmetic. Adding a negative int to a
+    // size_t wraps around 2^64, which is not congruent modulo ndims unless ndims is a
+    // power of two.
+    size_t offset = 0;
+    if( ndims > 0 ) {
+      const long long n = static_cast<long long>(ndims);
+      const long long s = static_cast<long long>(shift) % n;
+      offset = static_cast<size_t>( s < 0 ? s+n : s );
+    }
+
+    std::vector<size_t> order;
+    for (size_t i = 0; i < ndims; i++) {
+      order.push_back((i+offset)%ndims);
+    }
+    permute(in,out,&order);
+  }
+
+  /**
+   * Returns a newly allocated array holding `in` with its dimensions reordered.
+   * The output gets one dimension per entry of `dim_order`, sized like the input
+   * dimension that entry names; the copy is done by the in/out overload of permute().
+   * @param in Input array; must not be null.
+   * @param dim_order Input dimension index for each output dimension; must not be null.
+   * @param shift_mode Passed through to the in/out overload.
+   * @throws std::runtime_error if a pointer is null.
+   */
+  template<class T> boost::shared_ptr< hoNDArray<T> > 
+  permute( hoNDArray<T> *in, std::vector<size_t> *dim_order, int shift_mode = 0) 
+  {
+    if( in == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");
+    }    
+
+    std::vector<size_t> out_dims;
+    for (std::vector<size_t>::size_type k = 0; k != dim_order->size(); ++k) {
+      const size_t source_dim = dim_order->at(k);
+      out_dims.push_back(in->get_dimensions()->at(source_dim));
+    }
+
+    boost::shared_ptr< hoNDArray<T> > result( new hoNDArray<T>() );    
+    result->create(&out_dims);
+    permute( in, result.get(), dim_order, shift_mode );
+    return result;
+  }
+
+  /**
+   * Copies `in` into `out` with its dimensions reordered according to `dim_order`.
+   * `dim_order` may name fewer dimensions than `in` has; the dimensions it does not
+   * mention are appended in ascending order.
+   * @param in Input array; must not be null.
+   * @param out Output array; must not be null. Its size in dimension i must equal the
+   *            size of input dimension dim_order[i].
+   * @param dim_order Input dimension index for each output dimension; must not be null,
+   *                  must not be longer than the input's dimension count, and must not
+   *                  contain out-of-range or repeated indices.
+   * @param shift_mode Not referenced by this function.
+   * @throws std::runtime_error if any of the conditions above is violated.
+   */
+  template<class T> void 
+  permute( hoNDArray<T> *in, hoNDArray<T> *out, std::vector<size_t> *dim_order, int shift_mode = 0) 
+  {
+    if( in == 0x0 || out == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");;
+    }    
+
+    // Check ordering array
+    if (dim_order->size() > in->get_number_of_dimensions()) {
+      throw std::runtime_error("hoNDArray::permute - Invalid length of dimension ordering array");;
+    }
+
+    // dim_count[d] = how many times input dimension d is named in dim_order
+    std::vector<size_t> dim_count(in->get_number_of_dimensions(),0);
+    for (size_t i = 0; i < dim_order->size(); i++) {
+      if ((*dim_order)[i] >= in->get_number_of_dimensions()) {
+        throw std::runtime_error("hoNDArray::permute - Invalid dimension order array");;
+      }
+      dim_count[(*dim_order)[i]]++;
+    }
+
+    // Create an internal array to store the dimensions
+    std::vector<size_t> dim_order_int;
+
+    // Check that there are no duplicate dimensions
+    for (size_t i = 0; i < dim_order->size(); i++) {
+      if (dim_count[(*dim_order)[i]] != 1) {
+        throw std::runtime_error("hoNDArray::permute - Invalid dimension order array (duplicates)");;
+
+      }
+      dim_order_int.push_back((*dim_order)[i]);
+    }
+
+    // Only the dimensions named in dim_order are compared against the output here.
+    // NOTE(review): the total element count of `out` is not compared with that of `in`,
+    // while the copy loop below writes in->get_number_of_elements() entries - confirm
+    // callers always pass a matching `out` when dim_order is shorter than the input rank.
+    for (size_t i = 0; i < dim_order_int.size(); i++) {
+      if ((*in->get_dimensions())[dim_order_int[i]] != out->get_size(i)) {
+        throw std::runtime_error("permute(): dimensions of output array do not match the input array");;
+      }
+    }
+
+    // Pad dimension order array with dimension not mentioned in order array
+    if (dim_order_int.size() < in->get_number_of_dimensions()) {
+      for (size_t i = 0; i < dim_count.size(); i++) {
+        if (dim_count[i] == 0) {
+          dim_order_int.push_back(i);
+        }
+      }
+    }
+
+    T* o = out->get_data_ptr();
+
+    // The output is filled linearly while the iterator walks the input offsets with
+    // the dimensions incremented in the permuted order.
+    ArrayIterator it(in->get_dimensions().get(),&dim_order_int);
+    for (size_t i = 0; i < in->get_number_of_elements(); i++) {
+      o[i] = in->get_data_ptr()[it.get_current_idx()];
+      it.advance();
+    }
+  }
+   
+  /**
+   * Expands an array by one trailing dimension.
+   * The result has the dimensions of `in` followed by `new_dim_size` and holds
+   * `new_dim_size` consecutive copies of the input data.
+   * @param in Input array; must not be null.
+   * @param new_dim_size Size of the appended dimension.
+   * @throws std::runtime_error if `in` is null.
+   */
+  template<class T> boost::shared_ptr<hoNDArray<T> > 
+  expand(hoNDArray<T> *in, size_t new_dim_size )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("expand(): illegal input pointer.");
+    }
+      
+    const size_t number_of_elements_in = in->get_number_of_elements();    
+
+    std::vector<size_t> out_dims( *in->get_dimensions() ); 
+    out_dims.push_back(new_dim_size);
+
+    boost::shared_ptr< hoNDArray<T> > result(new hoNDArray<T>(&out_dims));
+    hoNDArray<T>& expanded = *result;
+      
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long int idx=0; idx<number_of_elements_in*new_dim_size; idx++ ){
+      // The input is repeated cyclically along the new dimension.
+      const size_t source_idx = idx%number_of_elements_in;
+      expanded[idx] = in->at(source_idx);
+    }
+    return result;
+  }
+  
+  /**
+   * Sums an array over one of its dimensions.
+   * The result has the dimensions of `in` with dimension `dim` removed. Elements are
+   * addressed with the first dimension varying fastest.
+   * @param in Input array; must not be null and must have more than one dimension.
+   * @param dim Index of the dimension to sum over.
+   * @throws std::runtime_error if `in` is null, has fewer than two dimensions, or `dim`
+   *         is out of range.
+   */
+  template<class T> boost::shared_ptr<hoNDArray<T> > 
+  sum(hoNDArray<T> *in, size_t dim )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("sum(): illegal input pointer.");
+    }
+
+    if( !(in->get_number_of_dimensions()>1) ){
+      throw std::runtime_error("sum(): underdimensioned.");
+    }
+
+    if( dim > in->get_number_of_dimensions()-1 ){
+      throw std::runtime_error( "sum(): dimension out of range.");
+    }
+
+    const size_t number_of_batches = in->get_size(dim);
+    const size_t number_of_elements = in->get_number_of_elements()/number_of_batches;
+
+    // Distance (in elements) between neighbours along `dim`: the product of all lower dimensions.
+    size_t stride = 1;
+    for( size_t d=0; d<dim; d++ ){
+      stride *= in->get_size(d);
+    }
+
+    // Remove the summed dimension itself. Dropping the last dimension regardless of
+    // `dim` gave the wrong output shape (and out-of-bounds accesses) unless `dim` was last.
+    std::vector<size_t> dims = *in->get_dimensions();
+    dims.erase(dims.begin()+dim);
+
+    boost::shared_ptr< hoNDArray<T> > out(new hoNDArray<T>());
+    out->create(&dims);
+
+    const T *in_ptr = in->get_data_ptr();
+    T *out_ptr = out->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx<(long long)number_of_elements; idx++ ){
+      // Split the output offset into the part below `dim` (inner) and above it (outer).
+      const size_t inner = ((size_t)idx)%stride;
+      const size_t outer = ((size_t)idx)/stride;
+      const size_t base = outer*stride*number_of_batches + inner;
+
+      T val(0);
+      for( size_t j=0; j<number_of_batches; j++ ){
+        val += in_ptr[base + j*stride];
+      }
+      out_ptr[idx] = val;       
+    }
+    return out;
+  } 
+
+  /**
+   * Extracts a D-dimensional window from every batch of `in`.
+   * The first D dimensions of the result are `crop_size`; any further dimensions of
+   * `in` are kept and treated as batches that are cropped independently.
+   * @param[in] crop_offset Position of the window inside the first D dimensions of `in`.
+   * @param[in] crop_size Size of the window.
+   * @param[in] in Input array; must not be null and must have at least D dimensions.
+   * @returns New array holding the cropped data.
+   * @throws std::runtime_error if `in` is null, has fewer than D dimensions, or the
+   *         weak_greater() size test fails.
+   */
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  crop( const vector_td<size_t, D>& crop_offset, const vector_td<size_t, D>& crop_size, hoNDArray<T> *in )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("crop: 0x0 array provided");;
+    }
+
+    if( in->get_number_of_dimensions() < D ){
+      std::stringstream ss;
+      ss << "crop: number of image dimensions should be at least " << D;
+      throw std::runtime_error(ss.str());;
+    }
+
+    // Output dimensions: the crop size followed by the batch dimensions of the input
+    std::vector<size_t> dims = to_std_vector(crop_size);
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      dims.push_back(in->get_size(d));
+    }
+    boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>(&dims) );
+
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    // Number of D-dimensional frames: product of all dimensions from index D upwards
+    size_t num_batches = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      num_batches *= in->get_size(d);
+    }
+
+    // presumably true when the window extends past the input in some dimension;
+    // weak_greater() is defined outside this file - TODO confirm
+    if( weak_greater(crop_offset+matrix_size_out, matrix_size_in) ){
+      throw std::runtime_error( "crop: cropping size mismatch");;
+    }
+
+    const size_t num_elements_in = prod(matrix_size_in);
+    const size_t num_elements_out = prod(matrix_size_out);
+
+    T *in_ptr = in->get_data_ptr();
+    T *out_ptr = out->get_data_ptr();
+
+    for( size_t frame_offset=0; frame_offset<num_batches; frame_offset++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long idx=0; idx<(long long) num_elements_out; idx++ ){
+        // Output offset -> output coordinate -> input coordinate -> input offset
+        const typename uint64d<D>::Type co = idx_to_co<D>( idx, matrix_size_out );
+        const typename uint64d<D>::Type co_os = crop_offset + co;
+        const size_t in_idx = co_to_idx<D>(co_os, matrix_size_in)+frame_offset*num_elements_in;
+        out_ptr[idx+frame_offset*num_elements_out] = in_ptr[in_idx];
+      }
+    }
+    return out;
+  }    
+
+  /**
+   * Pads the first D dimensions of an array; any further dimensions of `in` are kept
+   * and treated as batches that are padded independently.
+   * @param[in] size Size of the output array
+   * @param[in] in Input array; must not be null and must have at least D dimensions
+   * @param[in] val Value to use for padding
+   * @returns New array of the specified size, containing the original input array in the center and val outside.
+   * @throws std::runtime_error if `in` is null, has fewer than D dimensions, or the
+   *         weak_greater() size test fails.
+   */
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  pad( const typename uint64d<D>::Type& size, hoNDArray<T> *in, T val = T(0) )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("pad: 0x0 array provided");;
+    }
+
+    if( in->get_number_of_dimensions() < D ){
+      std::stringstream ss;
+      ss << "pad: number of image dimensions should be at least " << D;
+      throw std::runtime_error(ss.str());;
+    }
+
+    // Output dimensions: the requested size followed by the batch dimensions of the input
+    std::vector<size_t> dims = to_std_vector(size);
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      dims.push_back(in->get_size(d));
+    }
+    boost::shared_ptr< hoNDArray<T> > out( new hoNDArray<T>(&dims) );
+
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    // Number of D-dimensional frames: product of all dimensions from index D upwards
+    size_t num_batches = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      num_batches *= in->get_size(d);
+    }
+
+    // presumably true when the input is larger than the output in some dimension;
+    // weak_greater() is defined outside this file - TODO confirm
+    if( weak_greater(matrix_size_in,matrix_size_out) ){
+      throw std::runtime_error("pad: size mismatch, cannot expand");
+    }
+
+    const size_t num_elements_in = prod(matrix_size_in);
+    const size_t num_elements_out = prod(matrix_size_out);
+    // Half of the size difference: where the input starts inside the output
+    const typename uint64d<D>::Type offset = (matrix_size_out-matrix_size_in)>>1;
+
+    T *in_ptr = in->get_data_ptr();
+    T *out_ptr = out->get_data_ptr();
+
+    for( size_t frame_offset=0; frame_offset<num_batches; frame_offset++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long idx=0; idx<(long long)num_elements_out; idx++ ){
+        const typename uint64d<D>::Type co_out = idx_to_co<D>( idx, matrix_size_out );
+        T _out;
+        // Output coordinates within [offset, offset+matrix_size_in) map onto the input
+        bool inside = (co_out>=offset) && (co_out<(matrix_size_in+offset));
+
+        if( inside )
+          _out = in_ptr[co_to_idx<D>( co_out-offset, matrix_size_in)+frame_offset*num_elements_in];
+        else{
+          _out = val;
+        }
+        out_ptr[idx+frame_offset*num_elements_out] = _out;
+      }
+    }
+    return out;
+  }
+
+  /**
+   * Swaps the first two dimensions of `x` and stores the result in `r`.
+   * A one-dimensional input is copied unchanged. `r` is (re)created when its
+   * dimensions differ from the permuted dimensions of `x`.
+   * @param x Input array.
+   * @param r Output array.
+   * @return true on success, false if an exception was caught (an error message is logged).
+   */
+  template<typename T> 
+  bool permuteFirstTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r)
+  {
+    try
+      {
+        size_t NDim = x.get_number_of_dimensions();
+        if ( NDim == 1 )
+          {
+            r = x;
+            return true;
+          }
+
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t numOfPermute =  x.get_number_of_elements()/(RO*E1);
+
+        std::vector<size_t> dimR(NDim);
+        dimR = *dimX;
+        dimR[0] = E1;
+        dimR[1] = RO;
+
+        // Resize the output only when its shape is wrong. The test used to be inverted,
+        // so a mismatching `r` was never resized and was written out of bounds.
+        if ( !r.dimensions_equal(&dimR) )
+          {
+            r.create(dimR);
+          }
+
+        int n;
+
+#pragma omp parallel for default(none) private(n) shared(RO, E1, numOfPermute, x, r)
+        for ( n=0; n<(int)numOfPermute; n++ )
+          {
+            // Each RO x E1 slice is transposed independently.
+            const T* pX = x.begin() + n*RO*E1;
+            T* pR = r.begin() + n*RO*E1;
+
+            for ( size_t e=0; e<E1; e++ )
+              {
+                for ( size_t ro=0; ro<RO; ro++ )
+                  {
+                    pR[e+ro*E1] = pX[ro+e*RO];
+                  }
+              }
+          }
+      }
+    catch (...)
+      {
+        GADGET_ERROR_MSG("Errors in permuteFirstTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+        return false;
+      }
+    return true;
+  }
+
+  /**
+   * Swaps the last two dimensions of `x` and stores the result in `r`.
+   * A one-dimensional input is copied unchanged. `r` is (re)created when its
+   * dimensions differ from the permuted dimensions of `x`.
+   * @param x Input array.
+   * @param r Output array.
+   * @return true on success, false if an exception was caught (an error message is logged).
+   */
+  template<typename T> 
+  bool permuteLastTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r)
+  {
+    try
+      {
+        size_t NDim = x.get_number_of_dimensions();
+        if ( NDim == 1 )
+          {
+            r = x;
+            return true;
+          }
+
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        size_t lastDim = x.get_size(NDim-1);
+        size_t secondLastDim = x.get_size(NDim-2);
+        // N = number of elements spanned by all dimensions below the last two
+        size_t N =  x.get_number_of_elements()/(lastDim*secondLastDim);
+
+        std::vector<size_t> dimR(NDim);
+        dimR = *dimX;
+        dimR[NDim-2] = lastDim;
+        dimR[NDim-1] = secondLastDim;
+
+        if ( !r.dimensions_equal(&dimR) )
+          {
+            r.create(dimR);
+          }
+
+        int l;
+
+#ifdef GCC_OLD_FLAG
+#pragma omp parallel for default(none) private(l) shared(lastDim, secondLastDim, N)
+#else
+#pragma omp parallel for default(none) private(l) shared(lastDim, secondLastDim, x, r, N)
+#endif
+        for ( l=0; l<(int)lastDim; l++ )
+          {
+            for ( size_t sl=0; sl<secondLastDim; sl++ )
+              {
+                // Block (sl, l) of the input becomes block (l, sl) of the output;
+                // each block is N contiguous elements.
+                const T* pX = x.begin() + sl*N + l*N*secondLastDim;
+                T* pR = r.begin() + l*N + sl*N*lastDim;
+                memcpy(pR, pX, sizeof(T)*N);
+              }
+          }
+      }
+    catch (...)
+      {
+        GADGET_ERROR_MSG("Errors in permuteLastTwoDimensions(const hoNDArray<T>& x, hoNDArray<T>& r) ... ");
+        return false;
+      }
+    return true;
+  }
+
+  /**
+   * Copies the sub array x(:, indLastDim) to all other positions of the last dimension.
+   * @param x Array modified in place.
+   * @param indLastDim Index along the last dimension of the sub array to replicate;
+   *                   must be smaller than the size of the last dimension.
+   * @return true on success; false if the index check fails or an exception was caught
+   *         (an error message is logged in the latter case).
+   */
+  template<typename T> 
+  bool repmatLastDimension(hoNDArray<T>& x, size_t indLastDim)
+  {
+    try
+      {
+        size_t NDim = x.get_number_of_dimensions();
+        size_t lastDim = x.get_size(NDim-1);
+        GADGET_CHECK_RETURN_FALSE( indLastDim < lastDim );
+
+        std::vector<size_t> ind(NDim, 0);
+        ind[NDim-1] = indLastDim;
+        // Offsets are kept as size_t; an int would overflow for large arrays.
+        size_t offsetIndLastDim = x.calculate_offset(ind);
+
+        size_t N = x.get_number_of_elements() / lastDim;
+
+        int l;
+#ifdef GCC_OLD_FLAG
+#pragma omp parallel for default(none) private(l) shared(lastDim, offsetIndLastDim, ind, indLastDim, N, NDim)
+#else
+#pragma omp parallel for default(none) private(l) shared(lastDim, offsetIndLastDim, x, ind, indLastDim, N, NDim)
+#endif
+        for ( l=0; l<(int)lastDim; l++ )
+          {
+            if ( (size_t)l==indLastDim ) continue;
+
+            // `ind` is shared between the threads and is only read here. Each iteration
+            // works on its own copy; writing the shared vector was a data race that
+            // could yield wrong destination offsets.
+            std::vector<size_t> indCur(ind);
+            indCur[NDim-1] = l;
+            size_t offsetInd = x.calculate_offset(indCur);
+
+            memcpy(x.begin()+offsetInd, x.begin()+offsetIndLastDim, sizeof(T)*N);
+          }
+      }
+    catch (...)
+      {
+        GADGET_ERROR_MSG("Errors in repmatLastDimension(hoNDArray<T>& x, size_t indLastDim) ... ");
+        return false;
+      }
+    return true;
+  }
+
+}
diff --git a/toolboxes/core/cpu/hoNDFFT.cpp b/toolboxes/core/cpu/hoNDFFT.cpp
new file mode 100644
index 0000000..3a0f431
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDFFT.cpp
@@ -0,0 +1,1713 @@
+/*
+ * hoNDFFT.cpp
+ *
+ *  Created on: Nov 29, 2011
+ *      Author: hansenms
+ */
+
+#include "hoNDFFT.h"
+#include "hoMatrix.h"
+
+namespace Gadgetron{
+
+  /// Returns the singleton hoNDFFT<T>, allocating it on the first call.
+  /// The check-and-create step is not synchronized.
+  template<typename T> hoNDFFT<T>* hoNDFFT<T>::instance()
+  {
+    if (instance_ == NULL)
+      {
+        instance_ = new hoNDFFT<T>();
+      }
+    return instance_;
+  }
+  
+  // Definition of the static singleton pointer handed out by instance(); starts out NULL.
+  template<class T> hoNDFFT<T>* hoNDFFT<T>::instance_ = NULL;
+
+  /// In-place 1D complex FFT of *input along dimension dim_to_transform.
+  ///
+  /// sign is passed to the FFTW planner and must be -1 or +1; any other value, or a
+  /// dim_to_transform outside the array's dimensions, makes the call a silent no-op.
+  /// For sign == 1 the result is scaled by 1/length, for sign == -1 it is not scaled.
+  /// Every 1D line is copied into a scratch buffer with a circular rotation of
+  /// ceil(length/2) samples, transformed, and written back through the same index mapping.
+  /// Buffer allocation, plan creation and their release run under mutex_; the
+  /// transforms themselves run outside the lock.
+  template<class T> void hoNDFFT<T>::fft_int(hoNDArray< std::complex<T> >* input, size_t dim_to_transform, int sign)
+  {
+    if (sign != -1 && sign != 1) return;
+    if (dim_to_transform >= input->get_number_of_dimensions()) return;
+
+    int stride     = 1;           //Distance between points in transform
+    int dist       = 1;           //Distance between vectors
+    int trafos     = 1;           //Transformations per chunk
+    int chunks     = 1;           //Number of chunks
+    int chunk_size = 1;           //Points per chunk
+    int length     = 1;           //Length of each transform
+    int total_dist = 1;           //Extent covered by all transforms of one chunk
+
+    T scale = 0.0;
+
+    void* fft_plan        = 0;
+    T*    fft_storage     = 0;
+
+    T* fft_buffer = 0;
+    T* data_ptr = 0;
+
+    //Set sizes
+    length = input->get_size(dim_to_transform);
+
+    if (sign == 1)
+      {
+        scale = 1.0/length;
+      }
+    else
+      {
+        scale = 1.0;
+      }
+
+    if (dim_to_transform != 0)
+      {
+        //Lines along a higher dimension are strided by the product of the lower dimensions
+        for (size_t i = 0; i < dim_to_transform; i++)
+          {
+            chunk_size *= input->get_size(i);
+          }
+        stride = chunk_size;
+        trafos = chunk_size;
+        chunk_size *= length;
+
+        for (size_t i = dim_to_transform+1; i < input->get_number_of_dimensions(); i++)
+          {
+            chunks *= input->get_size(i);
+          }
+      }
+    else
+      {
+        //Lines along dimension 0 are contiguous; the whole array is one chunk
+        for (size_t i = 1; i < input->get_number_of_dimensions(); i++)
+          {
+            trafos *= input->get_size(i);
+          }
+        chunk_size = trafos*length;
+
+        dist = length;
+      }
+
+    //*2 real and imag
+    chunk_size *= 2;
+    dist *= 2;
+    total_dist = trafos*dist;
+
+
+    //Allocate storage and make plan
+    {
+      mutex_.lock();
+      fft_storage = (T*)fftw_malloc_ptr_(sizeof(T)*length*2);
+      if (fft_storage == 0)
+        {
+          //Release the lock before bailing out, otherwise every later call blocks
+          mutex_.unlock();
+          std::cout << "Failed to allocate buffer for FFT" << std::endl;
+          return;
+        }
+      fft_buffer = (T*)fft_storage;
+
+      unsigned planner_flags = FFTW_MEASURE | FFTW_DESTROY_INPUT;
+
+      fft_plan = fftw_plan_dft_1d_ptr_(length, fft_storage, fft_storage, sign, planner_flags);
+
+      if (fft_plan == 0)
+        {
+          fftw_free_ptr_(fft_storage);
+          //Release the lock before bailing out, otherwise every later call blocks
+          mutex_.unlock();
+          std::cout << "Failed to create plan for FFT" << std::endl;
+          return;
+        }
+      mutex_.unlock();
+    }
+
+    //Grab address of data
+    data_ptr = reinterpret_cast<T*>(input->get_data_ptr());
+
+    int idx1_max = chunks*chunk_size;
+    int idx1, idx2;       //Index variables
+    int idx2_limit;
+    int middle_point = ((length+1)>>1)<<1;   //2*ceil(length/2): rotation offset in units of T
+    int length2 = length<<1;
+    int stride2 = stride<<1;
+
+    for (idx1 = 0; idx1 < idx1_max; idx1+=chunk_size) //Loop over all chunks
+      {
+        idx2_limit = idx1+total_dist;
+        for (idx2 = idx1; idx2 < idx2_limit; idx2+=dist) //Loop over all transformations
+          {
+            ///Copy data to buffer.
+            {
+              int j, idx3 = idx2;
+              //First part of the line fills the buffer from middle_point to the end
+              for (j = middle_point; j < length2; idx3+=stride2)
+                {
+                  fft_buffer[j++] = data_ptr[idx3  ];
+                  fft_buffer[j++] = data_ptr[idx3+1];
+                }
+              //Remainder of the line fills the start of the buffer
+              for (j = 0; j < middle_point; idx3+=stride2)
+                {
+                  fft_buffer[j++] = data_ptr[idx3  ];
+                  fft_buffer[j++] = data_ptr[idx3+1];
+                }
+            }
+
+            fftw_execute_ptr_(fft_plan);
+
+            ///Copy the scaled result back using the same index mapping.
+            {
+              int j, idx3 = idx2;
+
+              for (j = middle_point; j < length2; idx3+=stride2)
+                {
+                  data_ptr[idx3  ] = fft_buffer[j++]*scale;
+                  data_ptr[idx3+1] = fft_buffer[j++]*scale;
+                }
+              for (j = 0; j < middle_point; idx3+=stride2)
+                {
+                  data_ptr[idx3  ] = fft_buffer[j++]*scale;
+                  data_ptr[idx3+1] = fft_buffer[j++]*scale;
+                }
+            }
+
+          } //Loop over transformations
+      } //Loop over chunks
+
+    //clean up
+    {
+      mutex_.lock();
+      if (fft_plan != 0)
+        {
+          fftw_destroy_plan_ptr_(fft_plan);
+        }
+
+      if (fft_storage != 0)
+        {
+          fftw_free_ptr_(fft_storage);
+        }
+      mutex_.unlock();
+    }
+  }
+  
+  /// Binds the FFTW function-pointer members of hoNDFFT<float> to the single-precision (fftwf_*) entry points.
+  template<> void hoNDFFT<float>::set_function_pointers()
+  {
+    // memory management and library cleanup
+    fftw_malloc_ptr_ = &fftwf_malloc;
+    fftw_free_ptr_ = &fftwf_free;
+    fftw_cleanup_ptr_ = &fftwf_cleanup;
+
+    // wisdom import/export
+    fftw_import_wisdom_from_file_ptr_ = &fftwf_import_wisdom_from_file;
+    fftw_export_wisdom_to_file_ptr_ = &fftwf_export_wisdom_to_file;
+
+    // plan handling; the casts convert to void*-based function pointer types
+    fftw_plan_dft_1d_ptr_ = reinterpret_cast<void* (*)(int, void*, void*, int, unsigned)>(&fftwf_plan_dft_1d);
+    fftw_execute_ptr_ = reinterpret_cast<void (*)(void*)>(&fftwf_execute);
+    fftw_destroy_plan_ptr_ = reinterpret_cast<void (*)(void*)>(&fftwf_destroy_plan);
+  }
+
+  /// Binds the FFTW function-pointer members of hoNDFFT<double> to the double-precision (fftw_*) entry points.
+  template<> void hoNDFFT<double>::set_function_pointers()
+  {
+    // memory management and library cleanup
+    fftw_malloc_ptr_ = &fftw_malloc;
+    fftw_free_ptr_ = &fftw_free;
+    fftw_cleanup_ptr_ = &fftw_cleanup;
+
+    // wisdom import/export
+    fftw_import_wisdom_from_file_ptr_ = &fftw_import_wisdom_from_file;
+    fftw_export_wisdom_to_file_ptr_ = &fftw_export_wisdom_to_file;
+
+    // plan handling; the casts convert to void*-based function pointer types
+    fftw_plan_dft_1d_ptr_ = reinterpret_cast<void* (*)(int, void*, void*, int, unsigned)>(&fftw_plan_dft_1d);
+    fftw_execute_ptr_ = reinterpret_cast<void (*)(void*)>(&fftw_execute);
+    fftw_destroy_plan_ptr_ = reinterpret_cast<void (*)(void*)>(&fftw_destroy_plan);
+  }
+
+  /// Returns ceil(x/2), the pivot the fftshift routines use for an axis of length x.
+  template<typename T> 
+  inline size_t hoNDFFT<T>::fftshiftPivot(size_t x)
+  {
+    const double half = x*0.5;
+    return (size_t)(ceil(half));
+  }
+
+  /// Returns floor(x/2), the pivot the ifftshift routines use for an axis of length x.
+  template<typename T> 
+  inline size_t hoNDFFT<T>::ifftshiftPivot(size_t x)
+  {
+    const double half = x*0.5;
+    return (size_t)(floor(half));
+  }
+
+  /// Rotates one line of x elements from a into r:
+  /// r[0, x-pivot) receives a[pivot, x) and r[x-pivot, x) receives a[0, pivot).
+  /// Uses memcpy, so a and r must not overlap. Returns false only when an exception is caught.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot)
+  {
+    try
+      {
+        // number of elements from the pivot to the end of the line
+        const size_t tail = x-pivot;
+
+        memcpy(r, a+pivot, sizeof(ComplexType)*tail);
+        memcpy(r+tail, a, sizeof(ComplexType)*pivot);
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Same rotation as the pointer-based fftshift1D; only the pivot supplied by the caller differs.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot)
+  {
+    const bool ok = fftshift1D(a, r, x, pivot);
+    return ok;
+  }
+
+  /// In-place rotation of n consecutive lines of length x stored in a, each by the given pivot.
+  /// Every line is rotated into a scratch line and copied back; lines are distributed over OpenMP threads.
+  /// The result of the per-line fftshift1D call is not inspected. Returns false only when an exception is caught.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshiftPivot1D(ComplexType* a, size_t x, size_t n, size_t pivot)
+  {
+    try
+      {
+        long long line;
+
+#pragma omp parallel private(line) shared(n, x, pivot, a)
+        {
+          // declared inside the parallel region, so each thread works on its own scratch line
+          hoNDArray< ComplexType > scratch(x);
+
+#pragma omp for
+          for ( line=0; line<(long long)n; line++ )
+            {
+              ComplexType* cur = a+line*x;
+
+              fftshift1D(cur, scratch.begin(), x, pivot);
+              memcpy(cur, scratch.begin(), sizeof(ComplexType)*x);
+            }
+        }
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshiftPivot1D(ComplexType* a, size_t x, size_t n, size_t pivot) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place rotation of n consecutive lines of length x: line i of a is rotated by pivot into line i of r.
+  /// Lines are distributed over OpenMP threads. The result of the per-line fftshift1D call is not inspected.
+  /// Returns false only when an exception is caught.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshiftPivot1D(const ComplexType* a, ComplexType* r, size_t x, size_t n, size_t pivot)
+  {
+    try
+      {
+        long long line;
+
+#pragma omp parallel for private(line) shared(n, x, pivot, a, r)
+        for ( line=0; line<(long long)n; line++ )
+          {
+            const ComplexType* src = a+line*x;
+            ComplexType* dst = r+line*x;
+
+            fftshift1D(src, dst, x, pivot);
+          }
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshiftPivot1D(const ComplexType* a, ComplexType* r, size_t x, size_t n, size_t pivot) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place fftshift along the first dimension of a; all other dimensions are treated as a batch of lines.
+  /// Uses the pivot fftshiftPivot(size of dimension 0). Returns true on success; a failed
+  /// fftshiftPivot1D check or a caught exception yields false.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshift1D(hoNDArray< ComplexType >& a)
+  {
+    try
+      {
+        size_t x = a.get_size(0);
+        size_t pivot = fftshiftPivot(x);
+        // number of lines of length x contained in the array
+        size_t numOfShifts = a.get_number_of_elements()/x;
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot1D(a.begin(), x, numOfShifts, pivot));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift1D(hoNDArray< ComplexType >& a) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place fftshift along the first dimension: r receives the shifted copy of a.
+  /// r is re-created with the dimensions of a when they differ. Uses the pivot
+  /// fftshiftPivot(size of dimension 0). Returns true on success; a failed
+  /// fftshiftPivot1D check or a caught exception yields false.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    try
+      {
+        if ( !r.dimensions_equal(&a) )
+          {
+            // only the shape is needed: every element of r is overwritten below,
+            // so a deep copy of a would be wasted work
+            r.create(a.get_dimensions());
+          }
+
+        size_t x = a.get_size(0);
+        size_t pivot = fftshiftPivot(x);
+        // number of lines of length x contained in the array
+        size_t numOfShifts = a.get_number_of_elements()/x;
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot1D(a.begin(), r.begin(), x, numOfShifts, pivot));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place ifftshift along the first dimension of a; all other dimensions are treated as a batch of lines.
+  /// Uses the pivot ifftshiftPivot(size of dimension 0). Returns true on success; a failed
+  /// fftshiftPivot1D check or a caught exception yields false.
+  template<typename T> 
+  bool hoNDFFT<T>::ifftshift1D(hoNDArray< ComplexType >& a)
+  {
+    try
+      {
+        size_t x = a.get_size(0);
+        size_t pivot = ifftshiftPivot(x);
+        // number of lines of length x contained in the array
+        size_t numOfShifts = a.get_number_of_elements()/x;
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot1D(a.begin(), x, numOfShifts, pivot));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::ifftshift1D(hoNDArray< ComplexType >& a) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place ifftshift along the first dimension: r receives the shifted copy of a.
+  /// r is re-created with the dimensions of a when they differ. Uses the pivot
+  /// ifftshiftPivot(size of dimension 0). Returns true on success; a failed
+  /// fftshiftPivot1D check or a caught exception yields false.
+  template<typename T> 
+  bool hoNDFFT<T>::ifftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    try
+      {
+        if ( !r.dimensions_equal(&a) )
+          {
+            // only the shape is needed: every element of r is overwritten below,
+            // so a deep copy of a would be wasted work
+            r.create(a.get_dimensions());
+          }
+
+        size_t x = a.get_size(0);
+        size_t pivot = ifftshiftPivot(x);
+        // number of lines of length x contained in the array
+        size_t numOfShifts = a.get_number_of_elements()/x;
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot1D(a.begin(), r.begin(), x, numOfShifts, pivot));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::ifftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place 2D circular shift of n consecutive x-by-y slices from a into r.
+  /// Within a slice, rows [pivoty, y) move to the front and rows [0, pivoty) to the back; every row is
+  /// rotated so that elements [pivotx, x) come first. pivotx <= x and pivoty <= y are assumed, not checked.
+  /// Slices are processed in parallel when n > 1. Returns false on a NULL pointer check failure or a caught exception.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshiftPivot2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n, unsigned pivotx, unsigned pivoty)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+        GADGET_CHECK_RETURN_FALSE(r!=NULL);
+
+        long long slice;
+
+#pragma omp parallel for private(slice) shared(a, r, x, y, n, pivotx, pivoty) if (n>1)
+        for ( slice=0; slice<(long long)n; slice++ )
+          {
+            const ComplexType* src = a + slice*x*y;
+            ComplexType* dst = r + slice*x*y;
+
+            // byte counts of the two pieces every row is split into
+            const size_t tailBytes = sizeof(ComplexType)*(x-pivotx);
+            const size_t headBytes = sizeof(ComplexType)*pivotx;
+
+            size_t row;
+
+            // rows at or beyond the pivot go to the front of the output slice
+            for ( row=pivoty; row<y; row++ )
+              {
+                ComplexType* out = dst + (row - pivoty)*x;
+                memcpy(out, src+row*x+pivotx, tailBytes);
+                memcpy(out+x-pivotx, src+row*x, headBytes);
+              }
+
+            // rows before the pivot wrap around to the back
+            for ( row=0; row<pivoty; row++ )
+              {
+                ComplexType* out = dst + (row + y - pivoty)*x;
+                memcpy(out, src+row*x+pivotx, tailBytes);
+                memcpy(out+x-pivotx, src+row*x, headBytes);
+              }
+          }
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshiftPivot2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n, unsigned pivotx, unsigned pivoty) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 2D circular shift of n consecutive x-by-y slices stored in a.
+  /// Each slice is shifted into a scratch slice (rows [pivoty, y) first, every row rotated so that
+  /// elements [pivotx, x) come first) and then copied back. pivotx <= x and pivoty <= y are assumed, not checked.
+  /// Slices are processed in parallel when n > 1. Returns false on a NULL pointer check failure or a caught exception.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshiftPivot2D(ComplexType* a, size_t x, size_t y, size_t n, unsigned pivotx, unsigned pivoty)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+
+        long long slice;
+
+#pragma omp parallel private(slice) shared(a, x, y, n, pivotx, pivoty) if (n>1)
+        {
+          // declared inside the parallel region, so each thread works on its own scratch slice
+          hoNDArray< ComplexType > scratch(x*y);
+          ComplexType* dst = scratch.begin();
+
+#pragma omp for
+          for ( slice=0; slice<(long long)n; slice++ )
+            {
+              ComplexType* src = a + slice*x*y;
+
+              // byte counts of the two pieces every row is split into
+              const size_t tailBytes = sizeof(ComplexType)*(x-pivotx);
+              const size_t headBytes = sizeof(ComplexType)*pivotx;
+
+              size_t row;
+
+              // rows at or beyond the pivot go to the front of the scratch slice
+              for ( row=pivoty; row<y; row++ )
+                {
+                  ComplexType* out = dst + (row - pivoty)*x;
+                  memcpy(out, src+row*x+pivotx, tailBytes);
+                  memcpy(out+x-pivotx, src+row*x, headBytes);
+                }
+
+              // rows before the pivot wrap around to the back
+              for ( row=0; row<pivoty; row++ )
+                {
+                  ComplexType* out = dst + (row + y - pivoty)*x;
+                  memcpy(out, src+row*x+pivotx, tailBytes);
+                  memcpy(out+x-pivotx, src+row*x, headBytes);
+                }
+
+              // write the shifted slice back over the input
+              memcpy(src, dst, sizeof(ComplexType)*x*y);
+            }
+        }
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshiftPivot2D(ComplexType* a, size_t x, size_t y, size_t n, unsigned pivotx, unsigned pivoty) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place 2D fftshift of n consecutive x-by-y slices from a into r,
+  /// using the pivots fftshiftPivot(x) and fftshiftPivot(y).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject NULL pointers and a failed fftshiftPivot2D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+        GADGET_CHECK_RETURN_FALSE(r!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = fftshiftPivot(x);
+        unsigned pivoty = fftshiftPivot(y);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot2D(a, r, x, y, n, pivotx, pivoty));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place 2D ifftshift of n consecutive x-by-y slices from a into r,
+  /// using the pivots ifftshiftPivot(x) and ifftshiftPivot(y).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject NULL pointers and a failed fftshiftPivot2D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+        GADGET_CHECK_RETURN_FALSE(r!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = ifftshiftPivot(x);
+        unsigned pivoty = ifftshiftPivot(y);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot2D(a, r, x, y, n, pivotx, pivoty));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::ifftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 2D fftshift of n consecutive x-by-y slices stored in a,
+  /// using the pivots fftshiftPivot(x) and fftshiftPivot(y).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject a NULL pointer and a failed fftshiftPivot2D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift2D(ComplexType* a, size_t x, size_t y, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = fftshiftPivot(x);
+        unsigned pivoty = fftshiftPivot(y);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot2D(a, x, y, n, pivotx, pivoty));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift2D(ComplexType* a, size_t x, size_t y, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 2D ifftshift of n consecutive x-by-y slices stored in a,
+  /// using the pivots ifftshiftPivot(x) and ifftshiftPivot(y).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject a NULL pointer and a failed fftshiftPivot2D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift2D(ComplexType* a, size_t x, size_t y, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = ifftshiftPivot(x);
+        unsigned pivoty = ifftshiftPivot(y);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot2D(a, x, y, n, pivotx, pivoty));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::ifftshift2D(ComplexType* a, size_t x, size_t y, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 2D fftshift over the first two dimensions of a; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift2D(hoNDArray< ComplexType >& a)
+  {
+    const size_t sx = a.get_size(0);
+    const size_t sy = a.get_size(1);
+
+    // number of sx-by-sy slices contained in the array
+    const size_t slices = a.get_number_of_elements()/(sx*sy);
+
+    return fftshift2D(a.begin(), sx, sy, slices);
+  }
+
+  /// Out-of-place 2D fftshift over the first two dimensions: r receives the shifted copy of a.
+  /// r is re-created with the dimensions of a when they differ; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    if ( !r.dimensions_equal(&a) )
+      {
+        // only the shape is needed: the shift overwrites every element of r,
+        // so a deep copy of a would be wasted work
+        r.create(a.get_dimensions());
+      }
+
+    // number of 2D slices contained in the array
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+    return fftshift2D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), n);
+  }
+
+  /// In-place 2D ifftshift over the first two dimensions of a; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift2D(hoNDArray< ComplexType >& a)
+  {
+    const size_t sx = a.get_size(0);
+    const size_t sy = a.get_size(1);
+
+    // number of sx-by-sy slices contained in the array
+    const size_t slices = a.get_number_of_elements()/(sx*sy);
+
+    return ifftshift2D(a.begin(), sx, sy, slices);
+  }
+
+  /// Out-of-place 2D ifftshift over the first two dimensions: r receives the shifted copy of a.
+  /// r is re-created with the dimensions of a when they differ; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    if ( !r.dimensions_equal(&a) )
+      {
+        // only the shape is needed: the shift overwrites every element of r,
+        // so a deep copy of a would be wasted work
+        r.create(a.get_dimensions());
+      }
+
+    // number of 2D slices contained in the array
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+    return ifftshift2D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), n);
+  }
+
+  /// Out-of-place 3D circular shift of n consecutive x-by-y-by-z volumes from a into r.
+  /// Along z and y, indices at or beyond the pivot move to the front and the others wrap to the back;
+  /// every row is rotated so that elements [pivotx, x) come first.
+  /// pivotx <= x, pivoty <= y and pivotz <= z are assumed, not checked.
+  /// Volumes are processed in parallel when n > 1. Returns false on a NULL pointer check failure or a caught exception.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshiftPivot3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n, unsigned pivotx, unsigned pivoty,  unsigned pivotz)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+        GADGET_CHECK_RETURN_FALSE(r!=NULL);
+
+        long long vol;
+
+#pragma omp parallel for private(vol) shared(a, r, x, y, z, n, pivotx, pivoty, pivotz) if (n>1)
+        for ( vol=0; vol<(long long)n; vol++ )
+          {
+            // byte counts of the two pieces every row is split into
+            const size_t tailBytes = sizeof(ComplexType)*(x-pivotx);
+            const size_t headBytes = sizeof(ComplexType)*pivotx;
+
+            for ( size_t az=0; az<z; az++ )
+              {
+                // destination plane: shifted forward, or wrapped to the back
+                const size_t rz = (az>=pivotz) ? (az - pivotz) : (az + z - pivotz);
+
+                const ComplexType* src = a + vol*x*y*z + az*x*y;
+                ComplexType* dst = r + vol*x*y*z + rz*x*y;
+
+                for ( size_t ay=0; ay<y; ay++ )
+                  {
+                    // destination row within the plane, same rule as for planes
+                    const size_t ry = (ay>=pivoty) ? (ay - pivoty) : (ay + y - pivoty);
+
+                    memcpy(dst+ry*x, src+ay*x+pivotx, tailBytes);
+                    memcpy(dst+ry*x+x-pivotx, src+ay*x, headBytes);
+                  }
+              }
+          }
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshiftPivot3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n, unsigned pivotx, unsigned pivoty,  unsigned pivotz) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 3D circular shift of n consecutive x-by-y-by-z volumes stored in a.
+  /// Each volume is shifted into a scratch volume and then copied back. Along z and y, indices at or
+  /// beyond the pivot move to the front and the others wrap to the back; every row is rotated so that
+  /// elements [pivotx, x) come first. pivotx <= x, pivoty <= y and pivotz <= z are assumed, not checked.
+  /// Volumes are processed in parallel when n > 1. Returns false on a NULL pointer check failure or a caught exception.
+  template<typename T> 
+  bool hoNDFFT<T>::fftshiftPivot3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n, unsigned pivotx, unsigned pivoty,  unsigned pivotz)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+
+        long long vol;
+
+#pragma omp parallel private(vol) shared(a, x, y, z, n, pivotx, pivoty, pivotz) if (n>1)
+        {
+          // declared inside the parallel region, so each thread works on its own scratch volume
+          hoNDArray< ComplexType > scratch(x*y*z);
+
+#pragma omp for
+          for ( vol=0; vol<(long long)n; vol++ )
+            {
+              // byte counts of the two pieces every row is split into
+              const size_t tailBytes = sizeof(ComplexType)*(x-pivotx);
+              const size_t headBytes = sizeof(ComplexType)*pivotx;
+
+              for ( size_t az=0; az<z; az++ )
+                {
+                  // destination plane: shifted forward, or wrapped to the back
+                  const size_t rz = (az>=pivotz) ? (az - pivotz) : (az + z - pivotz);
+
+                  const ComplexType* src = a + vol*x*y*z + az*x*y;
+                  ComplexType* dst = scratch.begin() + rz*x*y;
+
+                  for ( size_t ay=0; ay<y; ay++ )
+                    {
+                      // destination row within the plane, same rule as for planes
+                      const size_t ry = (ay>=pivoty) ? (ay - pivoty) : (ay + y - pivoty);
+
+                      memcpy(dst+ry*x, src+ay*x+pivotx, tailBytes);
+                      memcpy(dst+ry*x+x-pivotx, src+ay*x, headBytes);
+                    }
+                }
+
+              // write the shifted volume back over the input
+              memcpy(a+vol*x*y*z, scratch.begin(), sizeof(ComplexType)*x*y*z);
+            }
+        }
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshiftPivot3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n, unsigned pivotx, unsigned pivoty,  unsigned pivotz) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place 3D fftshift of n consecutive x-by-y-by-z volumes from a into r,
+  /// using the pivots fftshiftPivot(x), fftshiftPivot(y) and fftshiftPivot(z).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject NULL pointers and a failed fftshiftPivot3D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+        GADGET_CHECK_RETURN_FALSE(r!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = fftshiftPivot(x);
+        unsigned pivoty = fftshiftPivot(y);
+        unsigned pivotz = fftshiftPivot(z);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot3D(a, r, x, y, z, n, pivotx, pivoty, pivotz));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// Out-of-place 3D ifftshift of n consecutive x-by-y-by-z volumes from a into r,
+  /// using the pivots ifftshiftPivot(x), ifftshiftPivot(y) and ifftshiftPivot(z).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject NULL pointers and a failed fftshiftPivot3D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+        GADGET_CHECK_RETURN_FALSE(r!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = ifftshiftPivot(x);
+        unsigned pivoty = ifftshiftPivot(y);
+        unsigned pivotz = ifftshiftPivot(z);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot3D(a, r, x, y, z, n, pivotx, pivoty, pivotz));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::ifftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 3D fftshift of n consecutive x-by-y-by-z volumes stored in a,
+  /// using the pivots fftshiftPivot(x), fftshiftPivot(y) and fftshiftPivot(z).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject a NULL pointer and a failed fftshiftPivot3D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = fftshiftPivot(x);
+        unsigned pivoty = fftshiftPivot(y);
+        unsigned pivotz = fftshiftPivot(z);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot3D(a, x, y, z, n, pivotx, pivoty, pivotz));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::fftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 3D ifftshift of n consecutive x-by-y-by-z volumes stored in a,
+  /// using the pivots ifftshiftPivot(x), ifftshiftPivot(y) and ifftshiftPivot(z).
+  /// The GADGET_CHECK_RETURN_FALSE guards reject a NULL pointer and a failed fftshiftPivot3D call;
+  /// a caught exception is logged and yields false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n)
+  {
+    try
+      {
+        GADGET_CHECK_RETURN_FALSE(a!=NULL);
+
+        // NOTE(review): the size_t pivots are narrowed to unsigned here
+        unsigned pivotx = ifftshiftPivot(x);
+        unsigned pivoty = ifftshiftPivot(y);
+        unsigned pivotz = ifftshiftPivot(z);
+
+        GADGET_CHECK_RETURN_FALSE(fftshiftPivot3D(a, x, y, z, n, pivotx, pivoty, pivotz));
+      }
+    catch(...)
+      {
+        GADGET_ERROR_MSG("Errors in hoNDFFT<T>::ifftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n) ...");
+        return false;
+      }
+
+    return true;
+  }
+
+  /// In-place 3D fftshift over the first three dimensions of a; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift3D(hoNDArray< ComplexType >& a)
+  {
+    const size_t sx = a.get_size(0);
+    const size_t sy = a.get_size(1);
+    const size_t sz = a.get_size(2);
+
+    // number of sx-by-sy-by-sz volumes contained in the array
+    const size_t volumes = a.get_number_of_elements()/(sx*sy*sz);
+
+    return fftshift3D(a.begin(), sx, sy, sz, volumes);
+  }
+
+  /// Out-of-place 3D fftshift over the first three dimensions: r receives the shifted copy of a.
+  /// r is re-created with the dimensions of a when they differ; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    if ( !r.dimensions_equal(&a) )
+      {
+        // only the shape is needed: the shift overwrites every element of r,
+        // so a deep copy of a would be wasted work
+        r.create(a.get_dimensions());
+      }
+
+    // number of 3D volumes contained in the array
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+    return fftshift3D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), a.get_size(2), n);
+  }
+
+  /// In-place 3D ifftshift over the first three dimensions of a; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift3D(hoNDArray< ComplexType >& a)
+  {
+    const size_t sx = a.get_size(0);
+    const size_t sy = a.get_size(1);
+    const size_t sz = a.get_size(2);
+
+    // number of sx-by-sy-by-sz volumes contained in the array
+    const size_t volumes = a.get_number_of_elements()/(sx*sy*sz);
+
+    return ifftshift3D(a.begin(), sx, sy, sz, volumes);
+  }
+
+  /// Out-of-place 3D ifftshift over the first three dimensions: r receives the shifted copy of a.
+  /// r is re-created with the dimensions of a when they differ; the remaining dimensions form the batch.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    if ( !r.dimensions_equal(&a) )
+      {
+        // only the shape is needed: the shift overwrites every element of r,
+        // so a deep copy of a would be wasted work
+        r.create(a.get_dimensions());
+      }
+
+    // number of 3D volumes contained in the array
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+    return ifftshift3D(a.begin(), r.begin(), a.get_size(0), a.get_size(1), a.get_size(2), n);
+  }
+
+  // -----------------------------------------------------------------------------------------
+
+  /// In-place 1D transform of a: forwards to the two-argument fft1 overload with the flag set to true
+  /// (ifft1 passes false; the overload itself is not visible here).
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a)
+  {
+    const bool flag = true;
+    return fft1(a, flag);
+  }
+
+  /// In-place 1D transform of a: forwards to the two-argument fft1 overload with the flag set to false
+  /// (fft1 passes true; the overload itself is not visible here).
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft1(hoNDArray< ComplexType >& a)
+  {
+    const bool flag = false;
+    return fft1(a, flag);
+  }
+
+  /// Out-of-place 1D transform of a into r. r is re-created with the dimensions of a when they differ,
+  /// then the call is forwarded to the three-argument fft1 overload with the flag set to true.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    const bool sameShape = r.dimensions_equal(&a);
+    if ( !sameShape )
+      {
+        r.create(a.get_dimensions());
+      }
+
+    // const is cast away to call the three-argument overload
+    hoNDArray< ComplexType >& src = const_cast<hoNDArray< ComplexType >&>(a);
+    return fft1(src, r, true);
+  }
+
+  /// Out-of-place 1D transform of a into r. r is re-created with the dimensions of a when they differ,
+  /// then the call is forwarded to the three-argument fft1 overload with the flag set to false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    const bool sameShape = r.dimensions_equal(&a);
+    if ( !sameShape )
+      {
+        r.create(a.get_dimensions());
+      }
+
+    // const is cast away to call the three-argument overload
+    hoNDArray< ComplexType >& src = const_cast<hoNDArray< ComplexType >&>(a);
+    return fft1(src, r, false);
+  }
+
+  /// Centered in-place 1D FFT of a: ifftshift1D, then fft1, then fftshift1D, all applied to a.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft1c(hoNDArray< ComplexType >& a)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift1D(a));
+    GADGET_CHECK_RETURN_FALSE(fft1(a));
+    GADGET_CHECK_RETURN_FALSE(fftshift1D(a));
+    return true;
+  }
+
+  /// Centered in-place 1D inverse FFT of a: ifftshift1D, then ifft1, then fftshift1D, all applied to a.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft1c(hoNDArray< ComplexType >& a)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift1D(a));
+    GADGET_CHECK_RETURN_FALSE(ifft1(a));
+    GADGET_CHECK_RETURN_FALSE(fftshift1D(a));
+    return true;
+  }
+
+  /// Centered 1D FFT of a into r: a is ifftshifted into r, then r is transformed and fftshifted in place.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift1D(a, r));
+    GADGET_CHECK_RETURN_FALSE(fft1(r));
+    GADGET_CHECK_RETURN_FALSE(fftshift1D(r));
+    return true;
+  }
+
+  /// Centered 1D inverse FFT of a into r: a is ifftshifted into r, then r is inverse-transformed and fftshifted in place.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift1D(a, r));
+    GADGET_CHECK_RETURN_FALSE(ifft1(r));
+    GADGET_CHECK_RETURN_FALSE(fftshift1D(r));
+    return true;
+  }
+
+  /// Centered 1D FFT of a into r using buf as an intermediate array:
+  /// a is ifftshifted into r, r is transformed into buf, and buf is fftshifted back into r.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift1D(a, r));
+    GADGET_CHECK_RETURN_FALSE(fft1(r, buf));
+    GADGET_CHECK_RETURN_FALSE(fftshift1D(buf, r));
+    return true;
+  }
+
+  /// Centered 1D inverse FFT of a into r using buf as an intermediate array:
+  /// a is ifftshifted into r, r is inverse-transformed into buf, and buf is fftshifted back into r.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift1D(a, r));
+    GADGET_CHECK_RETURN_FALSE(ifft1(r, buf));
+    GADGET_CHECK_RETURN_FALSE(fftshift1D(buf, r));
+    return true;
+  }
+
+  // -----------------------------------------------------------------------------------------
+
+  /// In-place 2D transform of a: forwards to the two-argument fft2 overload with the flag set to true
+  /// (ifft2 passes false; the overload itself is not visible here).
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a)
+  {
+    const bool flag = true;
+    return fft2(a, flag);
+  }
+
+  /// In-place 2D transform of a: forwards to the two-argument fft2 overload with the flag set to false
+  /// (fft2 passes true; the overload itself is not visible here).
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft2(hoNDArray< ComplexType >& a)
+  {
+    const bool flag = false;
+    return fft2(a, flag);
+  }
+
+  /// Out-of-place 2D transform of a into r. r is re-created with the dimensions of a when they differ,
+  /// then the call is forwarded to the three-argument fft2 overload with the flag set to true.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    const bool sameShape = r.dimensions_equal(&a);
+    if ( !sameShape )
+      {
+        r.create(a.get_dimensions());
+      }
+
+    // const is cast away to call the three-argument overload
+    hoNDArray< ComplexType >& src = const_cast<hoNDArray< ComplexType >&>(a);
+    return fft2(src, r, true);
+  }
+
+  /// Out-of-place 2D transform of a into r. r is re-created with the dimensions of a when they differ,
+  /// then the call is forwarded to the three-argument fft2 overload with the flag set to false.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    const bool sameShape = r.dimensions_equal(&a);
+    if ( !sameShape )
+      {
+        r.create(a.get_dimensions());
+      }
+
+    // const is cast away to call the three-argument overload
+    hoNDArray< ComplexType >& src = const_cast<hoNDArray< ComplexType >&>(a);
+    return fft2(src, r, false);
+  }
+
+  /// Centered in-place 2D FFT of a: ifftshift2D, then fft2, then fftshift2D, all applied to a.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft2c(hoNDArray< ComplexType >& a)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift2D(a));
+    GADGET_CHECK_RETURN_FALSE(fft2(a));
+    GADGET_CHECK_RETURN_FALSE(fftshift2D(a));
+    return true;
+  }
+
+  /// Centered in-place 2D inverse FFT of a: ifftshift2D, then ifft2, then fftshift2D, all applied to a.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft2c(hoNDArray< ComplexType >& a)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift2D(a));
+    GADGET_CHECK_RETURN_FALSE(ifft2(a));
+    GADGET_CHECK_RETURN_FALSE(fftshift2D(a));
+    return true;
+  }
+
+  /// Centered 2D FFT of a into r: a is ifftshifted into r, then r is transformed and fftshifted in place.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift2D(a, r));
+    GADGET_CHECK_RETURN_FALSE(fft2(r));
+    GADGET_CHECK_RETURN_FALSE(fftshift2D(r));
+    return true;
+  }
+
+  /// Centered 2D inverse FFT of a into r: a is ifftshifted into r, then r is inverse-transformed and fftshifted in place.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift2D(a, r));
+    GADGET_CHECK_RETURN_FALSE(ifft2(r));
+    GADGET_CHECK_RETURN_FALSE(fftshift2D(r));
+    return true;
+  }
+
+  /// Centered 2D FFT of a into r using buf as an intermediate array:
+  /// a is ifftshifted into r, r is transformed into buf, and buf is fftshifted back into r.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift2D(a, r));
+    GADGET_CHECK_RETURN_FALSE(fft2(r, buf));
+    GADGET_CHECK_RETURN_FALSE(fftshift2D(buf, r));
+    return true;
+  }
+
+  /// Centered 2D inverse FFT of a into r using buf as an intermediate array:
+  /// a is ifftshifted into r, r is inverse-transformed into buf, and buf is fftshifted back into r.
+  /// Each step is wrapped in GADGET_CHECK_RETURN_FALSE, so the first failing step ends the call.
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift2D(a, r));
+    GADGET_CHECK_RETURN_FALSE(ifft2(r, buf));
+    GADGET_CHECK_RETURN_FALSE(fftshift2D(buf, r));
+    return true;
+  }
+
+  // -----------------------------------------------------------------------------------------
+
+  template<typename T> // in-place forward 3D FFT
+  inline bool hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a)
+  {
+    return fft3(a, true); // second argument is the "forward" flag of the protected overload
+  }
+
+  template<typename T> // in-place inverse 3D FFT
+  inline bool hoNDFFT<T>::ifft3(hoNDArray< ComplexType >& a)
+  {
+    return fft3(a, false); // forward == false selects the inverse transform
+  }
+
+  template<typename T> 
+  inline bool hoNDFFT<T>::fft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    /* Out-of-place forward 3D FFT of a into r. (An earlier variant, "r = a; return fft3(r);", copied first.)
+       r is re-created with a's dimensions whenever the two differ. */
+    if ( !r.dimensions_equal(&a) )
+      {
+        r.create(a.get_dimensions());
+      }
+
+    return fft3(const_cast<hoNDArray< ComplexType >&>(a), r, true); // const_cast: the three-argument overload takes a non-const input reference
+  }
+
+  template<typename T> 
+  inline bool hoNDFFT<T>::ifft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    /* Out-of-place inverse 3D FFT of a into r. (An earlier variant, "r = a; return ifft3(r);", copied first.)
+       r is re-created with a's dimensions whenever the two differ. */
+    if ( !r.dimensions_equal(&a) )
+      {
+        r.create(a.get_dimensions());
+      }
+
+    return fft3(const_cast<hoNDArray< ComplexType >&>(a), r, false); // forward == false; const_cast because the overload takes a non-const input reference
+  }
+
+  template<typename T> // centered 3D FFT, in place: ifftshift3D, then fft3, then fftshift3D
+  inline bool hoNDFFT<T>::fft3c(hoNDArray< ComplexType >& a)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift3D(a)); // NOTE(review): macro defined elsewhere; presumably returns false from here when the call fails
+    GADGET_CHECK_RETURN_FALSE(fft3(a));
+    GADGET_CHECK_RETURN_FALSE(fftshift3D(a));
+    return true;
+  }
+
+  template<typename T> // centered inverse 3D FFT, in place; same shift order as fft3c, only the transform is ifft3
+  inline bool hoNDFFT<T>::ifft3c(hoNDArray< ComplexType >& a)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift3D(a)); // NOTE(review): macro defined elsewhere; presumably returns false from here when the call fails
+    GADGET_CHECK_RETURN_FALSE(ifft3(a));
+    GADGET_CHECK_RETURN_FALSE(fftshift3D(a));
+    return true;
+  }
+
+  template<typename T> // centered 3D FFT of a, result in r
+  inline bool hoNDFFT<T>::fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift3D(a, r)); // two-argument shift: reads a, writes r
+    GADGET_CHECK_RETURN_FALSE(fft3(r)); // the transform and the final shift use the single-argument (in-place) overloads on r
+    GADGET_CHECK_RETURN_FALSE(fftshift3D(r));
+    return true;
+  }
+
+  template<typename T> // centered inverse 3D FFT of a, result in r
+  inline bool hoNDFFT<T>::ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift3D(a, r)); // two-argument shift: reads a, writes r
+    GADGET_CHECK_RETURN_FALSE(ifft3(r)); // the transform and the final shift use the single-argument (in-place) overloads on r
+    GADGET_CHECK_RETURN_FALSE(fftshift3D(r));
+    return true;
+  }
+
+  template<typename T> // centered 3D FFT of a into r, using buf as the intermediate array
+  inline bool hoNDFFT<T>::fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift3D(a, r)); // shifted copy of a goes to r
+    GADGET_CHECK_RETURN_FALSE(fft3(r, buf)); // out-of-place transform: r -> buf
+    GADGET_CHECK_RETURN_FALSE(fftshift3D(buf, r)); // final shift: buf -> r, so the result ends up in r
+    return true;
+  }
+
+  template<typename T> // centered inverse 3D FFT of a into r, using buf as the intermediate array
+  inline bool hoNDFFT<T>::ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf)
+  {
+    GADGET_CHECK_RETURN_FALSE(ifftshift3D(a, r)); // shifted copy of a goes to r
+    GADGET_CHECK_RETURN_FALSE(ifft3(r, buf)); // out-of-place inverse transform: r -> buf
+    GADGET_CHECK_RETURN_FALSE(fftshift3D(buf, r)); // final shift: buf -> r, so the result ends up in r
+    return true;
+  }
+
+  // -----------------------------------------------------------------------------------------
+
+  // MKL related
+
+#ifdef USE_MKL
+
+  // Creates and commits an in-place complex DFTI descriptor for n batched NDim-dimensional transforms.
+  // dim: the NDim lengths. Both scales are 1/sqrt(prod(dim)). On any failure after creation the descriptor is freed.
+  template<typename T> 
+  bool hoNDFFT<T>::configureFFTHandle(long long NDim, MKL_LONG* dim, DFTI_CONFIG_VALUE fftPresion, size_t n, DFTI_DESCRIPTOR_HANDLE& handle)
+  {
+    MKL_LONG res;
+
+    if ( NDim == 1 )
+      {
+        if ( (res=DftiCreateDescriptor( &handle, fftPresion, DFTI_COMPLEX, NDim, dim[0])) != 0 )
+          {
+            GADGET_ERROR_MSG( DftiErrorMessage(res) );
+            return false;
+          }
+      }
+    else
+      {
+        if ( (res=DftiCreateDescriptor( &handle, fftPresion, DFTI_COMPLEX, NDim, dim)) != 0 )
+          {
+            GADGET_ERROR_MSG( DftiErrorMessage(res) );
+            return false;
+          }
+      }
+
+    double fftScaling = 1.0; // product of all transform lengths
+    for ( long long ii=0; ii<NDim; ii++ )
+      {
+        fftScaling *= dim[ii];
+      }
+
+    if ( (res=DftiSetValue( handle, DFTI_FORWARD_SCALE, 1.0/std::sqrt(fftScaling))) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle); // the caller never sees a usable handle, so release it here
+      return false;
+    }
+
+    if ( (res=DftiSetValue( handle, DFTI_BACKWARD_SCALE, 1.0/std::sqrt(fftScaling))) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle);
+      return false;
+    }
+
+    if ( (res=DftiSetValue( handle, DFTI_PLACEMENT, DFTI_INPLACE)) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle);
+      return false;
+    }
+
+    if ( n > 1 )
+      {
+        if ( (res=DftiSetValue( handle, DFTI_NUMBER_OF_TRANSFORMS, static_cast<MKL_LONG>(n))) != 0 ) { // varargs: DFTI reads an MKL_LONG
+          GADGET_ERROR_MSG( DftiErrorMessage(res) );
+          DftiFreeDescriptor(&handle);
+          return false;
+        }
+
+        if ( (res=DftiSetValue( handle, DFTI_INPUT_DISTANCE, static_cast<MKL_LONG>(fftScaling))) != 0 ) { // consecutive transforms are prod(dim) elements apart
+          GADGET_ERROR_MSG( DftiErrorMessage(res) );
+          DftiFreeDescriptor(&handle);
+          return false;
+        }
+
+        if ( (res=DftiSetValue( handle, DFTI_OUTPUT_DISTANCE, static_cast<MKL_LONG>(fftScaling))) != 0 ) {
+          GADGET_ERROR_MSG( DftiErrorMessage(res) );
+          DftiFreeDescriptor(&handle);
+          return false;
+        }
+      }
+
+    if ( (res=DftiCommitDescriptor( handle)) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle);
+      return false;
+    }
+
+    return true;
+  }
+
+  // Creates and commits an out-of-place (DFTI_NOT_INPLACE) complex DFTI descriptor for n batched NDim-dimensional transforms.
+  // dim: the NDim lengths. Both scales are 1/sqrt(prod(dim)). On any failure after creation the descriptor is freed.
+  template<typename T> 
+  bool hoNDFFT<T>::configureFFTHandleOutOfPlace(long long NDim, MKL_LONG* dim, DFTI_CONFIG_VALUE fftPresion, size_t n, DFTI_DESCRIPTOR_HANDLE& handle)
+  {
+    MKL_LONG res;
+
+    if ( NDim == 1 )
+      {
+        if ( (res=DftiCreateDescriptor( &handle, fftPresion, DFTI_COMPLEX, NDim, dim[0])) != 0 )
+          {
+            GADGET_ERROR_MSG( DftiErrorMessage(res) );
+            return false;
+          }
+      }
+    else
+      {
+        if ( (res=DftiCreateDescriptor( &handle, fftPresion, DFTI_COMPLEX, NDim, dim)) != 0 )
+          {
+            GADGET_ERROR_MSG( DftiErrorMessage(res) );
+            return false;
+          }
+      }
+
+    double fftScaling = 1.0; // product of all transform lengths
+    for ( long long ii=0; ii<NDim; ii++ )
+      {
+        fftScaling *= dim[ii];
+      }
+
+    if ( (res=DftiSetValue( handle, DFTI_FORWARD_SCALE, 1.0/std::sqrt(fftScaling))) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle); // the caller never sees a usable handle, so release it here
+      return false;
+    }
+
+    if ( (res=DftiSetValue( handle, DFTI_BACKWARD_SCALE, 1.0/std::sqrt(fftScaling))) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle);
+      return false;
+    }
+
+    if ( (res=DftiSetValue( handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE)) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle);
+      return false;
+    }
+
+    if ( n > 1 )
+      {
+        if ( (res=DftiSetValue( handle, DFTI_NUMBER_OF_TRANSFORMS, static_cast<MKL_LONG>(n))) != 0 ) { // varargs: DFTI reads an MKL_LONG
+          GADGET_ERROR_MSG( DftiErrorMessage(res) );
+          DftiFreeDescriptor(&handle);
+          return false;
+        }
+
+        if ( (res=DftiSetValue( handle, DFTI_INPUT_DISTANCE, static_cast<MKL_LONG>(fftScaling))) != 0 ) { // consecutive transforms are prod(dim) elements apart
+          GADGET_ERROR_MSG( DftiErrorMessage(res) );
+          DftiFreeDescriptor(&handle);
+          return false;
+        }
+
+        if ( (res=DftiSetValue( handle, DFTI_OUTPUT_DISTANCE, static_cast<MKL_LONG>(fftScaling))) != 0 ) {
+          GADGET_ERROR_MSG( DftiErrorMessage(res) );
+          DftiFreeDescriptor(&handle);
+          return false;
+        }
+      }
+
+    if ( (res=DftiCommitDescriptor( handle)) != 0 ) {
+      GADGET_ERROR_MSG( DftiErrorMessage(res) );
+      DftiFreeDescriptor(&handle);
+      return false;
+    }
+
+    return true;
+  }
+
+  template<typename T> 
+  bool hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, bool forward)
+  {
+    size_t n = a.get_number_of_elements()/a.get_size(0);
+    MKL_LONG dim = a.get_size(0);
+
+    DFTI_DESCRIPTOR_HANDLE handle;
+
+    if ( typeid(T) == typeid(float) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandle(1, &dim, DFTI_SINGLE, n, handle));
+      }
+    else if ( typeid(T) == typeid(double) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandle(1, &dim, DFTI_DOUBLE, n, handle));
+      }
+    else
+      {
+        GADGET_ERROR_MSG("hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a), only float and double are supported ... ");
+        return false;
+      }
+
+    MKL_LONG res;
+
+    if ( forward )
+      {
+        if ( ( res=DftiComputeForward(handle, reinterpret_cast<T*>(a.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+    else
+      {
+        if ( ( res=DftiComputeBackward(handle, reinterpret_cast<T*>(a.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+
+    if ( ( res=DftiFreeDescriptor(&handle) ) != 0 ) 
+      { 
+        GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+        return false; 
+      }
+
+    return true;
+  }
+
+  template<typename T> 
+  bool hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+  {
+    size_t n = a.get_number_of_elements()/a.get_size(0);
+    MKL_LONG dim = a.get_size(0);
+
+    DFTI_DESCRIPTOR_HANDLE handle;
+
+    if ( typeid(T) == typeid(float) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandleOutOfPlace(1, &dim, DFTI_SINGLE, n, handle));
+      }
+    else if ( typeid(T) == typeid(double) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandleOutOfPlace(1, &dim, DFTI_DOUBLE, n, handle));
+      }
+    else
+      {
+        GADGET_ERROR_MSG("hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r), only float and double are supported ... ");
+        return false;
+      }
+
+    MKL_LONG res;
+
+    if ( forward )
+      {
+        if ( ( res=DftiComputeForward( handle, reinterpret_cast<T*>(a.begin()), reinterpret_cast<T*>(r.begin()) ) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+    else
+      {
+        if ( ( res=DftiComputeBackward( handle, reinterpret_cast<T*>(a.begin()), reinterpret_cast<T*>(r.begin()) ) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+
+    if ( ( res=DftiFreeDescriptor(&handle) ) != 0 ) 
+      { 
+        GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+        return false; 
+      }
+
+    return true;
+  }
+
+  template<typename T> 
+  bool hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a, bool forward)
+  {
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+    MKL_LONG dim[2];
+    dim[0] = a.get_size(1);
+    dim[1] = a.get_size(0);
+
+    DFTI_DESCRIPTOR_HANDLE handle;
+
+    if ( typeid(T) == typeid(float) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandle(2, dim, DFTI_SINGLE, n, handle));
+      }
+    else if ( typeid(T) == typeid(double) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandle(2, dim, DFTI_DOUBLE, n, handle));
+      }
+    else
+      {
+        GADGET_ERROR_MSG("hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a), only float and double are supported ... ");
+        return false;
+      }
+
+    MKL_LONG res;
+    if ( forward )
+      {
+        if ( ( res=DftiComputeForward(handle, reinterpret_cast<T*>(a.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+    else
+      {
+        if ( ( res=DftiComputeBackward(handle, reinterpret_cast<T*>(a.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+
+    if ( ( res=DftiFreeDescriptor(&handle) ) != 0 ) 
+      { 
+        GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+        return false; 
+      }
+
+    return true;
+  }
+
+  template<typename T> 
+  bool hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+  {
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1));
+    MKL_LONG dim[2];
+    dim[0] = a.get_size(1);
+    dim[1] = a.get_size(0);
+
+    DFTI_DESCRIPTOR_HANDLE handle;
+
+    if ( typeid(T) == typeid(float) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandleOutOfPlace(2, dim, DFTI_SINGLE, n, handle));
+      }
+    else if ( typeid(T) == typeid(double) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandleOutOfPlace(2, dim, DFTI_DOUBLE, n, handle));
+      }
+    else
+      {
+        GADGET_ERROR_MSG("hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a), only float and double are supported ... ");
+        return false;
+      }
+
+    MKL_LONG res;
+    if ( forward )
+      {
+        if ( ( res=DftiComputeForward(handle, reinterpret_cast<T*>(a.begin()), reinterpret_cast<T*>(r.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+    else
+      {
+        if ( ( res=DftiComputeBackward(handle, reinterpret_cast<T*>(a.begin()), reinterpret_cast<T*>(r.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+
+    if ( ( res=DftiFreeDescriptor(&handle) ) != 0 ) 
+      { 
+        GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+        return false; 
+      }
+
+    return true;
+  }
+
+  template<typename T> 
+  bool hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a, bool forward)
+  {
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+
+    MKL_LONG dim[3];
+    dim[0] = a.get_size(2);
+    dim[1] = a.get_size(1);
+    dim[2] = a.get_size(0);
+
+    DFTI_DESCRIPTOR_HANDLE handle;
+
+    if ( typeid(T) == typeid(float) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandle(3, dim, DFTI_SINGLE, n, handle));
+      }
+    else if ( typeid(T) == typeid(double) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandle(3, dim, DFTI_DOUBLE, n, handle));
+      }
+    else
+      {
+        GADGET_ERROR_MSG("hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a), only float and double are supported ... ");
+        return false;
+      }
+
+    MKL_LONG res;
+    if ( forward )
+      {
+        if ( ( res=DftiComputeForward(handle, reinterpret_cast<T*>(a.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+    else
+      {
+        if ( ( res=DftiComputeBackward(handle, reinterpret_cast<T*>(a.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+
+    if ( ( res=DftiFreeDescriptor(&handle) ) != 0 ) 
+      { 
+        GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+        return false; 
+      }
+
+    return true;
+  }
+
+  template<typename T> 
+  bool hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+  {
+    size_t n = a.get_number_of_elements()/(a.get_size(0)*a.get_size(1)*a.get_size(2));
+
+    MKL_LONG dim[3];
+    dim[0] = a.get_size(2);
+    dim[1] = a.get_size(1);
+    dim[2] = a.get_size(0);
+
+    DFTI_DESCRIPTOR_HANDLE handle;
+
+    if ( typeid(T) == typeid(float) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandleOutOfPlace(3, dim, DFTI_SINGLE, n, handle));
+      }
+    else if ( typeid(T) == typeid(double) )
+      {
+        GADGET_CHECK_RETURN_FALSE(configureFFTHandleOutOfPlace(3, dim, DFTI_DOUBLE, n, handle));
+      }
+    else
+      {
+        GADGET_ERROR_MSG("hoNDFFT<T>::fft3(a, r), only float and double are supported ... ");
+        return false;
+      }
+
+    MKL_LONG res;
+    if ( forward )
+      {
+        if ( ( res=DftiComputeForward(handle, reinterpret_cast<T*>(a.begin()), reinterpret_cast<T*>(r.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+    else
+      {
+        if ( ( res=DftiComputeBackward(handle, reinterpret_cast<T*>(a.begin()), reinterpret_cast<T*>(r.begin())) ) != 0 ) 
+          { 
+            GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+            return false; 
+          }
+      }
+
+    if ( ( res=DftiFreeDescriptor(&handle) ) != 0 ) 
+      { 
+        GADGET_ERROR_MSG( DftiErrorMessage(res) ); 
+        return false; 
+      }
+
+    return true;
+  }
+
+#else
+
+  // In-place 1D FFT (FFTW path): transforms dimension 0, the "first dimension" the header documents and
+  // the one the MKL build transforms. forward selects fft() or ifft(). Always returns true.
+  template<typename T> bool hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, bool forward)
+  {
+    if ( forward )
+      {
+        fft(&a, 0); // dimension indices are 0-based; index 1 would be the second dimension
+      }
+    else
+      {
+        ifft(&a, 0);
+      }
+    return true;
+  }
+
+  // In-place 2D FFT (FFTW path): transforms only the first two dimensions, as the header documents and
+  // as the MKL build does; further dimensions act as a batch. forward selects fft() or ifft(). Always returns true.
+  template<typename T> bool hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a, bool forward)
+  {
+    // An array with fewer than two dimensions is transformed along the dimensions it has.
+    const size_t ndim = a.get_number_of_dimensions();
+    for ( unsigned int d=0; d<2 && d<ndim; d++ )
+      {
+        if ( forward ) fft(&a, d);
+        else ifft(&a, d);
+      }
+
+    return true;
+  }
+
+  // In-place 3D FFT (FFTW path): transforms only the first three dimensions, as the header documents and
+  // as the MKL build does; further dimensions act as a batch. forward selects fft() or ifft(). Always returns true.
+  template<typename T> bool hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a, bool forward)
+  {
+    // An array with fewer than three dimensions is transformed along the dimensions it has.
+    const size_t ndim = a.get_number_of_dimensions();
+    for ( unsigned int d=0; d<3 && d<ndim; d++ )
+      {
+        if ( forward ) fft(&a, d);
+        else ifft(&a, d);
+      }
+
+    return true;
+  }
+
+  // Out-of-place 1D FFT (FFTW path): copies a into r, then transforms dimension 0 of r, the "first
+  // dimension" the header documents and the one the MKL build transforms. Always returns true.
+  template<typename T> bool hoNDFFT<T>::fft1(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+  {
+    r = a;
+    if ( forward )
+      {
+        fft(&r, 0); // dimension indices are 0-based; index 1 would be the second dimension
+      }
+    else
+      {
+        ifft(&r, 0);
+      }
+    return true;
+  }
+
+  // Out-of-place 2D FFT (FFTW path): copies a into r and transforms the first two dimensions of r.
+  // r, not a, is transformed: a is the caller's input (const in the public overloads). Always returns true.
+  template<typename T> bool hoNDFFT<T>::fft2(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+  {
+    r = a;
+    // An array with fewer than two dimensions is transformed along the dimensions it has.
+    const size_t ndim = r.get_number_of_dimensions();
+    for ( unsigned int d=0; d<2 && d<ndim; d++ )
+      {
+        if ( forward ) fft(&r, d);
+        else ifft(&r, d);
+      }
+
+    return true;
+  }
+
+  // Out-of-place 3D FFT (FFTW path): copies a into r and transforms the first three dimensions of r.
+  // r, not a, is transformed: a is the caller's input (const in the public overloads). Always returns true.
+  template<typename T> bool hoNDFFT<T>::fft3(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward)
+  {
+    r = a;
+    // An array with fewer than three dimensions is transformed along the dimensions it has.
+    const size_t ndim = r.get_number_of_dimensions();
+    for ( unsigned int d=0; d<3 && d<ndim; d++ )
+      {
+        if ( forward ) fft(&r, d);
+        else ifft(&r, d);
+      }
+
+    return true;
+  }
+
+#endif // USE_MKL
+
+  // 
+  // Instantiation
+  //
+  
+  template class hoNDFFT<float>;
+  template class hoNDFFT<double>;
+}
diff --git a/toolboxes/core/cpu/hoNDFFT.h b/toolboxes/core/cpu/hoNDFFT.h
new file mode 100644
index 0000000..cd0a61d
--- /dev/null
+++ b/toolboxes/core/cpu/hoNDFFT.h
@@ -0,0 +1,222 @@
+/** \file hoNDFFT.h
+    \brief Wrappers for FFTW for ndarrays of type std::complex.
+*/
+
+#ifndef hoNDFFT_H
+#define hoNDFFT_H
+
+#include "hoNDArray.h"
+#include "cpucore_export.h"
+
+#include <boost/thread/mutex.hpp>
+#include <iostream>
+#include <fftw3.h>
+#include <complex>
+
+#ifdef USE_MKL
+    #include "mkl.h"
+#endif // USE_MKL
+
+namespace Gadgetron{
+
+    /** 
+    Generic class for Fast Fourier Transforms using FFTW on the hoNDArray class.
+    This class is a singleton because the planning and memory allocation routines of FFTW are NOT threadsafe.
+    The class' template type is a REAL, ie. float or double.
+
+    Access using e.g.
+    hoNDFFT<float>::instance()
+    */
+    template <typename T> class EXPORTCPUCORE hoNDFFT
+    {
+    public:
+
+        typedef std::complex<T> ComplexType; // element type of every array this class transforms
+
+        static hoNDFFT<T>* instance(); // singleton accessor (declared here, defined elsewhere)
+
+        void fft(hoNDArray< ComplexType >* input, unsigned int dim_to_transform) // forward transform along one dimension
+        {
+            //-1 refers to the sign of the transform, -1 for FFTW_FORWARD
+            fft_int(input,dim_to_transform,-1);
+        }
+
+        void ifft(hoNDArray< ComplexType >* input, unsigned int dim_to_transform) // inverse transform along one dimension
+        {
+            //1 refers to the sign of the transform, +1 for FFTW_BACKWARD
+            fft_int(input,dim_to_transform,1);
+        }
+
+        void fft(hoNDArray< ComplexType >* input) // forward transform along every dimension, one fft_int call per dimension
+        {
+            for (size_t i = 0; i < input->get_number_of_dimensions(); i++) { // dimension indices start at 0
+                //-1 refers to the sign of the transform, -1 for FFTW_FORWARD
+                fft_int(input,i,-1);
+            }
+        }
+
+        void ifft(hoNDArray< ComplexType >* input) // inverse transform along every dimension, one fft_int call per dimension
+        {
+            for (size_t i = 0; i < input->get_number_of_dimensions(); i++) { // dimension indices start at 0
+                //1 refers to the sign of the transform, +1 for FFTW_BACKWARD
+                fft_int(input,i,1);
+            }
+        }
+
+        // 1D shifts: the one-argument form works in place, the two-argument form reads a and writes r
+        bool fftshift1D(hoNDArray< ComplexType >& a);
+        bool fftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        bool ifftshift1D(hoNDArray< ComplexType >& a);
+        bool ifftshift1D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // 2D shifts, same in-place / out-of-place pairing
+        bool fftshift2D(hoNDArray< ComplexType >& a);
+        bool fftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        bool ifftshift2D(hoNDArray< ComplexType >& a);
+        bool ifftshift2D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // 3D shifts, same in-place / out-of-place pairing
+        bool fftshift3D(hoNDArray< ComplexType >& a);
+        bool fftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        bool ifftshift3D(hoNDArray< ComplexType >& a);
+        bool ifftshift3D(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // 1D fft, in-place and out-of-place
+        // the first dimension will be transformed
+        bool fft1(hoNDArray< ComplexType >& a);
+        bool ifft1(hoNDArray< ComplexType >& a);
+
+        bool fft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        bool ifft1(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // centered 1D fft
+        bool fft1c(hoNDArray< ComplexType >& a);
+        bool ifft1c(hoNDArray< ComplexType >& a);
+
+        bool fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        bool ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        bool fft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf); // buf: caller-supplied intermediate array
+        bool ifft1c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+
+        // 2D fft, in-place and out-of-place
+        // the first and second dimensions will be transformed
+        bool fft2(hoNDArray< ComplexType >& a);
+        bool ifft2(hoNDArray< ComplexType >& a);
+
+        bool fft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        bool ifft2(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // centered 2D fft
+        bool fft2c(hoNDArray< ComplexType >& a);
+        bool ifft2c(hoNDArray< ComplexType >& a);
+
+        bool fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        bool ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        bool fft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf); // buf: caller-supplied intermediate array
+        bool ifft2c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+
+        // 3D fft, in-place and out-of-place
+        // the first, second and third dimensions will be transformed
+        bool fft3(hoNDArray< ComplexType >& a);
+        bool ifft3(hoNDArray< ComplexType >& a);
+
+        bool fft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        bool ifft3(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        // centered 3D fft
+        bool fft3c(hoNDArray< ComplexType >& a);
+        bool ifft3c(hoNDArray< ComplexType >& a);
+
+        bool fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+        bool ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r);
+
+        bool fft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf); // buf: caller-supplied intermediate array
+        bool ifft3c(const hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, hoNDArray< ComplexType >& buf);
+
+    protected:
+
+        // The constructor and destructor are protected because this class is a singleton
+
+        hoNDFFT() {
+            set_function_pointers();
+        }
+
+        virtual ~hoNDFFT() { fftw_cleanup_ptr_(); } // calls through the cleanup function pointer declared below
+
+        void fft_int(hoNDArray< ComplexType >* input, size_t dim_to_transform, int sign); // sign: -1 forward, +1 backward (see fft/ifft above)
+
+        void set_function_pointers(); // NOTE(review): presumably fills in the *_ptr_ members below - confirm in the .cpp
+
+        int   (*fftw_import_wisdom_from_file_ptr_)(FILE*); // NOTE(review): presumably bound to the fftw_* or fftwf_* variants depending on T - confirm
+        void  (*fftw_export_wisdom_to_file_ptr_)(FILE*);
+        void  (*fftw_cleanup_ptr_)(void);
+        void* (*fftw_malloc_ptr_)(size_t);
+        void  (*fftw_free_ptr_)(void* p);
+        void  (*fftw_execute_ptr_)(void*);
+        void* (*fftw_plan_dft_1d_ptr_)(int, void*, void*, int, unsigned);
+        void  (*fftw_destroy_plan_ptr_)(void*);
+
+        static hoNDFFT<T>* instance_;
+        boost::mutex mutex_; // NOTE(review): presumably serialises the FFTW planning calls, which the class comment says are not thread-safe - confirm in fft_int
+
+        // the fft and ifft shift pivot for a certain length
+        // [0 .. pivot-1] will be shifted to the right end
+        size_t fftshiftPivot(size_t len);
+        size_t ifftshiftPivot(size_t len);
+
+        // 1D shift helpers on raw pointers (x: length; n: presumably the number of lines - confirm)
+        bool fftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot);
+        bool ifftshift1D(const ComplexType* a, ComplexType* r, size_t x, size_t pivot);
+
+        bool fftshiftPivot1D(ComplexType* a, size_t x, size_t n, size_t pivot);
+        bool fftshiftPivot1D(const ComplexType* a, ComplexType* r, size_t x, size_t n, size_t pivot);
+
+        // 2D shift helpers on raw pointers; note the pivots are `unsigned` here but size_t in the 1D helpers
+        bool fftshiftPivot2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n, unsigned pivotx, unsigned pivoty);
+        bool fftshiftPivot2D(ComplexType* a, size_t x, size_t y, size_t n, unsigned pivotx, unsigned pivoty);
+
+        bool fftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n);
+        bool ifftshift2D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t n);
+
+        bool fftshift2D(ComplexType* a, size_t x, size_t y, size_t n);
+        bool ifftshift2D(ComplexType* a, size_t x, size_t y, size_t n);
+
+        // 3D shift helpers on raw pointers; pivots are `unsigned` as in the 2D helpers
+        bool fftshiftPivot3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n, unsigned pivotx, unsigned pivoty, unsigned pivotz);
+        bool fftshiftPivot3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n, unsigned pivotx, unsigned pivoty, unsigned pivotz);
+
+        bool fftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n);
+        bool ifftshift3D(const ComplexType* a, ComplexType* r, size_t x, size_t y, size_t z, size_t n);
+
+        bool fftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n);
+        bool ifftshift3D(ComplexType* a, size_t x, size_t y, size_t z, size_t n);
+
+        // forward: true, fft; false, inverse fft
+        bool fft1(hoNDArray< ComplexType >& a, bool forward);
+        bool fft2(hoNDArray< ComplexType >& a, bool forward);
+        bool fft3(hoNDArray< ComplexType >& a, bool forward);
+
+        bool fft1(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward); // out-of-place: a is the input, r the output
+        bool fft2(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward);
+        bool fft3(hoNDArray< ComplexType >& a, hoNDArray< ComplexType >& r, bool forward);
+
+        #ifdef USE_MKL
+
+        // configure the 1D/2D/3D MKL based fft handles
+        // NDim: the number of transformed dimensions; dim: array holding the NDim dimension lengths
+        // n: the number of transforms in the batch
+        // handle: the fft MKL handle (output)
+        // fftPresion: the precision, DFTI_SINGLE or DFTI_DOUBLE
+        bool configureFFTHandle(long long NDim, MKL_LONG* dim, DFTI_CONFIG_VALUE fftPresion, size_t n, DFTI_DESCRIPTOR_HANDLE& handle);
+        bool configureFFTHandleOutOfPlace(long long NDim, MKL_LONG* dim, DFTI_CONFIG_VALUE fftPresion, size_t n, DFTI_DESCRIPTOR_HANDLE& handle);
+
+        #endif // USE_MKL
+    };
+}
+
+#endif //hoNDFFT_H
diff --git a/toolboxes/core/cpu/hostutils/CMakeLists.txt b/toolboxes/core/cpu/hostutils/CMakeLists.txt
new file mode 100644
index 0000000..c8800c4
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/CMakeLists.txt
@@ -0,0 +1,18 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_HOSTUTILS__) # selects the dllexport branch of EXPORTHOSTUTILS in hostutils_export.h
+endif (WIN32)
+
+add_library(hostutils SHARED # shared library built from the single source file listed below
+  parameterparser.cpp
+  )
+
+install(TARGETS # the built library is installed under lib
+  hostutils 
+  DESTINATION lib)
+
+install(FILES # the public headers are installed under include
+  hostutils_export.h 
+  parameterparser.h 
+  url_encode.h 
+  FileInfo.h 
+  DESTINATION include)
diff --git a/toolboxes/core/cpu/hostutils/FileInfo.h b/toolboxes/core/cpu/hostutils/FileInfo.h
new file mode 100644
index 0000000..8ae7469
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/FileInfo.h
@@ -0,0 +1,54 @@
+#ifndef FILEINFO_H_
+#define FILEINFO_H_
+
+#include <string>
+#include <fstream>
+
+namespace Gadgetron {
+
+/**
+ *  Simple wrapper class for getting file info (file exists, file length, etc) before accessing file
+ *
+ */
+class FileInfo
+{
+public:
+
+	/**
+	 *   Constructor. Tries to open the file for reading and records in file_exists_ whether that worked.
+	 */
+	FileInfo(std::string filename)
+	{
+		filename_ = filename;
+		std::ifstream ifile(filename_.c_str());
+		file_exists_ = !ifile.fail(); // explicit state test; the implicit stream-to-bool assignment does not compile as C++11
+	}
+
+	virtual ~FileInfo() {}
+
+	/**
+	 *  Does the file exist (could it be opened when this object was constructed)
+	 */
+	bool exists() {
+		return file_exists_;
+	}
+
+	/**
+	 *  Size of the file in bytes, or size_t(-1) if it did not exist at construction or cannot be measured now
+	 */
+	size_t length() {
+		if (!file_exists_) return static_cast<size_t>(-1);
+		std::ifstream ifile(filename_.c_str(), std::ios::in | std::ios::binary); // binary: the end position is a byte count
+		ifile.seekg(0,std::ios::end);
+		const std::streamoff end_pos = ifile.tellg(); // -1 when the file can no longer be opened or sought
+		if (end_pos < 0) return static_cast<size_t>(-1);
+		return static_cast<size_t>(end_pos);
+	}
+
+protected:
+	bool file_exists_;
+	std::string filename_;
+};
+}
+
+#endif /* FILEINFO_H_ */
diff --git a/toolboxes/core/cpu/hostutils/hostutils_export.h b/toolboxes/core/cpu/hostutils/hostutils_export.h
new file mode 100644
index 0000000..2c732db
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/hostutils_export.h
@@ -0,0 +1,22 @@
+/*
+ * hostutils_export.h
+ *
+ *  Created on: Nov 18, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef HOSTUTILS_EXPORT_H_
+#define HOSTUTILS_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_HOSTUTILS__) || defined (hostutils_EXPORTS) // set while compiling the hostutils library itself
+#define EXPORTHOSTUTILS __declspec(dllexport)
+#else // any other translation unit imports the symbols
+#define EXPORTHOSTUTILS __declspec(dllimport)
+#endif
+#else // not WIN32: the macro expands to nothing
+#define EXPORTHOSTUTILS
+#endif
+
+
+#endif /* HOSTUTILS_EXPORT_H_ */
diff --git a/toolboxes/core/cpu/hostutils/parameterparser.cpp b/toolboxes/core/cpu/hostutils/parameterparser.cpp
new file mode 100644
index 0000000..883a768
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/parameterparser.cpp
@@ -0,0 +1,330 @@
+#include "parameterparser.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+
+namespace Gadgetron {
+
+  /**
+   *  Describes one command line switch. The parameter starts out unset; storage
+   *  for nr_values int, float and string values is allocated here, or left as
+   *  null pointers when nr_values is zero.
+   */
+  CommandLineParameter::CommandLineParameter(char com_switch,CommandLineParameterType type, unsigned int nr_values, const char* desc, bool required)
+    : m_type(type)
+    , m_switch(com_switch)
+    , m_nr_values(nr_values)
+    , m_desc(desc)
+    , m_is_set(false)
+    , m_is_required(required)
+    , m_int_value(0)
+    , m_string_value(0)
+    , m_float_value(0)
+  {
+    if (m_nr_values == 0) return;
+
+    m_int_value = new int[m_nr_values];
+    m_float_value = new float[m_nr_values];
+    m_string_value = new std::string[m_nr_values];
+  }
+
+  /** Releases the three value arrays (deleting a null pointer is a no-op). */
+  CommandLineParameter::~CommandLineParameter()
+  {
+    delete [] m_int_value;
+    delete [] m_float_value;
+    delete [] m_string_value;
+  }
+
+  /** Value number i as a C string, or a null pointer when i is out of range. */
+  const char* CommandLineParameter::get_string_value(unsigned int i)
+  {
+    return (i < m_nr_values) ? m_string_value[i].c_str() : 0;
+  }
+
+  /** Value number i as an int, or 0 when i is out of range. */
+  int CommandLineParameter::get_int_value(unsigned int i)
+  {
+    return (i < m_nr_values) ? m_int_value[i] : 0;
+  }
+
+  /** Value number i as a float, or 0.0f when i is out of range. */
+  float CommandLineParameter::get_float_value(unsigned int i)
+  {
+    return (i < m_nr_values) ? m_float_value[i] : 0.0f;
+  }
+
+  /** True once set_value() has been called on this parameter. */
+  bool CommandLineParameter::get_is_set()
+  {
+    const bool value_assigned = m_is_set;
+    return value_assigned;
+  }
+
+  /** The "required" flag passed to the constructor. */
+  bool CommandLineParameter::get_is_required()
+  {
+    const bool mandatory = m_is_required;
+    return mandatory;
+  }
+
+  /** True when com_switch is the switch character of this parameter. */
+  bool CommandLineParameter::is_switch_equal_to(char com_switch)
+  {
+    return com_switch == m_switch;
+  }
+
+  /**
+   *  Reads m_nr_values consecutive entries from argv and stores them.
+   *
+   *  Every entry is kept as a string. For COMMAND_LINE_INT and COMMAND_LINE_FLOAT
+   *  parameters the text is parsed as a float and the int value is that float
+   *  truncated; text that does not parse yields 0. For the other types the
+   *  int and float values are set to 1.
+   *
+   *  @param argv points at the first value; must hold at least m_nr_values entries
+   *  @return pointer just past the consumed entries
+   */
+  char** CommandLineParameter::set_value(char** argv)
+  {
+    const bool numeric = (m_type == COMMAND_LINE_FLOAT || m_type == COMMAND_LINE_INT);
+
+    for (unsigned int i = 0; i < m_nr_values;i++){
+      m_string_value[i] = std::string(argv[i]);
+      if (numeric){
+        // Start from a defined value: a failed extraction must not leave the
+        // stored number indeterminate.
+        float parsed = 0.0f;
+        std::stringstream ss (m_string_value[i]);
+        if (!(ss >> parsed)) {
+          parsed = 0.0f;
+        }
+        m_float_value[i] = parsed;
+        m_int_value[i] = static_cast<int>(parsed);
+      }
+      else{
+        m_int_value[i] = 1;
+        m_float_value[i] = 1.0f;
+      }
+    }
+    m_is_set = true;
+
+    return (argv+m_nr_values);
+  }
+
+  /** Number of values this parameter takes, converted to int. */
+  int CommandLineParameter::get_number_of_values()
+  {
+    const int value_count = m_nr_values;
+    return value_count;
+  }
+
+  /** The switch character of this parameter. */
+  char CommandLineParameter::get_switch()
+  {
+    const char switch_char = m_switch;
+    return switch_char;
+  }
+
+  /** A copy of the description text given at construction. */
+  std::string CommandLineParameter::get_desc()
+  {
+    std::string description(m_desc);
+    return description;
+  }
+
+  /**
+   *  Creates an empty parser with room for list_size parameters.
+   *  list_increment is the amount expand_list() adds to the capacity.
+   */
+  ParameterParser::ParameterParser(int list_size, int list_increment)
+    : m_parameter_list(new CommandLineParameter*[list_size])
+    , m_number_of_parameters(0)
+    , m_list_size(list_size)
+    , m_list_increment(list_increment)
+    , m_max_desc_length(0)
+    , m_max_number_values(0)
+  {
+  }
+
+  /** Destroys all registered parameters and the list holding them. */
+  ParameterParser::~ParameterParser()
+  {
+    this->delete_list();
+  }
+
+  /**
+   *  Grows the parameter list by m_list_increment slots (at least one) and
+   *  moves the existing entries over.
+   */
+  void ParameterParser::expand_list()
+  {
+    // A non-positive increment would never make room for the next entry.
+    const int increment = (m_list_increment > 0) ? m_list_increment : 1;
+    const int new_list_size = m_list_size + increment;
+    CommandLineParameter **new_list = new CommandLineParameter*[new_list_size];
+
+    for (int i = 0; i < m_number_of_parameters; i++){
+      new_list[i] = m_parameter_list[i];
+    }
+
+    delete [] m_parameter_list;
+    m_parameter_list = new_list;
+    // Record the new capacity. Without this the next expansion allocates the
+    // same size again and the list is eventually written past its end.
+    m_list_size = new_list_size;
+  }
+
+  /** Deletes every registered parameter, then the pointer array itself. */
+  void ParameterParser::delete_list()
+  {
+    int index = 0;
+    while (index < m_number_of_parameters){
+      delete m_parameter_list[index];
+      ++index;
+    }
+    delete [] m_parameter_list;
+  }
+
+  /**
+   *  Registers a parameter and immediately assigns it default values.
+   *
+   *  @param def whitespace separated default values. Missing tokens (or a null
+   *             pointer) become empty strings.
+   *  @return 0 on success, or the error code of the plain add_parameter()
+   *          overload if the parameter could not be registered. In that case
+   *          no existing parameter is modified.
+   */
+  int ParameterParser::add_parameter(char com_switch,CommandLineParameterType type, unsigned int nr_values, 
+                                     const char* desc, bool required, const char* def)
+  {
+    // Stop on failure: otherwise the defaults would be written into whatever
+    // parameter happens to be last in the list.
+    const int status = add_parameter(com_switch, type, nr_values, desc, required);
+    if (status != 0){
+      return status;
+    }
+
+    std::stringstream ss (std::stringstream::in | std::stringstream::out);
+    if (def != 0){
+      ss << def;
+    }
+
+    // Fill all tokens first so the c_str() pointers below stay valid.
+    std::vector<std::string> arg_list(nr_values);
+    for (unsigned int args = 0; args < nr_values; args++){
+      ss >> arg_list[args];
+    }
+
+    std::vector<char*> argv(nr_values);
+    for (unsigned int args = 0; args < nr_values; args++){
+      argv[args] = const_cast<char*>(arg_list[args].c_str());
+    }
+
+    // With zero values set_value() reads nothing and only marks the parameter as set.
+    m_parameter_list[m_number_of_parameters-1]->set_value(nr_values > 0 ? &argv[0] : 0);
+
+    return 0;
+  }
+
+  /**
+   *  Registers a new parameter.
+   *
+   *  @return 0 on success, -1 if a parameter with the same switch already exists
+   */
+  int ParameterParser::add_parameter(char com_switch,CommandLineParameterType type, unsigned int nr_values, const char* desc, bool required)
+  {
+    // Reject duplicates before anything is allocated.
+    for (int i = 0; i < m_number_of_parameters; i++){
+      if (m_parameter_list[i]->is_switch_equal_to(com_switch)){
+        std::cout << "ParameterParser: Attempt to add parameter twice " << com_switch << std::endl;
+        return -1;
+      }
+    }
+
+    // Make room first so a failed expansion cannot leak the new parameter.
+    if (m_number_of_parameters >= m_list_size) expand_list();
+
+    CommandLineParameter *p = new CommandLineParameter(com_switch, type, nr_values, desc, required);
+    m_parameter_list[m_number_of_parameters++] = p;
+
+    // Track the widest description and the largest value count.
+    const int desc_length = static_cast<int>(p->get_desc().length());
+    if (desc_length > m_max_desc_length){
+      m_max_desc_length = desc_length;
+    }
+    const int value_count = p->get_number_of_values();
+    if (value_count > m_max_number_values) {
+      m_max_number_values = value_count;
+    }
+    return 0;
+  }
+
+  /**
+   *  Parses a main()-style argument vector.
+   *
+   *  argv[0] is stored as the command name. Every following token must be a
+   *  switch of the form "-x", followed by as many values as that switch was
+   *  registered with. Problems are reported on std::cout and parsing resumes
+   *  at the next token that starts with '-'.
+   *
+   *  @return 0 if everything was understood, -1 otherwise
+   */
+  int ParameterParser::parse_parameter_list(int argc, char** argv)
+  {
+    if (argc < 1 || argv == 0 || argv[0] == 0){
+      return -1; // not even a command name to read
+    }
+
+    int a = 0;
+    int ret = 0;
+    m_command_name = std::string(argv[a++]);
+
+    while (a < argc){
+      if (argv[a][0] != '-'){
+        // Not a switch: report it and skip to the next candidate. Its second
+        // character must not be treated as a switch name.
+        std::cout << "ParameterParser: malformed argument list" << std::endl;
+        print_usage();
+        ret = -1;
+        while (a < argc && argv[a][0] != '-') a++;
+        continue;
+      }
+
+      const char current_switch = argv[a][1];
+      bool argument_found = false;
+      for (int i = 0; i < m_number_of_parameters; i++){
+        if (!m_parameter_list[i]->is_switch_equal_to(current_switch)) continue;
+
+        argument_found = true;
+        const int nr_values = m_parameter_list[i]->get_number_of_values();
+        if (nr_values <= argc-a-1){
+          m_parameter_list[i]->set_value((argv+a+1));
+          a += nr_values+1;
+        }
+        else{
+          // Too few tokens left for this switch.
+          std::cout << std::endl << "ParameterParser: malformed argument list: -" << current_switch << std::endl;
+          ret = -1;
+          a++;
+          while (a < argc && argv[a][0] != '-') a++;
+        }
+        break;
+      }
+
+      if (!argument_found){
+        std::cout << std::endl << "ParameterParser: unknown argument: -" << current_switch << std::endl;
+        ret = -1;
+        a++;
+        while (a < argc && argv[a][0] != '-') a++;
+      }
+    }
+    return ret;
+  }
+ 
+  /**
+   *  Prints a usage summary to std::cout: one line listing all switches,
+   *  then one line per parameter. The value count is shown when above one,
+   *  and the description when the parameter takes at least one value.
+   */
+  void ParameterParser::print_usage()
+  {
+    std::cout << "---------------------------------------------------- " << std::endl;
+    std::cout << "Usage: " << m_command_name << " -[";
+    for (int i = 0; i < m_number_of_parameters; i++){
+      std::cout << m_parameter_list[i]->get_switch();
+    }
+    std::cout << "]" << std::endl;
+ 
+    for (int i = 0; i < m_number_of_parameters; i++){
+      CommandLineParameter* p = m_parameter_list[i];
+      const int nr_values = p->get_number_of_values();
+
+      std::cout << " -" << p->get_switch() << " ";
+      // The count column exists only when some parameter takes several values.
+      if (m_max_number_values > 1){
+        if (nr_values > 1){
+          std::cout << nr_values << "x "; 
+        }
+        else{
+          std::cout << " "; 
+        }
+      }
+      if (nr_values > 0){
+        std::cout << "[" << p->get_desc() << "]";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "---------------------------------------------------- " << std::endl; 
+  }
+
+  /**
+   *  Prints the current state of every parameter to std::cout: switch,
+   *  description, then either its values or a note that it is missing or
+   *  not given. Optional parameters are tagged "(optional)" once per line.
+   */
+  void ParameterParser::print_parameter_list()
+  {
+    std::cout << "---------------------------------------------------- " << std::endl;
+    for (int i = 0; i < m_number_of_parameters; i++){
+      CommandLineParameter* p = m_parameter_list[i];
+
+      std::cout << " ";
+      std::cout << "(-" << p->get_switch() << ") ";
+      std::cout << std::setw(m_max_desc_length+2) << std::setiosflags(std::ios::left);
+      std::cout << p->get_desc() << ": ";
+      if (p->get_is_set()){
+        for (int j = 0; j < p->get_number_of_values(); j++){
+          std::cout << p->get_string_value(j) << " ";
+        }
+        // Tag the line once, after all values, rather than after each value.
+        if (!p->get_is_required()){
+          std::cout << "(optional)";
+        }
+      }
+      else{
+        if (p->get_is_required()){
+          std::cout << "(missing)";
+        }
+        else{
+          std::cout << "N/A (optional)";
+        }
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "---------------------------------------------------- " << std::endl;
+  }
+ 
+  /** True unless some parameter is required but has not been set. */
+  bool ParameterParser::all_required_parameters_set()
+  {
+    int index = 0;
+    while (index < m_number_of_parameters){
+      CommandLineParameter* candidate = m_parameter_list[index];
+      const bool missing = !candidate->get_is_set() && candidate->get_is_required();
+      if (missing){
+        return false;
+      }
+      ++index;
+    }
+    return true;
+  }
+
+  /**
+   *  Looks up the parameter registered for com_switch.
+   *  Throws std::runtime_error when no such parameter exists.
+   */
+  CommandLineParameter* ParameterParser::get_parameter(char com_switch)
+  {
+    CommandLineParameter* match = 0;
+    for (int index = 0; index < m_number_of_parameters && match == 0; index++){
+      if (m_parameter_list[index]->is_switch_equal_to(com_switch)){
+        match = m_parameter_list[index];
+      }
+    }
+    if (match != 0){
+      return match;
+    }
+
+    std::stringstream message;
+    message << "Parameter " << com_switch << " is undefined";
+    throw std::runtime_error(message.str());
+  }
+}
diff --git a/toolboxes/core/cpu/hostutils/parameterparser.h b/toolboxes/core/cpu/hostutils/parameterparser.h
new file mode 100644
index 0000000..7bb5e22
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/parameterparser.h
@@ -0,0 +1,81 @@
+#pragma once
+#include "hostutils_export.h"
+
+#include <vector>
+#include <string>
+
+namespace Gadgetron {
+
+  /** Kinds of value a command line switch can carry. */
+  enum CommandLineParameterType
+    {
+      COMMAND_LINE_STRING,  // value kept as text
+      COMMAND_LINE_INT,     // value parsed as a number
+      COMMAND_LINE_FLOAT,   // value parsed as a number
+      COMMAND_LINE_NO_ARG
+    };
+
+  /**
+   *  One command line switch together with the values assigned to it.
+   *  Each value is kept as a string, an int and a float.
+   */
+  class EXPORTHOSTUTILS CommandLineParameter
+  {
+  public:
+    /** Allocates storage for nr_values values; the parameter starts out unset. */
+    CommandLineParameter(char com_switch, CommandLineParameterType type, unsigned int nr_values, const char* desc, bool required);
+    ~CommandLineParameter();
+
+    /** True when com_switch is this parameter's switch character. */
+    bool is_switch_equal_to(char com_switch);
+
+    /** Reads the values from argv; returns a pointer just past the consumed entries. */
+    char** set_value(char** argv);
+
+    int get_number_of_values();
+    char get_switch();
+
+    /** Value i as text; a null pointer when i is out of range. */
+    const char* get_string_value(unsigned int i = 0);
+    /** Value i as int; 0 when i is out of range. */
+    int get_int_value(unsigned int i = 0);
+    /** Value i as float; 0.0f when i is out of range. */
+    float get_float_value(unsigned int i = 0);
+
+    bool get_is_set();
+    bool get_is_required();
+    std::string get_desc();
+
+  private:
+    // The class owns the raw arrays below, so a member-wise copy would free
+    // them twice. Copying is disabled: declared here, deliberately not defined.
+    CommandLineParameter(const CommandLineParameter&);
+    CommandLineParameter& operator=(const CommandLineParameter&);
+
+    CommandLineParameterType  m_type;
+    char                      m_switch;
+    unsigned int              m_nr_values;
+    std::string               m_desc;
+    bool                      m_is_set;
+    bool                      m_is_required;
+    int                      *m_int_value;
+    std::string              *m_string_value;
+    float                    *m_float_value;
+  };
+
+  /**
+   *  Small command line parser for single-character switches ("-x value ...").
+   *  Register parameters with add_parameter(), then call parse_parameter_list().
+   */
+  class EXPORTHOSTUTILS ParameterParser
+  {
+  public:
+    ParameterParser(int list_size = 10, int list_increment = 10);
+    ~ParameterParser();
+
+    /** Registers a parameter; returns 0, or -1 if the switch is already taken. */
+    int add_parameter(char com_switch,CommandLineParameterType type,  unsigned int nr_values, const char* desc, bool required);
+    /** As above, additionally assigning the whitespace separated defaults in def. */
+    int add_parameter(char com_switch,CommandLineParameterType type,  unsigned int nr_values, const char* desc, bool required, const char* def);
+
+    /** Parses a main()-style argument vector; returns 0 on success, -1 otherwise. */
+    int parse_parameter_list(int argc, char** argv);
+
+    /** Number of parameters registered so far. Defined inline here. */
+    int get_number_of_parameters() { return m_number_of_parameters; }
+
+    void print_usage();
+    void print_parameter_list();
+
+    /** True unless a required parameter has not been set. */
+    bool all_required_parameters_set();
+
+    /** Returns the parameter for com_switch; throws std::runtime_error if undefined. */
+    CommandLineParameter* get_parameter(char com_switch);
+
+  private:
+    // The parser owns m_parameter_list and its entries, so a member-wise copy
+    // would delete them twice. Copying is disabled: declared, deliberately not defined.
+    ParameterParser(const ParameterParser&);
+    ParameterParser& operator=(const ParameterParser&);
+
+    CommandLineParameter** m_parameter_list;
+    int m_number_of_parameters;
+    int m_list_size;
+    int m_list_increment;
+    int m_max_desc_length;
+    int m_max_number_values;
+    std::string m_command_name;
+    void expand_list();
+    void delete_list();
+  };
+}
diff --git a/toolboxes/core/cpu/hostutils/url_encode.h b/toolboxes/core/cpu/hostutils/url_encode.h
new file mode 100644
index 0000000..99f9048
--- /dev/null
+++ b/toolboxes/core/cpu/hostutils/url_encode.h
@@ -0,0 +1,47 @@
+#ifndef URLENCODE_H
+#define URLENCODE_H
+
+#include <iostream>
+
+namespace Gadgetron {
+
+/**
+   Simple utility function for escaping spaces and converting backslashes in URLs.
+   Every space becomes "%20" and every backslash becomes a forward slash; all
+   other characters are copied unchanged. An empty input yields an empty string.
+   This function is used in various places to ensure proper encoding of schemalocation URIs
+   
+*/
+inline std::string url_encode(const std::string& in) {
+	std::string out;
+	out.reserve(in.size()); // grows on its own if spaces expand the result
+
+	for (std::string::size_type i = 0; i < in.size(); ++i) {
+		const char c = in[i];
+		if (c == ' ') {
+			out += "%20";
+		} else if (c == '\\') {
+			out += '/';
+		} else {
+			out += c;
+		}
+	}
+
+	return out;
+}
+}
+
+#endif //URLENCODE_H
diff --git a/toolboxes/core/gpu/CMakeLists.txt b/toolboxes/core/gpu/CMakeLists.txt
new file mode 100644
index 0000000..740091f
--- /dev/null
+++ b/toolboxes/core/gpu/CMakeLists.txt
@@ -0,0 +1,86 @@
+# On Windows, define __BUILD_GADGETRON_GPUCORE__ while compiling this directory
+# (presumably read by gpucore_export.h to select dllexport -- confirm).
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUCORE__)
+endif (WIN32)
+
+# On Windows, add the Boost library directories to the linker search path.
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+# CULA is mandatory here: REQUIRED makes configuration fail if it is not found.
+find_package(CULA REQUIRED)
+
+# Header search paths: CUDA, CULA and the core toolbox directories.
+include_directories( 
+  ${CUDA_INCLUDE_DIRS}
+  ${CULA_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+)
+
+# The gpucore shared library. Headers are listed next to the .cpp/.cu sources.
+cuda_add_library(gpucore SHARED 
+    check_CUDA.h
+    CUBLASContextProvider.h
+    cudaDeviceManager.h
+    cuNDArray.h
+    cuNDArray_blas.h
+    cuNDArray_elemwise.h
+    cuNDArray_operators.h
+    cuNDArray_utils.h
+    cuNDArray_reductions.h
+    cuNDFFT.h
+    cuNDFFT.cpp
+    GadgetronCuException.h
+    gpucore_export.h
+    GPUTimer.h
+    hoCuNDArray.h
+    hoCuNDArray_blas.h
+    hoCuNDArray_elemwise.h
+    hoCuNDArray_operators.h
+    hoCuNDArray_utils.h
+    radial_utilities.h
+    real_utilities_device.h
+    setup_grid.h
+    cuNDArray_operators.cu
+    cuNDArray_elemwise.cu
+    cuNDArray_blas.cu
+    cuNDArray_utils.cu
+    cuNDArray_reductions.cu
+    radial_utilities.cu
+    hoCuNDArray_blas.cu
+    CUBLASContextProvider.cpp
+    cudaDeviceManager.cpp
+  )
+
+# Link against cpucore, Boost, the CUDA runtime, cuFFT, cuBLAS and CULA.
+target_link_libraries(gpucore cpucore 
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  ${CUDA_CUBLAS_LIBRARIES} 
+  ${CULA_LIBRARIES}
+  )
+
+# Install the library into <prefix>/lib.
+install(TARGETS gpucore DESTINATION lib)
+
+# Public headers installed into <prefix>/include.
+# NOTE(review): cuNDArray_math.h and hoCuNDArray_math.h are installed here but do
+# not appear in the cuda_add_library() source list above -- confirm they exist.
+install(FILES
+  gpucore_export.h
+  cuNDArray.h
+  cuNDArray_operators.h
+  cuNDArray_elemwise.h
+  cuNDArray_blas.h
+  cuNDArray_utils.h
+  cuNDArray_math.h
+  cuNDArray_reductions.h
+  hoCuNDArray.h
+  hoCuNDArray_blas.h
+  hoCuNDArray_operators.h
+  hoCuNDArray_elemwise.h
+  hoCuNDArray_utils.h
+  hoCuNDArray_math.h
+  GPUTimer.h				
+  cuNDFFT.h
+  GadgetronCuException.h
+  radial_utilities.h
+  real_utilities_device.h
+  check_CUDA.h
+  cudaDeviceManager.h
+  CUBLASContextProvider.h
+  setup_grid.h
+  DESTINATION include)
diff --git a/toolboxes/core/gpu/CUBLASContextProvider.cpp b/toolboxes/core/gpu/CUBLASContextProvider.cpp
new file mode 100644
index 0000000..4d1e38e
--- /dev/null
+++ b/toolboxes/core/gpu/CUBLASContextProvider.cpp
@@ -0,0 +1,113 @@
+/*
+ * CUBLASContextProvider.cpp
+ *
+ *  Created on: Mar 22, 2012
+ *      Author: Michael S. Hansen
+ */
+
+#include "CUBLASContextProvider.h"
+#include <cuda_runtime_api.h>
+#include <cula_lapack_device.h>
+
+
+/**
+ * Returns the shared provider object, creating it on the first call.
+ * No locking is done around the creation.
+ */
+CUBLASContextProvider* CUBLASContextProvider::instance()
+{
+	if (instance_ == 0) {
+		instance_ = new CUBLASContextProvider();
+	}
+	return instance_;
+}
+
+/**
+ * For every cached handle: select its device, shut CULA down and destroy the
+ * cuBLAS handle. A failing device switch is reported and otherwise ignored.
+ */
+CUBLASContextProvider::~CUBLASContextProvider()
+{
+	typedef std::map<int, cublasHandle_t>::iterator handle_iterator;
+
+	for (handle_iterator entry = handles_.begin(); entry != handles_.end(); ++entry) {
+		const cudaError_t status = cudaSetDevice(entry->first);
+		if (status != cudaSuccess) {
+		    std::cerr << "Error: unable to set CUDA device." << std::endl;
+		}
+		culaShutdown();
+
+		cublasDestroy_v2(entry->second);
+	}
+}
+
+/**
+ * Returns the cuBLAS handle for device_no, creating and caching it on first use.
+ *
+ * Creating a handle temporarily selects device_no, creates the handle,
+ * initializes CULA and then re-selects the device that was active on entry.
+ * That device is restored on the failure paths as well.
+ *
+ * @param device_no CUDA device index; must be in [0, number of devices)
+ * @return pointer to the cached handle, or 0 on any failure (reported on std::cerr)
+ */
+cublasHandle_t* CUBLASContextProvider::getCublasHandle(int device_no)
+{
+	//Let's see if we have the handle already:
+	std::map<int, cublasHandle_t>::iterator it = handles_.find(device_no);
+	if (it != handles_.end()) {
+		return &it->second;
+	}
+
+	//We don't have the handle yet, let's check if it makes sense to create one
+
+	int number_of_devices = 0;
+	if (cudaGetDeviceCount(&number_of_devices)!= cudaSuccess) {
+		std::cerr << "Error: unable to query number of CUDA devices.\n" << std::endl;
+		return 0;
+	}
+
+	if (number_of_devices == 0) {
+		std::cerr << "Error: No available CUDA devices.\n" << std::endl;
+		return 0;
+	}
+
+	if (device_no < 0) {
+		std::cerr << "Requested device number is negative." << std::endl;
+		return 0;
+	}
+
+	if (device_no >= number_of_devices) {
+		std::cerr << "Requested device number exceeds number of devices." << std::endl;
+		return 0;
+	}
+
+	//OK, so we are OK to create the handle. Before we do that, let's capture the current cuda device.
+
+	int current_device_no;
+	if (cudaGetDevice(&current_device_no)!= cudaSuccess) {
+		std::cerr << "Error: unable to get current CUDA device.\n" << std::endl;
+		return 0;
+	}
+
+	const bool switch_device = (current_device_no != device_no);
+	if (switch_device) {
+		//We must switch context
+		if (cudaSetDevice(device_no)!= cudaSuccess) {
+			std::cerr << "Error: unable to set CUDA device." << std::endl;
+			return 0;
+		}
+	}
+
+	bool created = true;
+	cublasHandle_t handle; // this is a struct pointer
+
+	if (cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS) {
+		std::cerr << "CUBLASContextProvider: unable to create cublas handle\n" << std::endl;
+		created = false;
+	} else {
+		// As before, the handle is cached as soon as it exists, even if the
+		// CULA initialization below fails.
+		handles_[device_no] = handle;
+
+		if (culaInitialize() != culaNoError) {
+			std::cerr << "CUBLASContextProvider: failed to initialize CULA" << std::endl;
+			created = false;
+		}
+	}
+
+	if (switch_device) {
+		//We must switch context back, also when something above failed
+		if (cudaSetDevice(current_device_no)!= cudaSuccess) {
+			std::cerr << "Error: unable to set CUDA device.\n" << std::endl;
+			return 0;
+		}
+	}
+
+	if (!created) {
+		return 0;
+	}
+
+	return &handles_[device_no];
+}
+
+
+CUBLASContextProvider* CUBLASContextProvider::instance_ = 0;
+
diff --git a/toolboxes/core/gpu/CUBLASContextProvider.h b/toolboxes/core/gpu/CUBLASContextProvider.h
new file mode 100644
index 0000000..27c62cf
--- /dev/null
+++ b/toolboxes/core/gpu/CUBLASContextProvider.h
@@ -0,0 +1,35 @@
+/*
+ * CUBLASContextProvider.h
+ *
+ *  Created on: Mar 22, 2012
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef CUBLASCONTEXTPROVIDER_H_
+#define CUBLASCONTEXTPROVIDER_H_
+#pragma once
+
+#include "gpucore_export.h"
+
+#include <cublas_v2.h>
+#include <map>
+#include <iostream>
+
+/**
+ * Hands out cuBLAS handles, one per CUDA device, through a single shared
+ * instance obtained from instance().
+ */
+class EXPORTGPUCORE CUBLASContextProvider
+{
+
+public:
+	/** Access to the shared provider object. */
+	static CUBLASContextProvider* instance();
+
+	/** cuBLAS handle for the given device; see the implementation for failure handling. */
+	cublasHandle_t* getCublasHandle(int device_no = 0);
+
+private:
+	// Construction and destruction are private; objects come from instance().
+	CUBLASContextProvider() {}
+	virtual ~CUBLASContextProvider();
+
+	// Handles created so far, keyed by device number.
+	std::map<int, cublasHandle_t> handles_;
+
+	static CUBLASContextProvider* instance_;
+};
+
+#endif /* CUBLASCONTEXTPROVIDER_H_ */
diff --git a/toolboxes/core/gpu/GPUTimer.h b/toolboxes/core/gpu/GPUTimer.h
new file mode 100644
index 0000000..16128cd
--- /dev/null
+++ b/toolboxes/core/gpu/GPUTimer.h
@@ -0,0 +1,75 @@
+/** \file GPUTimer.h
+    \brief Utility to measure CUDA performance.
+*/
+
+#ifndef __GPUTIMER_H
+#define __GPUTIMER_H
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <cuda_runtime_api.h>
+
+namespace Gadgetron{
+
+  /**
+   *  Measures elapsed GPU time between start() and stop() using CUDA events
+   *  and prints "<name>: <time> ms" to std::cout.
+   *
+   *  By default the timer starts on construction and reports on destruction.
+   *  stop() does nothing unless a measurement is running, so an explicit
+   *  stop() followed by destruction reports only once.
+   */
+  class GPUTimer
+  {
+  public:
+
+    /** Starts timing immediately under the name "GPUTimer". */
+    GPUTimer() : name_("GPUTimer"), timing_in_destruction_(true), started_(false)
+    {
+      start();
+    }
+
+    /** Starts timing immediately only if timing is true. */
+    GPUTimer(bool timing) : name_("GPUTimer"), timing_in_destruction_(timing), started_(false)
+    {
+      if ( timing_in_destruction_ )
+        {
+          start();
+        }
+    }
+
+    /** Starts timing immediately under the given name. */
+    GPUTimer(const char* name) : name_(name), timing_in_destruction_(true), started_(false)
+    {
+      start();
+    }
+
+    virtual ~GPUTimer() 
+    {
+      if ( timing_in_destruction_ )
+        {
+          stop();
+        }
+    }
+
+    /** Begins a measurement; a measurement already running is discarded. */
+    virtual void start()
+    {
+      if ( started_ )
+        {
+          // Release the previous pair of events before creating new ones.
+          cudaEventDestroy( start_event_ );
+          cudaEventDestroy( stop_event_ );
+        }
+      cudaEventCreate(&start_event_);
+      cudaEventCreate(&stop_event_);
+      cudaEventRecord( start_event_, 0 );
+      started_ = true;
+    }
+
+    /** Ends the running measurement and prints it; no-op if none is running. */
+    virtual void stop()
+    {
+      if ( !started_ )
+        {
+          return; // the events are not valid, nothing to report
+        }
+
+      float time = 0.0f;
+      cudaEventRecord( stop_event_, 0 );
+      cudaEventSynchronize( stop_event_ );
+      cudaEventElapsedTime( &time, start_event_, stop_event_ );
+      cudaEventDestroy( start_event_ );
+      cudaEventDestroy( stop_event_ );
+      started_ = false;
+
+      std::cout << name_ << ": " << time << " ms" << std::endl; std::cout.flush();
+    }
+
+    void set_timing_in_destruction(bool timing) { timing_in_destruction_ = timing; }
+
+    cudaEvent_t start_event_;
+    cudaEvent_t stop_event_;
+
+    std::string name_;
+    bool timing_in_destruction_;
+
+    // True while start_event_/stop_event_ hold live events.
+    bool started_;
+  };
+}
+
+#endif //__GPUTIMER_H
diff --git a/toolboxes/core/gpu/GadgetronCuException.h b/toolboxes/core/gpu/GadgetronCuException.h
new file mode 100644
index 0000000..dfa4cfa
--- /dev/null
+++ b/toolboxes/core/gpu/GadgetronCuException.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cuda_runtime_api.h>
+#include <stdexcept>
+
+namespace Gadgetron{
+  
+  /** Exception type for CUDA failures; a std::runtime_error carrying a message. */
+  class cuda_error : public std::runtime_error
+  {
+  public:
+    /** Error with an explicit message. */
+    cuda_error(std::string msg)
+      : std::runtime_error(msg)
+    {
+    }
+
+    /** Error whose message is cudaGetErrorString(errN). */
+    cuda_error(cudaError_t errN)
+      : std::runtime_error(cudaGetErrorString(errN))
+    {
+    }
+  };
+}
diff --git a/toolboxes/core/gpu/check_CUDA.h b/toolboxes/core/gpu/check_CUDA.h
new file mode 100644
index 0000000..d589804
--- /dev/null
+++ b/toolboxes/core/gpu/check_CUDA.h
@@ -0,0 +1,38 @@
+/** \file check_CUDA.h
+    \brief Macros to check whether GPU-based code has caused any errors and, if so, throw a runtime exception accordingly.
+*/
+
+#pragma once
+
+#include "GadgetronCuException.h"
+
+namespace Gadgetron {
+
+  /**
+   *  Implementation behind the CHECK_FOR_CUDA_ERROR(); macro. Do not call it
+   *  directly, use CHECK_FOR_CUDA_ERROR(); instead.
+   *  Throws cuda_error if cudaGetLastError() reports an error. When DEBUG is
+   *  defined, the device is also synchronized and the error state is checked
+   *  a second time. The three parameters are currently not used.
+   *  inspired by cutil.h: CUT_CHECK_ERROR
+   */
+  inline void CHECK_FOR_CUDA_ERROR(char const * cur_fun, const char* file, const int line) {
+    cudaError_t errorCode = cudaGetLastError();
+    if (errorCode != cudaSuccess) {
+      throw cuda_error(errorCode);
+    }
+#ifdef DEBUG
+    // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
+    cudaDeviceSynchronize();
+    errorCode = cudaGetLastError();
+    if (errorCode != cudaSuccess) {
+      throw cuda_error(errorCode);
+    }
+#endif
+  }
+}
+
+/**
+ *  Checks for CUDA errors and throws an exception if an error was detected.
+ *  Use as a statement: CHECK_FOR_CUDA_ERROR();
+ *  The expansion no longer starts with an empty statement, so the check stays
+ *  attached to a preceding if/for/while. The call is fully qualified so the
+ *  macro also works outside namespace Gadgetron. BOOST_CURRENT_FUNCTION must
+ *  be available where the macro is used.
+ */
+#define CHECK_FOR_CUDA_ERROR() ::Gadgetron::CHECK_FOR_CUDA_ERROR(BOOST_CURRENT_FUNCTION,__FILE__,__LINE__);
+
+/**
+ *  Evaluates "res" once and throws Gadgetron::cuda_error if the result is not cudaSuccess.
+ *  The exception type is fully qualified so the macro works outside namespace
+ *  Gadgetron. The internal variable has a reserved-looking name so it cannot
+ *  hide a variable used inside "res".
+ */
+#define CUDA_CALL(res) {cudaError_t gadgetron_cuda_call_status_ = (res); if (gadgetron_cuda_call_status_ != cudaSuccess) { throw ::Gadgetron::cuda_error(gadgetron_cuda_call_status_); }}
diff --git a/toolboxes/core/gpu/cuNDArray.h b/toolboxes/core/gpu/cuNDArray.h
new file mode 100644
index 0000000..143e4a1
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray.h
@@ -0,0 +1,701 @@
+/** \file cuNDArray.h
+    \brief GPU-based N-dimensional array (data container)
+*/
+
+#ifndef CUNDARRAY_H
+#define CUNDARRAY_H
+#pragma once
+
+#include "NDArray.h"
+#include "hoNDArray.h"
+#include "complext.h"
+#include "GadgetronCuException.h"
+#include "check_CUDA.h"
+
+#include <boost/shared_ptr.hpp>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <thrust/device_vector.h>
+
+namespace Gadgetron{
+
+  template <class T> class cuNDArray : public NDArray<T>
+  {
+
+  public:
+
+    // Constructors
+    //
+
+    /** Default-constructs the base and stores the active CUDA device in device_. */
+    cuNDArray()
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+    }
+
+    /**
+     * Copy constructor. Allocates on the currently active device and copies
+     * the contents of a. If a lives on another device, the data is staged
+     * through a host copy obtained with a.to_host().
+     *
+     * @throws cuda_error if the copy fails. The freshly allocated buffer is
+     *         released first on both paths.
+     */
+    cuNDArray(const cuNDArray<T> &a) : NDArray<T>::NDArray() 
+    {
+      cudaGetDevice(&this->device_);
+      this->data_ = 0;
+      this->dimensions_ = a.get_dimensions();
+      allocate_memory();
+
+      cudaError_t err;
+      if (a.device_ == this->device_) {
+        err = cudaMemcpy(this->data_, a.data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice);
+      } else {
+        //This memory is on a different device, we must move it.
+        cudaSetDevice(a.device_);
+        boost::shared_ptr< hoNDArray<T> > tmp = a.to_host();
+        cudaSetDevice(this->device_);
+        err = cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice);
+      }
+
+      if (err !=cudaSuccess) {
+        // The destructor does not run when a constructor throws, so clean up here.
+        deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_->clear();
+        throw cuda_error(err);
+      }
+    }
+
+    /**
+     * Copy construction from a pointer. Allocates on the currently active
+     * device and copies the contents of *a. If *a lives on another device,
+     * the data is staged through a host copy obtained with a->to_host().
+     *
+     * @throws std::runtime_error if a is null
+     * @throws cuda_error if the copy fails. The freshly allocated buffer is
+     *         released first on both paths.
+     */
+    cuNDArray(const cuNDArray<T> *a) : NDArray<T>::NDArray() 
+    {
+      if (a == 0) {
+        throw std::runtime_error("cuNDArray: cannot copy-construct from a null pointer");
+      }
+      cudaGetDevice(&this->device_);
+      this->data_ = 0;
+      this->dimensions_ = a->get_dimensions();
+      allocate_memory();
+
+      cudaError_t err;
+      if (a->device_ == this->device_) {
+        err = cudaMemcpy(this->data_, a->data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice);
+      } else {
+        //This memory is on a different device, we must move it.
+        cudaSetDevice(a->device_);
+        boost::shared_ptr< hoNDArray<T> > tmp = a->to_host();
+        cudaSetDevice(this->device_);
+        err = cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice);
+      }
+
+      if (err !=cudaSuccess) {
+        // The destructor does not run when a constructor throws, so clean up here.
+        deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_->clear();
+        throw cuda_error(err);
+      }
+    }
+
+    /**
+     * Constructs a device array holding a copy of the host array a.
+     *
+     * @throws cuda_error if the host-to-device copy fails. The device buffer
+     *         is released first. Previously this failure was silent.
+     */
+    cuNDArray(const hoNDArray<T> &a) : NDArray<T>::NDArray() 
+    {
+      cudaGetDevice(&this->device_);
+      this->dimensions_ = a.get_dimensions();
+      allocate_memory();
+      const cudaError_t err = cudaMemcpy(this->data_, a.get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice);
+      if (err != cudaSuccess) {
+        deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_->clear();
+        // Report the failure like the device-to-device constructors do.
+        throw cuda_error(err);
+      }
+    }
+
+    /**
+     * Constructs a device array holding a copy of the host array *a.
+     *
+     * @throws std::runtime_error if a is null
+     * @throws cuda_error if the host-to-device copy fails. The device buffer
+     *         is released first. Previously this failure was silent.
+     */
+    cuNDArray(hoNDArray<T> *a) : NDArray<T>::NDArray() 
+    {
+      if (a == 0) {
+        throw std::runtime_error("cuNDArray: cannot construct from a null host array pointer");
+      }
+      cudaGetDevice(&this->device_);
+      this->dimensions_ = a->get_dimensions();
+      allocate_memory();
+      const cudaError_t err = cudaMemcpy(this->data_, a->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice);
+      if (err != cudaSuccess) {
+        deallocate_memory();
+        this->data_ = 0;
+        this->dimensions_->clear();
+        // Report the failure like the device-to-device constructors do.
+        throw cuda_error(err);
+      }
+    }
+
+    /** Stores the active device in device_, then forwards the extents to create(). */
+    cuNDArray(std::vector<size_t> *dimensions)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      create(dimensions);
+    }
+
+    /** Stores the active device in device_, then forwards extents and device_no to create(). */
+    cuNDArray(std::vector<size_t> *dimensions, int device_no)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      create(dimensions, device_no);
+    }
+
+    /** Stores the active device in device_, then forwards extents, data and the ownership flag to create(). */
+    cuNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      create(dimensions, data, delete_data_on_destruct);
+    }
+
+    /** Stores the active device in device_, then forwards the extents to create(). */
+    cuNDArray(std::vector<size_t> &dimensions)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      create(dimensions);
+    }
+
+    /** Stores the active device in device_, then forwards extents and device_no to create(). */
+    cuNDArray(std::vector<size_t> &dimensions, int device_no)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      create(dimensions, device_no);
+    }
+
+    /** Stores the active device in device_, then forwards extents, data and the ownership flag to create(). */
+    cuNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      create(dimensions, data, delete_data_on_destruct);
+    }
+
+    /** Stores the active device in device_, then forwards the pointed-to extents to create(). */
+    cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      std::vector<size_t> *extents = dimensions.get();
+      create(extents);
+    }
+
+    /** Stores the active device in device_, then forwards the pointed-to extents and device_no to create(). */
+    cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions, int device_no)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      std::vector<size_t> *extents = dimensions.get();
+      create(extents, device_no);
+    }
+
+    /** Stores the active device in device_, then forwards the pointed-to extents, data and the ownership flag to create(). */
+    cuNDArray(boost::shared_ptr<std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false)
+      : NDArray<T>()
+    {
+      cudaGetDevice(&this->device_);
+      std::vector<size_t> *extents = dimensions.get();
+      create(extents, data, delete_data_on_destruct);
+    }
+
+    /** Passes the single extent len to create(). */
+    cuNDArray(size_t len)
+    {
+      cudaGetDevice(&this->device_);
+      std::vector<size_t> dim(1, len);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy) to create(). */
+    cuNDArray(size_t sx, size_t sy)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy };
+      std::vector<size_t> dim(extents, extents + 2);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy, sz) to create(). */
+    cuNDArray(size_t sx, size_t sy, size_t sz)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy, sz };
+      std::vector<size_t> dim(extents, extents + 3);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy, sz, st) to create(). */
+    cuNDArray(size_t sx, size_t sy, size_t sz, size_t st)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy, sz, st };
+      std::vector<size_t> dim(extents, extents + 4);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy, sz, st, sp) to create(). */
+    cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy, sz, st, sp };
+      std::vector<size_t> dim(extents, extents + 5);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy, sz, st, sp, sq) to create(). */
+    cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy, sz, st, sp, sq };
+      std::vector<size_t> dim(extents, extents + 6);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy, sz, st, sp, sq, sr) to create(). */
+    cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy, sz, st, sp, sq, sr };
+      std::vector<size_t> dim(extents, extents + 7);
+      create(dim);
+    }
+
+    /** Passes the extents (sx, sy, sz, st, sp, sq, sr, ss) to create(). */
+    cuNDArray(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss)
+    {
+      cudaGetDevice(&this->device_);
+      const size_t extents[] = { sx, sy, sz, st, sp, sq, sr, ss };
+      std::vector<size_t> dim(extents, extents + 8);
+      create(dim);
+    }
+
+    // Destructor
+    /** Releases the buffer via deallocate_memory() only if delete_data_on_destruct_ is set. */
+    virtual ~cuNDArray()
+    {
+      if (!this->delete_data_on_destruct_) {
+        return;
+      }
+      deallocate_memory();
+    }
+
+    // Assignment operator
+    /**
+     * Assignment from another device array.
+     *
+     * If the dimensions match and both arrays live on the active device, this
+     * is a plain device-to-device copy. Otherwise the device owning this array
+     * is selected, the buffer is reallocated if the dimensions differ, and the
+     * data is copied, staged through host memory when rhs is on another device.
+     * The device that was active on entry is active again on return, including
+     * when an exception is thrown after the device was switched.
+     *
+     * @throws std::runtime_error if the dimensions differ and this array does
+     *         not own its buffer
+     * @throws cuda_error on CUDA failures
+     */
+    cuNDArray<T>& operator=(const cuNDArray<T>& rhs)
+    {
+      int cur_device; 
+      CUDA_CALL(cudaGetDevice(&cur_device));
+      bool dimensions_match = this->dimensions_equal(&rhs);
+      if (dimensions_match && (rhs.device_ == cur_device) && (cur_device == this->device_)) {
+        CUDA_CALL(cudaMemcpy(this->data_, rhs.data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice));
+      }
+      else {
+        CUDA_CALL(cudaSetDevice(this->device_));
+        if( !dimensions_match ){
+          if(!this->delete_data_on_destruct_){
+            // Put the caller's device back before reporting, as the other error paths do.
+            cudaSetDevice(cur_device);
+            throw std::runtime_error("Array dimensions mismatch in cuNDArray::operator=. Cannot change dimensions of non-destructable array.");
+          }
+          deallocate_memory();
+          this->elements_ = rhs.elements_;
+          this->dimensions_ = rhs.get_dimensions();
+          allocate_memory();
+        }
+        if (this->device_ == rhs.device_) {
+          if (cudaMemcpy(this->data_, rhs.data_, this->elements_*sizeof(T), cudaMemcpyDeviceToDevice) !=cudaSuccess) {	    
+            cudaSetDevice(cur_device);
+            throw cuda_error("cuNDArray::operator=: failed to copy data (2)");
+          }
+        } else {
+          // Different devices: stage the data through a host copy of rhs.
+          if( cudaSetDevice(rhs.device_) != cudaSuccess) {
+            cudaSetDevice(cur_device);
+            throw cuda_error("cuNDArray::operator=: unable to set device no (2)");
+          }
+          boost::shared_ptr< hoNDArray<T> > tmp = rhs.to_host();
+          if( cudaSetDevice(this->device_) != cudaSuccess) {
+            cudaSetDevice(cur_device);
+            throw cuda_error("cuNDArray::operator=: unable to set device no (3)");
+          }
+          if (cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess) {
+            cudaSetDevice(cur_device);
+            throw cuda_error("cuNDArray::operator=: failed to copy data (3)");
+          }
+        }
+        if( cudaSetDevice(cur_device) != cudaSuccess) {
+          throw cuda_error("cuNDArray::operator=: unable to restore to current device");
+        }
+      }
+      return *this;
+    }
+
+    cuNDArray<T>& operator=(const hoNDArray<T>& rhs)
+    { // Uploads the host array rhs; reallocates first on a shape mismatch (owned buffers only).
+      int cur_device;
+      CUDA_CALL(cudaGetDevice(&cur_device));
+      bool dimensions_match = this->dimensions_equal(&rhs);
+      if (dimensions_match && (cur_device == this->device_)) {
+        CUDA_CALL(cudaMemcpy(this->get_data_ptr(), rhs.get_data_ptr(), this->get_number_of_elements()*sizeof(T), cudaMemcpyHostToDevice));
+        return *this;
+      }
+      if( !dimensions_match && !this->delete_data_on_destruct_ ) // rejected before switching device: nothing to undo
+        throw std::runtime_error("Array dimensions mismatch in cuNDArray::operator=. Cannot change dimensions of non-destructable array.");
+      try {
+        CUDA_CALL(cudaSetDevice(this->device_));
+        if( !dimensions_match ){
+          deallocate_memory();
+          this->elements_ = rhs.get_number_of_elements();
+          this->dimensions_ = rhs.get_dimensions();
+          allocate_memory();
+        }
+        if (cudaMemcpy(this->get_data_ptr(), rhs.get_data_ptr(), this->get_number_of_elements()*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess)
+          throw cuda_error("cuNDArray::operator=: failed to copy data (1)");
+      } catch (...) {
+        cudaSetDevice(cur_device); // best effort: every failure path hands back the caller's device
+        throw;
+      }
+      if( cudaSetDevice(cur_device) != cudaSuccess) throw cuda_error("cuNDArray::operator=: unable to restore to current device");
+      return *this;
+    }
+
+    virtual void create(std::vector<size_t> *dimensions)
+    {
+      if ( this->dimensions_equal(dimensions) )
+        {
+          return;
+        }
+
+      return NDArray<T>::create(dimensions);
+    }
+
+    virtual void create(std::vector<size_t> *dimensions, int device_no)
+    {
+      if (device_no < 0){
+        throw cuda_error("cuNDArray::create: illegal device no");
+      }
+
+      if ( this->dimensions_equal(dimensions) && this->device_==device_no )
+        {
+          return;
+        }
+
+      this->device_ = device_no; 
+      NDArray<T>::create(dimensions);
+    }
+
+    virtual void create(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false)
+    {
+      if (!data) {
+        throw std::runtime_error("cuNDArray::create: 0x0 pointer provided");
+      }
+
+      int tmp_device; 
+      if( cudaGetDevice(&tmp_device) != cudaSuccess) {
+        throw cuda_error("cuNDArray::create: Unable to query for device");
+      }
+
+      cudaDeviceProp deviceProp; 
+      if( cudaGetDeviceProperties( &deviceProp, tmp_device) != cudaSuccess) {
+        throw cuda_error("cuNDArray::create: Unable to query device properties");
+      }
+
+      if (deviceProp.unifiedAddressing) {
+        cudaPointerAttributes attrib;
+        if (cudaPointerGetAttributes(&attrib, data) != cudaSuccess) {
+          CHECK_FOR_CUDA_ERROR();
+          throw cuda_error("cuNDArray::create: Unable to determine attributes of pointer");
+        }
+        this->device_ = attrib.device;
+      } else {
+        this->device_ = tmp_device;
+      }
+
+      NDArray<T>::create(dimensions, data, delete_data_on_destruct);
+    }
+
+    virtual void create(std::vector<size_t> &dimensions)
+    {
+      if ( this->dimensions_equal(&dimensions) )
+        {
+          return;
+        }
+
+      return NDArray<T>::create(dimensions);
+    }
+
+    virtual void create(std::vector<size_t> &dimensions, int device_no)
+    {
+      if (device_no < 0){
+        throw cuda_error("cuNDArray::create: illegal device no");
+      }
+
+      if ( this->dimensions_equal(&dimensions) && this->device_==device_no )
+        {
+          return;
+        }
+
+      this->device_ = device_no; 
+      NDArray<T>::create(dimensions);
+    }
+
+    virtual void create(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false)
+    {
+      if (!data) {
+        throw std::runtime_error("cuNDArray::create: 0x0 pointer provided");
+      }
+
+      int tmp_device; 
+      if( cudaGetDevice(&tmp_device) != cudaSuccess) {
+        throw cuda_error("cuNDArray::create: Unable to query for device");
+      }
+
+      cudaDeviceProp deviceProp;
+      if( cudaGetDeviceProperties( &deviceProp, tmp_device) != cudaSuccess) {
+        throw cuda_error("cuNDArray::create: Unable to query device properties");
+      }
+
+      if (deviceProp.unifiedAddressing) {
+        cudaPointerAttributes attrib;
+        if (cudaPointerGetAttributes(&attrib, data) != cudaSuccess) {
+          CHECK_FOR_CUDA_ERROR();
+          throw cuda_error("cuNDArray::create: Unable to determine attributes of pointer");
+        }
+        this->device_ = attrib.device;
+      } else {
+        this->device_ = tmp_device;
+      }
+
+      NDArray<T>::create(dimensions, data, delete_data_on_destruct);
+    }
+
+    virtual void create(boost::shared_ptr<std::vector<size_t> > dimensions){
+      this->create(dimensions.get());
+    }
+
+    virtual void create(boost::shared_ptr<std::vector<size_t> > dimensions, int device_no){
+      this->create(dimensions.get(),device_no);
+    }
+
+    virtual void create(boost::shared_ptr<std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false){
+      this->create(dimensions.get(), data, delete_data_on_destruct);
+    }
+
+    virtual void create(size_t len)
+    {
+      std::vector<size_t> dim(1);
+      dim[0] = len;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy)
+    {
+      std::vector<size_t> dim(2);
+      dim[0] = sx;
+      dim[1] = sy;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy, size_t sz)
+    {
+      std::vector<size_t> dim(3);
+      dim[0] = sx;
+      dim[1] = sy;
+      dim[2] = sz;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st)
+    {
+      std::vector<size_t> dim(4);
+      dim[0] = sx;
+      dim[1] = sy;
+      dim[2] = sz;
+      dim[3] = st;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp)
+    {
+      std::vector<size_t> dim(5);
+      dim[0] = sx;
+      dim[1] = sy;
+      dim[2] = sz;
+      dim[3] = st;
+      dim[4] = sp;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq)
+    {
+      std::vector<size_t> dim(6);
+      dim[0] = sx;
+      dim[1] = sy;
+      dim[2] = sz;
+      dim[3] = st;
+      dim[4] = sp;
+      dim[5] = sq;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr)
+    {
+      std::vector<size_t> dim(7);
+      dim[0] = sx;
+      dim[1] = sy;
+      dim[2] = sz;
+      dim[3] = st;
+      dim[4] = sp;
+      dim[5] = sq;
+      dim[6] = sr;
+      this->create(dim);
+    }
+
+    virtual void create(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss)
+    {
+      std::vector<size_t> dim(8);
+      dim[0] = sx;
+      dim[1] = sy;
+      dim[2] = sz;
+      dim[3] = st;
+      dim[4] = sp;
+      dim[5] = sq;
+      dim[6] = sr;
+      dim[7] = ss;
+      this->create(dim);
+    }
+
+    virtual boost::shared_ptr< hoNDArray<T> > to_host() const
+    {
+      boost::shared_ptr< hoNDArray<T> > ret(new hoNDArray<T>(this->dimensions_.get()));
+      if (cudaMemcpy(ret->get_data_ptr(), this->data_, this->elements_*sizeof(T), cudaMemcpyDeviceToHost) != cudaSuccess) {
+        throw cuda_error("cuNDArray::to_host(): failed to copy memory from device");
+      }
+
+      return ret;
+    }
+
+    // Copies the device data into *out; out is re-created with this array's dimensions when its element count differs.
+    virtual void to_host( hoNDArray<T> *out ) const 
+    {
+      if( !out ){
+        throw std::runtime_error("cuNDArray::to_host(): illegal array passed.");
+      }
+      // Only the element count is compared: an equally sized array with another shape is not re-created.
+      if( out->get_number_of_elements() != this->get_number_of_elements() ){	
+        out->create( this->get_dimensions().get());
+      }
+      if( cudaMemcpy( out->get_data_ptr(), this->data_, this->elements_*sizeof(T), cudaMemcpyDeviceToHost) != cudaSuccess) {
+        throw cuda_error("cuNDArray::to_host(): failed to copy memory from device");
+      }
+    }
+
+    // Moves the array to `device`: the data is staged through a host copy, freed, and re-allocated on the new device.
+    virtual void set_device(int device)
+    {
+      if( device_ == device )
+        return;
+
+      int cur_device;
+      if( cudaGetDevice(&cur_device) != cudaSuccess) {
+        throw cuda_error("cuNDArray::set_device: unable to get device no");
+      }
+      // Select the array's present device before the download below.
+      if( cur_device != device_ && cudaSetDevice(device_) != cudaSuccess) {
+        throw cuda_error("cuNDArray::set_device: unable to set device no");
+      }
+      boost::shared_ptr< hoNDArray<T> > tmp = to_host();
+      deallocate_memory(); // NOTE(review): frees the buffer even when delete_data_on_destruct_ is false - confirm intended
+      if( cudaSetDevice(device) != cudaSuccess) {
+        cudaSetDevice(cur_device);
+        throw cuda_error("cuNDArray::set_device: unable to set device no (2)");
+      }
+
+      device_ = device;
+      allocate_memory();
+      if (cudaMemcpy(this->data_, tmp->get_data_ptr(), this->elements_*sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess) {
+        cudaSetDevice(cur_device);
+        throw cuda_error("cuNDArray::set_device: failed to copy data");
+      }
+      // Hand the caller back the device that was current on entry.
+      if( cudaSetDevice(cur_device) != cudaSuccess) {
+        throw cuda_error("cuNDArray::set_device: unable to restore device to current device");
+      }
+    }
+
+    inline int get_device() { return device_; }
+
+    thrust::device_ptr<T> get_device_ptr(){
+      return thrust::device_ptr<T>(this->data_);
+    }
+
+    thrust::device_ptr<T> begin(){
+      return thrust::device_ptr<T>(this->data_);
+    }
+
+    thrust::device_ptr<T> end(){
+      return thrust::device_ptr<T>(this->data_)+this->get_number_of_elements();
+    }
+
+    T at( size_t idx ){
+      if( idx >= this->get_number_of_elements() ){
+        throw std::runtime_error("cuNDArray::at(): index out of range.");
+      }
+      T res;
+      CUDA_CALL(cudaMemcpy(&res, &this->get_data_ptr()[idx], sizeof(T), cudaMemcpyDeviceToHost));
+      return res;
+    }
+
+    T operator[]( size_t idx ){
+      if( idx >= this->get_number_of_elements() ){
+        throw std::runtime_error("cuNDArray::operator[]: index out of range.");
+      }
+      T res;
+      CUDA_CALL(cudaMemcpy(&res, &this->get_data_ptr()[idx], sizeof(T), cudaMemcpyDeviceToHost));
+      return res;
+    }
+
+  protected:
+
+    int device_; 
+
+    // (Re)allocates the device buffer for dimensions_ on device_; the caller's device is re-selected afterwards.
+    virtual void allocate_memory()
+    {
+      deallocate_memory();
+      this->elements_ = 1;
+      if (this->dimensions_->empty())
+        throw std::runtime_error("cuNDArray::allocate_memory() : dimensions is empty.");
+      for (size_t i = 0; i < this->dimensions_->size(); i++) {
+        this->elements_ *= (*this->dimensions_)[i];
+      }
+      size_t size = this->elements_ * sizeof(T);
+
+      int device_no_old;
+      if (cudaGetDevice(&device_no_old) != cudaSuccess) {
+        throw cuda_error("cuNDArray::allocate_memory: unable to get device no");
+      }
+
+      if (device_ != device_no_old) {
+        if (cudaSetDevice(device_) != cudaSuccess) {
+          throw cuda_error("cuNDArray::allocate_memory: unable to set device no");
+        }
+      }
+
+      if (cudaMalloc((void**) &this->data_,size) != cudaSuccess) {
+        size_t free = 0, total = 0;
+        cudaMemGetInfo(&free, &total);
+        // Stream the whole message: text passed to the stringstream constructor gets overwritten by operator<<.
+        std::stringstream err;
+        err << "cuNDArray::allocate_memory() : Error allocating CUDA memory. ";
+        err << "CUDA Memory: " << free << " (" << total << ")";
+        err << "   memory requested: " << size << "( ";
+        for (size_t i = 0; i < this->dimensions_->size(); i++) {
+          err << (*this->dimensions_)[i] << " ";
+        }
+        err << ")";
+        this->data_ = 0;
+        if (device_ != device_no_old) cudaSetDevice(device_no_old); // best effort: do not leave the device switched
+        throw std::runtime_error(err.str());
+      }
+
+      if (device_ != device_no_old) {
+        if (cudaSetDevice(device_no_old) != cudaSuccess) {
+          throw cuda_error("cuNDArray::allocate_memory: unable to restore device no");
+        }
+      }
+    }
+
+    // Frees the device buffer, if any, with device_ selected and resets data_ to 0; no-op when data_ is already 0.
+    virtual void deallocate_memory()
+    {
+      if (this->data_) {
+        int device_no_old;
+        CUDA_CALL(cudaGetDevice(&device_no_old));
+        if (device_ != device_no_old) {
+          CUDA_CALL(cudaSetDevice(device_)); // free with the array's own device selected
+        }
+
+        CUDA_CALL(cudaFree(this->data_));
+        if (device_ != device_no_old) {
+          CUDA_CALL(cudaSetDevice(device_no_old)); // re-select the device that was current on entry
+        }
+        this->data_ = 0;
+      }
+    }
+  };
+}
+
+#endif //CUNDARRAY_H
diff --git a/toolboxes/core/gpu/cuNDArray_blas.cu b/toolboxes/core/gpu/cuNDArray_blas.cu
new file mode 100644
index 0000000..c595265
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_blas.cu
@@ -0,0 +1,311 @@
+#include "cuNDArray_blas.h"
+#include "complext.h"
+#include "GadgetronCuException.h"
+#include "cudaDeviceManager.h"
+
+#include <cublas_v2.h>
+
+namespace Gadgetron{
+
+#define CUBLAS_CALL(fun) {cublasStatus_t err = fun; if (err != CUBLAS_STATUS_SUCCESS) {throw cuda_error(gadgetron_getCublasErrorString(err));}}
+
+  //NRM2
+  //
+
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_axpy(cublasHandle_t hndl, int n, const T* a , const T* x , int incx,  T* y, int incy);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_dot(cublasHandle_t, int, const T*, int, const  T*, int, T*, bool cc = true);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_nrm2(cublasHandle_t, int, const T*, int, typename realType<T>::Type *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amax(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amin(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_asum(cublasHandle_t handle, int n,const T *x, int incx, typename realType<T>::Type *result);
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<float>(cublasHandle_t hndl, int n, const float*  x, int inc, float* res){
+    return cublasSnrm2(hndl,n,x,inc,res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<double>(cublasHandle_t hndl, int n, const double*  x, int inc, double* res){
+    return cublasDnrm2(hndl,n,x,inc,res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<float_complext>(cublasHandle_t hndl, int n, const float_complext*  x, int inc, float* res){
+    return cublasScnrm2(hndl,n,(const cuComplex*)x,inc,res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_nrm2<double_complext>(cublasHandle_t hndl, int n, const double_complext*  x, int inc, double* res){
+    return cublasDznrm2(hndl,n,(const cuDoubleComplex*) x,inc,res);
+  }
+
+  //DOT
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<float>(cublasHandle_t hndl, int n , const float* x , int incx, const  float* y , int incy, float* res, bool cc){
+    return cublasSdot( hndl, n, x, incx, y, incy, res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<double>(cublasHandle_t hndl, int n , const double* x , int incx, const  double* y , int incy, double* res, bool cc){
+    return cublasDdot( hndl, n, x, incx, y, incy, res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<float_complext>(cublasHandle_t hndl, int n , const float_complext* x ,
+										int incx, const  float_complext* y , int incy, float_complext* res, bool cc){
+    if(cc)
+      return cublasCdotc( hndl, n, (const cuComplex*) x, incx, (const cuComplex*) y, incy, (cuComplex*) res);
+    else
+      return cublasCdotu( hndl, n, (const cuComplex*) x, incx, (const cuComplex*) y, incy, (cuComplex*) res);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_dot<double_complext>(cublasHandle_t hndl, int n , const double_complext* x ,
+										 int incx, const  double_complext* y , int incy, double_complext* res, bool cc){
+    if(cc)
+      return cublasZdotc( hndl, n, (const cuDoubleComplex*) x, incx, (const cuDoubleComplex*) y, incy, (cuDoubleComplex*) res);
+    else
+      return cublasZdotu( hndl, n, (const cuDoubleComplex*) x, incx, (const cuDoubleComplex*) y, incy, (cuDoubleComplex*) res);
+  }
+
+  // AXPY
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<float>(cublasHandle_t hndl , int n , const float* a , const float* x , int incx ,  float* y , int incy){
+    return cublasSaxpy(hndl,n,a,x,incx,y,incy);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<double>(cublasHandle_t hndl , int n , const double* a , const double* x , int incx ,  double* y , int incy){
+    return cublasDaxpy(hndl,n,a,x,incx,y,incy);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<float_complext>(cublasHandle_t hndl , int n , const float_complext* a , const float_complext* x , int incx ,  float_complext* y , int incy){
+    return cublasCaxpy(hndl,n,(const cuComplex*) a, (const cuComplex*) x,incx, (cuComplex*)y,incy);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_axpy<double_complext>(cublasHandle_t hndl , int n , const double_complext* a , const double_complext* x , int incx ,  double_complext* y , int incy){
+    return cublasZaxpy(hndl,n,(const cuDoubleComplex*) a, (const cuDoubleComplex*) x,incx, (cuDoubleComplex*)y,incy);
+  }
+
+  //SUM
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<float>(cublasHandle_t hndl, int n,const float *x, int incx, float *result){
+    return cublasSasum(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<double>(cublasHandle_t hndl, int n,const double *x, int incx, double *result){
+    return cublasDasum(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<float_complext>(cublasHandle_t hndl, int n,const float_complext *x, int incx, float *result){
+    return cublasScasum(hndl,n,(const cuComplex*) x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_asum<double_complext>(cublasHandle_t hndl, int n,const double_complext *x, int incx, double *result){
+    return cublasDzasum(hndl,n,(const cuDoubleComplex*) x,incx,result);
+  }
+
+  //AMIN
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<float>(cublasHandle_t hndl, int n,const float *x, int incx, int *result){
+    return cublasIsamin(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<double>(cublasHandle_t hndl, int n,const double *x, int incx, int *result){
+    return cublasIdamin(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<float_complext>(cublasHandle_t hndl, int n,const float_complext *x, int incx, int *result){
+    return cublasIcamin(hndl,n, (const cuComplex* ) x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amin<double_complext>(cublasHandle_t hndl, int n,const double_complext *x, int incx, int *result){
+    return cublasIzamin(hndl,n, (const cuDoubleComplex* ) x,incx,result);
+  }
+
+  //AMAX
+  //
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<float>(cublasHandle_t hndl, int n,const float *x, int incx, int *result){
+    return cublasIsamax(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<double>(cublasHandle_t hndl, int n,const double *x, int incx, int *result){
+    return cublasIdamax(hndl,n,x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<float_complext>(cublasHandle_t hndl, int n,const float_complext *x, int incx, int *result){
+    return cublasIcamax(hndl,n, (const cuComplex* ) x,incx,result);
+  }
+
+  template<> EXPORTGPUCORE cublasStatus_t cublas_amax<double_complext>(cublasHandle_t hndl, int n,const double_complext *x, int incx, int *result){
+    return cublasIzamax(hndl,n, (const cuDoubleComplex* ) x,incx,result);
+  }
+
+  // Euclidean norm of arr via cuBLAS nrm2; throws std::runtime_error on a null array, cuda_error on a cuBLAS failure.
+  template<class T> typename realType<T>::Type nrm2( cuNDArray<T> *arr )
+  {
+    if( arr == 0x0 )
+      throw std::runtime_error("Gadgetron::nrm2(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    typedef typename realType<T>::Type REAL;
+    REAL ret;
+    cublasStatus_t status = cublas_nrm2<T>( cudaDeviceManager::Instance()->lockHandle(device),
+                                            (int)arr->get_number_of_elements(), arr->get_data_ptr(), 1, &ret);
+    // Release the handle before checking the status: throwing first would skip unlockHandle().
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    CUBLAS_CALL(status);
+    return ret;
+  }
+
+  // Dot product of arr1 and arr2 via cuBLAS; for complex T, cc selects the conjugated (dotc) or plain (dotu) variant.
+  template<class T> T dot( cuNDArray<T> *arr1, cuNDArray<T> *arr2, bool cc )
+  {
+    if( arr1 == 0x0 || arr2 == 0x0 )
+      throw std::runtime_error("Gadgetron::dot(): Invalid input array");
+
+    if( arr1->get_number_of_elements() != arr2->get_number_of_elements() )
+      throw std::runtime_error("Gadgetron::dot(): Array sizes mismatch");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    T ret;
+    cublasStatus_t status = cublas_dot( cudaDeviceManager::Instance()->lockHandle(device),
+                                        (int)arr1->get_number_of_elements(), arr1->get_data_ptr(), 1, arr2->get_data_ptr(), 1, &ret, cc );
+    // Release the handle before checking the status: throwing first would skip unlockHandle().
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    CUBLAS_CALL(status);
+    return ret;
+  }
+
+  // Computes y += a*x via cuBLAS axpy; x and y must be non-null and hold the same number of elements.
+  template<class T> void axpy( T a, cuNDArray<T> *x, cuNDArray<T> *y )
+  {
+    if( x == 0x0 || y == 0x0 )
+      throw std::runtime_error("Gadgetron::axpy(): Invalid input array");
+    if( x->get_number_of_elements() != y->get_number_of_elements() )
+      throw std::runtime_error("Gadgetron::axpy(): Array sizes mismatch");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    cublasStatus_t status = cublas_axpy(cudaDeviceManager::Instance()->lockHandle(device),
+                                        (int)x->get_number_of_elements(), &a, x->get_data_ptr(), 1, y->get_data_ptr(), 1);
+    // Release the handle before checking the status: throwing first would skip unlockHandle().
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    CUBLAS_CALL(status);
+  }
+
+  template<class T> void axpy( T a,  cuNDArray< complext<T> > *x, cuNDArray< complext<T> > *y )
+  {
+    axpy( complext<T>(a), x, y );
+  }
+
+  // Result of cuBLAS asum (sum of absolute values) over all elements of x; throws on a null array or cuBLAS failure.
+  template<class T> typename realType<T>::Type asum(cuNDArray<T>* x)
+  {
+    if( x == 0x0 )
+      throw std::runtime_error("Gadgetron::asum(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    typename realType<T>::Type result;
+    cublasStatus_t status = cublas_asum(cudaDeviceManager::Instance()->lockHandle(device),
+                                        (int)x->get_number_of_elements(), x->get_data_ptr(), 1, &result);
+    // Release the handle before checking the status: throwing first would skip unlockHandle().
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    CUBLAS_CALL(status);
+    return result;
+  }
+  
+  // 0-based index of the element of x with the smallest magnitude (the cuBLAS result is 1-based and converted here).
+  template<class T> size_t amin( cuNDArray<T>* x )
+  {
+    if( x == 0x0 )
+      throw std::runtime_error("Gadgetron::amin(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    int result = 0;
+    cublasStatus_t status = cublas_amin(cudaDeviceManager::Instance()->lockHandle(device),
+                                        (int)x->get_number_of_elements(), x->get_data_ptr(), 1, &result);
+    // Release the handle before checking the status: throwing first would skip unlockHandle().
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    CUBLAS_CALL(status);
+    // Valid results are 1..n; a result of 0 would wrap around in the unsigned conversion below.
+    if( result < 1 || (size_t)result > x->get_number_of_elements() ){
+      throw std::runtime_error("Gadgetron::amin(): computed index is out of bounds");
+    }
+    return (size_t)result-1;
+  }
+  
+  // 0-based index of the element of x with the largest magnitude (the cuBLAS result is 1-based and converted here).
+  template<class T> size_t amax(cuNDArray<T> *x )
+  {
+    if( x == 0x0 )
+      throw std::runtime_error("Gadgetron::amax(): Invalid input array");
+
+    int device = cudaDeviceManager::Instance()->getCurrentDevice();
+    int result = 0;
+    cublasStatus_t status = cublas_amax(cudaDeviceManager::Instance()->lockHandle(device),
+                                        (int)x->get_number_of_elements(), x->get_data_ptr(), 1, &result);
+    // Release the handle before checking the status: throwing first would skip unlockHandle().
+    cudaDeviceManager::Instance()->unlockHandle(device);
+    CUBLAS_CALL(status);
+    // Valid results are 1..n; a result of 0 would wrap around in the unsigned conversion below.
+    if( result < 1 || (size_t)result > x->get_number_of_elements() ){
+      throw std::runtime_error("Gadgetron::amax(): computed index is out of bounds");
+    }
+    return (size_t)result-1;
+  }
+  
+  std::string gadgetron_getCublasErrorString(cublasStatus_t err)
+  { // Maps a cuBLAS status code to a short description; codes not listed below yield "UNKNOWN CUBLAS ERROR".
+    switch (err){
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "NOT INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "ALLOC FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "INVALID VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "ARCH MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "MAPPING ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "EXECUTION FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "INTERNAL ERROR";
+    case CUBLAS_STATUS_SUCCESS:
+      return "SUCCESS";
+    default:
+      return "UNKNOWN CUBLAS ERROR";
+    }
+  }
+  
+  //
+  // Instantiation
+  //
+  
+  template EXPORTGPUCORE float dot(cuNDArray<float>*,cuNDArray<float>*,bool);
+  template EXPORTGPUCORE float nrm2(cuNDArray<float>*);
+  template EXPORTGPUCORE void axpy(float,cuNDArray<float>*,cuNDArray<float>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<float>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<float>*);
+  template EXPORTGPUCORE float asum(cuNDArray<float>*);
+
+  template EXPORTGPUCORE double dot(cuNDArray<double>*,cuNDArray<double>*,bool);
+  template EXPORTGPUCORE double nrm2(cuNDArray<double>*);
+  template EXPORTGPUCORE void axpy(double,cuNDArray<double>*,cuNDArray<double>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<double>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<double>*);
+  template EXPORTGPUCORE double asum(cuNDArray<double>*);
+
+  template EXPORTGPUCORE float_complext dot(cuNDArray<float_complext>*,cuNDArray<float_complext>*,bool);
+  template EXPORTGPUCORE float nrm2(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float_complext,cuNDArray<float_complext>*,cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float,cuNDArray<float_complext>*,cuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE float asum(cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE double_complext dot(cuNDArray<double_complext>*,cuNDArray<double_complext>*,bool);
+  template EXPORTGPUCORE double nrm2(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double_complext,cuNDArray<double_complext>*,cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double,cuNDArray<double_complext>*,cuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amin(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amax(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE double asum(cuNDArray<double_complext>*);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_blas.h b/toolboxes/core/gpu/cuNDArray_blas.h
new file mode 100644
index 0000000..ed2cd6f
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_blas.h
@@ -0,0 +1,49 @@
+/** \file cuNDArray_blas.h
+    \brief BLAS level-1 functions on the cuNDArray class.
+    
+    cuNDArray_blas.h provides BLAS level-1 functions on the cuNDArray class.
+    The cuNDArray is temporarily reshaped to a column vector for the respective operations.
+    The implementation is based on CUBLAS.
+    This code is purposely split into a header and underlying implementation (.cu)
+    as this allows specific instantiation of the supported template types.     
+    The supported types are float, double, std::complex<float>, std::complex<double>, 
+    Gadgetron::complext<float>, and Gadgetron::complext<double>.
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "complext.h"
+#include "gpucore_export.h"
+
+#include <cublas_v2.h>
+
+namespace Gadgetron{
+
+  template<class T> EXPORTGPUCORE T dot( cuNDArray<T> *x, cuNDArray<T> *y, bool cc = true );
+
+  template<class T> EXPORTGPUCORE typename realType<T>::Type nrm2( cuNDArray<T> *x );
+
+  template<class T> EXPORTGPUCORE void axpy( T a, cuNDArray<T>* x, cuNDArray<T>* y );
+
+  template<class T> EXPORTGPUCORE void axpy( T a, cuNDArray<complext<T> > *x, cuNDArray<complext<T> > *y );
+  
+  /**
+   * @brief Gets the index of the element with the minimum absolute value
+   * @param x Input data
+   * @return index of absolute minimum values
+   */
+  template<class T> EXPORTGPUCORE size_t amin( cuNDArray<T> *x );
+  
+  /**
+   * @brief Gets the index of the element with the maximum absolute value
+   * @param x Input data
+   * @return index of absolute maximum values
+   * @details Note that this returns the C-style index and NOT the Fortran index.
+   */
+  template<class T> EXPORTGPUCORE size_t amax( cuNDArray<T> *x);
+  
+  template<class T> EXPORTGPUCORE typename realType<T>::Type asum( cuNDArray<T> *x );
+  
+  EXPORTGPUCORE std::string gadgetron_getCublasErrorString(cublasStatus_t err);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_elemwise.cu b/toolboxes/core/gpu/cuNDArray_elemwise.cu
new file mode 100644
index 0000000..9456f3e
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_elemwise.cu
@@ -0,0 +1,656 @@
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_blas.h"
+#include "complext.h"
+
+#include <complex>
+#include <thrust/functional.h>
+
+using namespace Gadgetron;
+using namespace std;
+
+template<typename T> struct cuNDA_abs : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename Gadgetron::realType<T>::Type operator()(const T &x) const {return abs(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::abs( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::abs(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_abs<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::abs_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::abs_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_abs<T>());
+}  
+  
+template<typename T> struct cuNDA_abs_square : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename Gadgetron::realType<T>::Type operator()(const T &x) const 
+  { 
+    typename realType<T>::Type tmp = abs(x);
+    return tmp*tmp;
+  }
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::abs_square( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::abs_square(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_abs_square<T>());
+  return result;
+}
+
+template<typename T> struct cuNDA_sqrt : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return sqrt(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > 
+Gadgetron::sqrt( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sqrt(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_sqrt<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::sqrt_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sqrt_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_sqrt<T>());
+}
+ 
+template<typename T> struct cuNDA_square : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return x*x;}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::square( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::square(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_square<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::square_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::square_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_square<T>());
+}  
+
+template<typename T> struct cuNDA_reciprocal : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return T(1)/x;}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::reciprocal( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_reciprocal<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::reciprocal_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_reciprocal<T>());
+}  
+ 
+template<typename T> struct cuNDA_reciprocal_sqrt : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return T(1)/sqrt(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::reciprocal_sqrt( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal_sqrt(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_reciprocal_sqrt<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::reciprocal_sqrt_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::reciprocal_sqrt_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_reciprocal_sqrt<T>());
+}  
+
+template<typename T> struct cuNDA_sgn : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return sgn(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > Gadgetron::sgn( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sgn(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_sgn<T>());
+  return result;
+}
+
+template<class T> void 
+Gadgetron::sgn_inplace( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::sgn_inplace(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_sgn<T>());
+}  
+ 
+template<typename T> struct cuNDA_real : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename realType<T>::Type operator()(const T &x) const {return real(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::real( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::real(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_real<T>());
+  return result;
+}
+
+template <typename T> struct cuNDA_imag : public thrust::unary_function<T,typename realType<T>::Type>
+{
+  __device__ typename realType<T>::Type operator()(const T &x) const {return imag(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<typename realType<T>::Type> > 
+Gadgetron::imag( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::imag(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<typename realType<T>::Type> > result(new cuNDArray<typename realType<T>::Type>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<typename realType<T>::Type> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_imag<T>());
+  return result;
+}
+
+template <typename T> struct cuNDA_conj : public thrust::unary_function<T,T>
+{
+  __device__ T operator()(const T &x) const {return conj(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > 
+Gadgetron::conj( cuNDArray<T> *x )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::conj(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_conj<T>());
+  return result;
+}
+
+template <typename T> struct cuNDA_real_to_complex : public thrust::unary_function<typename realType<T>::Type,T>
+{
+  __device__ T operator()(const typename realType<T>::Type &x) const {return T(x);}
+};
+
+template<class T> boost::shared_ptr< cuNDArray<T> > 
+Gadgetron::real_to_complex( cuNDArray<typename realType<T>::Type> *x )
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::real_to_complex(): Invalid input array");
+   
+  boost::shared_ptr< cuNDArray<T> > result(new cuNDArray<T>());
+  result->create(x->get_dimensions());
+  thrust::device_ptr<T> resPtr = result->get_device_ptr();
+  thrust::device_ptr<typename realType<T>::Type> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),resPtr,cuNDA_real_to_complex<T>());
+  return result;
+}
+
+template<class T> void Gadgetron::clear( cuNDArray<T> *x ) // Zero-fills the bytes of x's data buffer; throws std::runtime_error if x is null.
+{
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clear(): Invalid input array");
+  // Empty arrays are skipped; otherwise the memset result is checked with CUDA_CALL (as normalize() does for its cudaMemcpy) instead of being ignored.
+  if ( x->get_number_of_elements() > 0 )
+  {
+    CUDA_CALL(cudaMemset(x->get_data_ptr(),0,sizeof(T)*x->get_number_of_elements()));
+  }
+}
+
+template<class T> void 
+Gadgetron::fill( cuNDArray<T> *x, T val )
+{ // Assigns val to every element of x; throws std::runtime_error if x is null.
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::fill(): Invalid input array"); // message now names this function (was "fill_inplace")
+  // thrust::fill covers the range [devPtr, devPtr + number_of_elements).
+  thrust::device_ptr<T> devPtr = x->get_device_ptr();
+  thrust::fill(devPtr,devPtr+x->get_number_of_elements(),val);
+}
+
+template<typename T> struct cuNDA_clamp : public thrust::unary_function<T,T>
+{ // Unary functor that replaces values outside [min, max) with caller-chosen substitutes.
+  cuNDA_clamp( T _min, T _max, T _min_val, T _max_val ) : min(_min), max(_max),min_val(_min_val), max_val(_max_val) {}
+  __device__ T operator()(const T &x) const 
+  {
+    if( x < min ) return min_val; // below the lower bound
+    else if ( x >= max) return max_val; // at or above the upper bound (>=, so x == max is replaced too)
+    else return x; // inside the interval: passed through unchanged
+  }
+  T min, max; // interval bounds
+  T min_val, max_val; // values returned for out-of-range inputs
+};
+
+template<typename T> struct cuNDA_clamp< complext<T> > : public thrust::unary_function< complext<T>, complext<T> >
+{ // Specialization for complext<T>: the comparison against the real-valued bounds uses real(x) only.
+	cuNDA_clamp( T _min, T _max, complext<T> _min_val, complext<T> _max_val ) : min(_min), max(_max),min_val(_min_val), max_val(_max_val) {}
+  __device__ complext<T> operator()(const complext<T> &x) const 
+  {
+    if( real(x) < min ) return min_val; // real part below the lower bound
+    else if ( real(x) >= max) return max_val; // real part at or above the upper bound
+    else return complext<T>(real(x)); // in range: result is rebuilt from real(x) alone, so imag(x) is not carried over
+  }
+  T min, max; // real-valued interval bounds
+  complext<T> min_val, max_val; // complex values returned for out-of-range inputs
+};
+
+template<class T> void 
+Gadgetron::clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val)
+{ // In-place clamp: applies cuNDA_clamp<T>(min, max, min_val, max_val) to every element of x; throws std::runtime_error if x is null.
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clamp(): Invalid input array");
+  // Source and destination iterators are the same pointer, so the transform overwrites x.
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_clamp<T>(min, max,min_val, max_val));
+}  
+
+template<class T> void 
+Gadgetron::clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max)
+{ // Convenience overload: out-of-range values are replaced by the bounds themselves, converted to T.
+    clamp(x,min,max,T(min),T(max));
+}
+
+template<typename T> struct cuNDA_clamp_min : public thrust::unary_function<T,T>
+{
+  cuNDA_clamp_min( T _min ) : min(_min) {}
+  __device__ T operator()(const T &x) const 
+  {
+    if( x < min ) return min;
+    else return x;
+  }
+  T min;
+};
+
+template<typename T> struct cuNDA_clamp_min< complext<T> > : public thrust::unary_function< complext<T>, complext<T> >
+{
+  cuNDA_clamp_min( T _min ) : min(_min) {}
+  __device__ complext<T> operator()(const complext<T> &x) const 
+  {
+    if( real(x) < min ) return complext<T>(min);
+    else return complext<T>(real(x));
+  }
+  T min;
+};
+
+template<class T> void 
+Gadgetron::clamp_min( cuNDArray<T> *x, typename realType<T>::Type min )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clamp_min(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_clamp_min<T>(min));
+}  
+
+template<typename T> struct cuNDA_clamp_max : public thrust::unary_function<T,T>
+{
+  cuNDA_clamp_max( T _max ) : max(_max) {}
+  __device__ T operator()(const T &x) const 
+  {
+    if( x > max ) return max;
+    else return x;
+  }
+  T max;
+};
+
+template<typename T> struct cuNDA_clamp_max< complext<T> > : public thrust::unary_function< complext<T>, complext<T> >
+{
+  cuNDA_clamp_max( T _max ) : max(_max) {}
+  __device__ complext<T> operator()(const complext<T> &x) const 
+  {
+    if( real(x) > max ) return complext<T>(max);
+    else return complext<T>(real(x));
+  }
+  T max;
+};
+
+template<class T> void 
+Gadgetron::clamp_max( cuNDArray<T> *x, typename realType<T>::Type max )
+{ 
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::clamp_max(): Invalid input array");
+   
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),xPtr,cuNDA_clamp_max<T>(max));
+}  
+
+template<class T> void 
+Gadgetron::normalize( cuNDArray<T> *x, typename realType<T>::Type val )
+{ // Rescales x in place by val/abs(m), where m is the element at the index returned by amax(x); throws std::runtime_error if x is null.
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::normalize(): Invalid input array");
+  // amax gives the index of the element with the largest absolute value; that single element is copied back to the host.
+  size_t max_idx = amax(x);
+  T max_val_before;
+  CUDA_CALL(cudaMemcpy(&max_val_before, &x->get_data_ptr()[max_idx], sizeof(T), cudaMemcpyDeviceToHost));
+  typename realType<T>::Type scale = val/abs(max_val_before); // NOTE(review): divides by zero when the largest magnitude is 0 -- confirm callers never pass an all-zero array
+  *x *= scale;
+}
+
+
+template<typename T> struct cuNDA_shrink1 : public thrust::unary_function<T,T>
+{ // Unary functor computing sgn(x) * max(|x| - gamma, 0).
+  cuNDA_shrink1( typename realType<T>::Type _gamma ) : gamma(_gamma) {}
+  __device__ T operator()(const T &x) const {
+    typename realType<T>::Type absX = abs(x);
+    T sgnX = (absX <= typename realType<T>::Type(0)) ? T(0) : x/absX; // x/|x|, with 0 substituted when |x| is 0 to avoid dividing by zero
+    return sgnX*max(absX-gamma, typename realType<T>::Type(0)); // magnitude is reduced by gamma and floored at zero
+  }
+  typename realType<T>::Type gamma; // amount subtracted from the magnitude
+};
+
+template<class T> void 
+Gadgetron::shrink1( cuNDArray<T> *x, typename realType<T>::Type gamma, cuNDArray<T> *out )
+{ // Applies cuNDA_shrink1<T>(gamma) to every element of x; throws std::runtime_error if x is null.
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::shrink1(): Invalid input array");
+  // A null 'out' selects in-place operation: the destination pointer is then x's own device pointer.
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),outPtr,cuNDA_shrink1<T>(gamma));
+}
+
+template<typename T> struct cuNDA_pshrink : public thrust::unary_function<T,T>
+{ // Unary functor computing sgn(x) * max(|x| - gamma*|x|^(p-1), 0).
+  cuNDA_pshrink( typename realType<T>::Type _gamma, typename realType<T>::Type _p ) : gamma(_gamma),p(_p) {}
+  __device__ T operator()(const T &x) const {
+    typename realType<T>::Type absX = abs(x);
+    T sgnX = (absX <= typename realType<T>::Type(0)) ? T(0) : x/absX; // x/|x|, with 0 substituted when |x| is 0 to avoid dividing by zero
+    return sgnX*max(absX-gamma*pow(absX,p-1), typename realType<T>::Type(0)); // the amount subtracted depends on |x| through |x|^(p-1)
+  }
+  typename realType<T>::Type gamma; // scale of the subtracted term
+  typename realType<T>::Type p; // exponent; the functor uses p-1
+};
+
+template<class T> void
+Gadgetron::pshrink( cuNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out )
+{ // Applies cuNDA_pshrink<T>(gamma,p) to every element of x; throws std::runtime_error if x is null.
+  if( x == 0x0 )
+    throw std::runtime_error("Gadgetron::pshrink(): Invalid input array"); // message now names this function (was "shrink1")
+  // A null 'out' selects in-place operation: the destination pointer is then x's own device pointer.
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),outPtr,cuNDA_pshrink<T>(gamma,p));
+}
+
+template<typename T> struct cuNDA_shrinkd : public thrust::binary_function<T,typename realType<T>::Type,T>
+{ // Binary functor computing (x/s) * max(s - gamma, 0), where s is supplied per element by the caller.
+  cuNDA_shrinkd( typename realType<T>::Type _gamma ) : gamma(_gamma) {}
+  __device__ T operator()(const T &x, const typename realType<T>::Type &s) const {
+  	T xs = (s <= typename realType<T>::Type(0)) ? T(0) : x/s; // guard: s <= 0 yields 0 instead of dividing by it
+    return xs*max(s-gamma,typename realType<T>::Type(0)); // s is reduced by gamma and floored at zero
+  }
+  typename realType<T>::Type gamma; // amount subtracted from s
+};
+
+template<class T> void 
+Gadgetron::shrinkd( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, cuNDArray<T> *out )
+{ // Applies cuNDA_shrinkd<T>(gamma) to the element pairs (x[i], s[i]); throws std::runtime_error if x or s is null.
+  if( x == 0x0 || s == 0x0 ) // s is dereferenced below, so a null s is rejected just like a null x
+    throw std::runtime_error("Gadgetron::shrinkd(): Invalid input array");
+  // A null 'out' selects in-place operation. NOTE(review): the sizes of s and out are not checked against x here.
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::device_ptr<typename realType<T>::Type> sPtr = s->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),sPtr,outPtr,cuNDA_shrinkd<T>(gamma));
+}
+
+
+template<typename T> struct cuNDA_pshrinkd : public thrust::binary_function<T,typename realType<T>::Type,T>
+{ // Binary functor computing (x/s) * max(s - gamma*s^(p-1), 0), where s is supplied per element by the caller.
+  cuNDA_pshrinkd( typename realType<T>::Type _gamma,typename realType<T>::Type _p ) : gamma(_gamma), p(_p) {}
+  __device__ T operator()(const T &x, const typename realType<T>::Type &s) const {
+    if( s <= typename realType<T>::Type(0) ) return T(0); // same guard as cuNDA_shrinkd: avoids x/s producing inf/NaN
+    return x/s*max(s-gamma*pow(s,p-1),typename realType<T>::Type(0)); // shrink factor is floored at zero
+  }
+  typename realType<T>::Type gamma, p; // scale of the subtracted term, and exponent (used as p-1)
+};
+
+template<class T> void
+Gadgetron::pshrinkd( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out )
+{ // Applies cuNDA_pshrinkd<T>(gamma,p) to the element pairs (x[i], s[i]); throws std::runtime_error if x or s is null.
+  if( x == 0x0 || s == 0x0 ) // s is dereferenced below, so a null s is rejected just like a null x
+    throw std::runtime_error("Gadgetron::pshrinkd(): Invalid input array"); // message now names this function (was "shrinkd")
+  // A null 'out' selects in-place operation. NOTE(review): the sizes of s and out are not checked against x here.
+  thrust::device_ptr<T> xPtr = x->get_device_ptr();
+  thrust::device_ptr<T> outPtr = (out == 0x0) ? x->get_device_ptr() : out->get_device_ptr();
+  thrust::device_ptr<typename realType<T>::Type> sPtr = s->get_device_ptr();
+  thrust::transform(xPtr,xPtr+x->get_number_of_elements(),sPtr,outPtr,cuNDA_pshrinkd<T>(gamma,p));
+}
+
+//
+// Instantiation
+//
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::abs_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs_square<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::sqrt<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::square<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::square_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::reciprocal<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::reciprocal_sqrt<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::sgn<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::clear<float>( cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::fill<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::clamp<float>( cuNDArray<float>*, float, float );
+template EXPORTGPUCORE void Gadgetron::clamp_min<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::clamp_max<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::normalize<float>( cuNDArray<float>*, float );
+template EXPORTGPUCORE void Gadgetron::shrink1<float>( cuNDArray<float>*, float, cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::pshrink<float>( cuNDArray<float>*, float,float, cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::shrinkd<float> ( cuNDArray<float>*, cuNDArray<float>*, float, cuNDArray<float>* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd<float> ( cuNDArray<float>*, cuNDArray<float>*, float,float, cuNDArray<float>* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::abs_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs_square<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::sqrt<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::square<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::square_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::reciprocal<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::reciprocal_sqrt<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::sgn<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::clear<double>( cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::fill<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::clamp<double>( cuNDArray<double>*, double, double );
+template EXPORTGPUCORE void Gadgetron::clamp_min<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::clamp_max<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::normalize<double>( cuNDArray<double>*, double );
+template EXPORTGPUCORE void Gadgetron::shrink1<double>( cuNDArray<double>*, double, cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::pshrink<double>( cuNDArray<double>*, double,double, cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::shrinkd<double> ( cuNDArray<double>*, cuNDArray<double>*, double, cuNDArray<double>* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd<double> ( cuNDArray<double>*, cuNDArray<double>*, double,double, cuNDArray<double>* );
+
+/*template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::sqrt< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs_square< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::square< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::reciprocal< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<float> > > Gadgetron::reciprocal_sqrt< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::clear< std::complex<float> >( cuNDArray< std::complex<float> >* );
+template EXPORTGPUCORE void Gadgetron::fill< std::complex<float> >( cuNDArray< std::complex<float> >*, std::complex<float> );
+template EXPORTGPUCORE void Gadgetron::normalize< std::complex<float> >( cuNDArray< std::complex<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::shrink1< std::complex<float> >( cuNDArray< std::complex<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::shrinkd< std::complex<float> > ( cuNDArray< std::complex<float> >*, cuNDArray<float>*, float );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::sqrt< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs_square< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::square< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::reciprocal< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< std::complex<double> > > Gadgetron::reciprocal_sqrt< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::clear< std::complex<double> >( cuNDArray< std::complex<double> >* );
+template EXPORTGPUCORE void Gadgetron::fill< std::complex<double> >( cuNDArray< std::complex<double> >*, std::complex<double> );
+template EXPORTGPUCORE void Gadgetron::normalize< std::complex<double> >( cuNDArray< std::complex<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::shrink1< std::complex<double> >( cuNDArray< std::complex<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::shrinkd< std::complex<double> > ( cuNDArray< std::complex<double> >*, cuNDArray<double>*, double );
+*/
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::sqrt< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::abs_square< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::square< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::reciprocal< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<float> > > Gadgetron::reciprocal_sqrt< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<complext<float> > > Gadgetron::sgn<complext<float> >( cuNDArray<complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<complext<float> >( cuNDArray<complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::clear< complext<float> >( cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::fill< complext<float> >( cuNDArray< complext<float> >*, complext<float> );
+template EXPORTGPUCORE void Gadgetron::clamp< complext<float> >( cuNDArray< complext<float> >*, float, float );
+template EXPORTGPUCORE void Gadgetron::clamp_min< complext<float> >( cuNDArray< complext<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::clamp_max< complext< float> >( cuNDArray<complext<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::normalize< complext<float> >( cuNDArray< complext<float> >*, float );
+template EXPORTGPUCORE void Gadgetron::shrink1< complext<float> >( cuNDArray< complext<float> >*, float, cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::pshrink< complext<float> >( cuNDArray< complext<float> >*, float,float, cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::shrinkd< complext<float> > ( cuNDArray< complext<float> >*, cuNDArray<float>*, float, cuNDArray< complext<float> >* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd< complext<float> > ( cuNDArray< complext<float> >*, cuNDArray<float>*, float,float, cuNDArray< complext<float> >* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::sqrt< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::abs_square< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::sqrt_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::square< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::square_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::reciprocal< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray< complext<double> > > Gadgetron::reciprocal_sqrt< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::reciprocal_sqrt_inplace< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<complext<double> > > Gadgetron::sgn<complext<double> >( cuNDArray<complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::sgn_inplace<complext<double> >( cuNDArray<complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::clear< complext<double> >( cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::fill< complext<double> >( cuNDArray< complext<double> >*, complext<double> );
+template EXPORTGPUCORE void Gadgetron::clamp< complext<double> >( cuNDArray< complext<double> >*, double, double );
+template EXPORTGPUCORE void Gadgetron::clamp_min< complext<double> >( cuNDArray< complext<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::clamp_max< complext<double> >( cuNDArray<complext<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::normalize< complext<double> >( cuNDArray< complext<double> >*, double );
+template EXPORTGPUCORE void Gadgetron::shrink1< complext<double> >( cuNDArray< complext<double> >*, double, cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::pshrink< complext<double> >( cuNDArray< complext<double> >*, double, double, cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::shrinkd< complext<double> > ( cuNDArray< complext<double> >*, cuNDArray<double>*, double, cuNDArray< complext<double> >* );
+template EXPORTGPUCORE void Gadgetron::pshrinkd< complext<double> > ( cuNDArray< complext<double> >*, cuNDArray<double>*, double,double, cuNDArray< complext<double> >* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::real<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::imag<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::conj<float>( cuNDArray<float>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::real<float_complext>( cuNDArray<float_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > Gadgetron::imag<float_complext>( cuNDArray<float_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > Gadgetron::conj<float_complext>( cuNDArray<float_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > Gadgetron::real_to_complex<float_complext>( cuNDArray<float>* );
+
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::real<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::imag<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::conj<double>( cuNDArray<double>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::real<double_complext>( cuNDArray<double_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > Gadgetron::imag<double_complext>( cuNDArray<double_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > Gadgetron::conj<double_complext>( cuNDArray<double_complext>* );
+template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > Gadgetron::real_to_complex<double_complext>( cuNDArray<double>* );
diff --git a/toolboxes/core/gpu/cuNDArray_elemwise.h b/toolboxes/core/gpu/cuNDArray_elemwise.h
new file mode 100644
index 0000000..772b408
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_elemwise.h
@@ -0,0 +1,241 @@
+/** \file cuNDArray_elemwise.h
+    \brief Element-wise math operations on the cuNDArray class.
+    
+    cuNDArray_elemwise.h defines element-wise array operations on the cuNDArray class.
+    Many of the provided functions come in two flavours:
+    1) A function that returns a smart pointer to a new array holding the result of the element-wise operation, and
+    2) A function that performs in-place element-wise computation replacing the input array.
+    When both versions are available the in-place version is suffixed _inplace.
+    Some functions (clear, fill, clamp, clamp_min, clamp_max, normalize, shrink1, shrinkd) are only provided as in-place operations,
+    and they do not carry the _inplace suffix in order to keep user code compact.
+    A few functions return a different type than their input array
+    (abs on complex data, real, imag, real_to_complex) and consequently are not offered as in-place operations.
+    The functions provided in cuNDArray_elemwise are deliberately placed outside the NDArray derived classes
+    - to allow the NDArray classes to be lightweight header only data containers for both the cpu and gpu instances
+    - to allow for external library optimized implementations of the element-wise functions without adding such dependencies to the core data container
+    The present gpu implementation is based on Thrust.
+    The implementation is purposely split into a header and underlying implementation (.cu)
+    as this allows specific instantiation of the supported template types.     
+    The supported types are float, double, Gadgetron::complext<float> and Gadgetron::complext<double> -- with some deliberate omissions.
+    Arrays of type std::complex<float> and std::complex<double> are currently not supported since the thrust device functors cannot 
+    link to std:: functions (as they are not declared as __device__). 
+    However, arrays of type std::complex are binary compatible with arrays of type Gadgetron::complext (for which we have support)
+    and can safely be cast to such.
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+  /**
+   * @brief Calculates the element-wise absolute values (l2 norm) of the array entries
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise absolute values of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > abs( cuNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise absolute values (l2 norm) of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void abs_inplace( cuNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise squared absolute values of the array entries
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise squared absolute values of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > abs_square( cuNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise sqrt of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise sqrt of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > sqrt( cuNDArray<T> *x );
+
+  /**
+   * @brief Calculates the element-wise sqrt of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void sqrt_inplace( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise square of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise square of the input.
+   *
+   * For real numbers this function is equivalent to abs_square().
+   * For complex arrays abs_square() and square() differ however.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > square( cuNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise square of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void square_inplace( cuNDArray<T> *x );
+    
+  /**
+   * @brief Calculates the element-wise reciprocal of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise reciprocal of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > reciprocal( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void reciprocal_inplace( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal sqrt of the array entries.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise reciprocal sqrt of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > reciprocal_sqrt( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the element-wise reciprocal sqrt of the array entries (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void reciprocal_sqrt_inplace( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the elementwise signum function on the array.
+   * @param[in] x Input array.
+   * @return A new array containing the element-wise sgn of the input.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > sgn( cuNDArray<T> *x );
+  
+  /**
+   * @brief Calculates the elementwise signum function on the array (in place).
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void sgn_inplace( cuNDArray<T> *x );
+
+  /**
+   * @brief Extract the real component from a complex array.
+   * @param[in] x Input array.
+   * @return A new array of the real component of the complex array.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > real( cuNDArray<T> *x );
+
+  /**
+   * @brief Extract the imaginary component from a complex array.
+   * @param[in] x Input array.
+   * @return A new array of the imaginary component of the complex array.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<typename realType<T>::Type> > imag( cuNDArray<T> *x );
+
+  /**
+   * @brief Create a new array of the complex conjugate of the input array. For real arrays a copy of the input array is returned.
+   * @param[in] x Input array.
+   * @return A new array of the complex conjugate of the input array.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > conj( cuNDArray<T> *x );
+
+  /**
+   * @brief Construct a complex array from a real array.
+   * @param[in] x Input array.
+   * @return A new complex array containing the input array in the real component and zeros in the imaginary component.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> > real_to_complex( cuNDArray<typename realType<T>::Type> *x );
+  
+  //
+  // From hereon the functions are all in-place although without the _inplace suffix...
+  //
+
+  /**
+   * @brief Clears the array to all zeros (in place). Faster than fill.
+   * @param[in,out] x Input and output array.
+   */
+  template<class T> EXPORTGPUCORE void clear( cuNDArray<T> *x );
+
+  /**
+   * @brief Fills the array with a user provided constant value (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] val Fill value.
+   */
+  template<class T> EXPORTGPUCORE void fill( cuNDArray<T> *x, T val );
+
+  /**
+   * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min minimum value.
+   * @param[in] max maximum value.
+   * @param[in] min_val value to which everything below the minimum will be set
+   * @param[in] max_val value to which everything above the maximum will be set
+   */
+  template<class T> EXPORTGPUCORE void clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max, T min_val, T max_val );
+
+  /**
+   * @brief Clamps all values in the array to the minimum and maximum values specified (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min minimum value.
+   * @param[in] max maximum value.
+   */
+  template<class T> EXPORTGPUCORE void clamp( cuNDArray<T> *x, typename realType<T>::Type min, typename realType<T>::Type max);
+
+  /**
+   * @brief Clamps all values in the array to a minimum value allowed (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] min Minimum value.
+   */
+  template<class T> EXPORTGPUCORE void clamp_min( cuNDArray<T> *x, typename realType<T>::Type min );
+
+  /**
+   * @brief Clamps all values in the array to a maximum value allowed (in place).
+   * @param[in,out] x Input and output array.
+   * @param[in] max Maximum value.
+   */
+  template<class T> EXPORTGPUCORE void clamp_max( cuNDArray<T> *x, typename realType<T>::Type max );
+
+  /**
+   * @brief In place normalization (scaling) to a new maximum absolute array value val.
+   * @param[in,out] x Input and output array.
+   * @param[in] val New maximum absolute array value (according to the l2-norm)
+   */  
+  template<class T> EXPORTGPUCORE void normalize( cuNDArray<T> *x, typename realType<T>::Type val = typename realType<T>::Type(1) );
+
+  /**
+   * @brief In place shrinkage (soft thresholding), i.e. shrink(x,gamma) = x/abs(x)*max(abs(x)-gamma,0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   */  
+  template<class T> EXPORTGPUCORE void shrink1( cuNDArray<T> *x, typename realType<T>::Type gamma, cuNDArray<T> *out = 0x0 );
+
+
+
+  /**
+   * @brief In place p-shrinkage (soft thresholding), i.e. pshrink(x,gamma,p) = x/abs(x)*max(abs(x)-gamma*abs(x)^(p-1),0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] gamma Shrinkage control parameter
+   * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+   */
+  template<class T> EXPORTGPUCORE void pshrink( cuNDArray<T> *x, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out = 0x0 );
+
+  /**
+   * @brief In place shrinkage (soft thresholding, multi-dimensional), i.e. shrink(x,gamma,s) = x/s*max(s-gamma,0).
+   * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+   * @param[in,out] x Input array (and output array if out == 0x0).
+   * @param[in] s Input array, normalization.
+   * @param[in] gamma Shrinkage control parameter
+   */  
+  template<class T> EXPORTGPUCORE void shrinkd ( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma, cuNDArray<T> *out = 0x0 );
+
+  /**
+     * @brief In place p-shrinkage (soft thresholding, multi-dimensional), i.e. pshrink(x,s,gamma,p) = x/s*max(s-gamma*s^(p-1),0).
+     * @param[out] out Output array. Can be 0x0 in which case an in place transform is performed.
+     * @param[in,out] x Input array (and output array if out == 0x0).
+     * @param[in] gamma Shrinkage control parameter
+     * @param[in] p p value of the shrinkage. Should be less than 1 and more than 0.
+     */
+    template<class T> EXPORTGPUCORE void pshrinkd ( cuNDArray<T> *x, cuNDArray<typename realType<T>::Type> *s, typename realType<T>::Type gamma,typename realType<T>::Type p, cuNDArray<T> *out = 0x0 );
+}
diff --git a/toolboxes/core/gpu/cuNDArray_kernels.cu b/toolboxes/core/gpu/cuNDArray_kernels.cu
new file mode 100644
index 0000000..30330ad
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_kernels.cu
@@ -0,0 +1,179 @@
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include <sstream>
+
+namespace Gadgetron{
+
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int>* in,
+				 cuNDArray<int>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int2>* in,
+				 cuNDArray<int2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int3>* in,
+				 cuNDArray<int3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<int4>* in,
+				 cuNDArray<int4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<unsigned int>* in,
+				 cuNDArray<unsigned int>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint2>* in,
+				 cuNDArray<uint2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint3>* in,
+				 cuNDArray<uint3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint4>* in,
+				 cuNDArray<uint4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float>* in,
+				 cuNDArray<float>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float2>* in,
+				 cuNDArray<float2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float3>* in,
+				 cuNDArray<float3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float4>* in,
+				 cuNDArray<float4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double>* in,
+				 cuNDArray<double>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double2>* in,
+				 cuNDArray<double2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double3>* in,
+				 cuNDArray<double3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double4>* in,
+				 cuNDArray<double4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd1>* in,
+				 cuNDArray<intd1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd2>* in,
+				 cuNDArray<intd2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd3>* in,
+				 cuNDArray<intd3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<intd4>* in,
+				 cuNDArray<intd4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d1>* in,
+				 cuNDArray<uint64d1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d2>* in,
+				 cuNDArray<uint64d2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d3>* in,
+				 cuNDArray<uint64d3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<uint64d4>* in,
+				 cuNDArray<uint64d4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd1>* in,
+				 cuNDArray<floatd1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd2>* in,
+				 cuNDArray<floatd2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd3>* in,
+				 cuNDArray<floatd3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<floatd4>* in,
+				 cuNDArray<floatd4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled1>* in,
+				 cuNDArray<doubled1>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled2>* in,
+				 cuNDArray<doubled2>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled3>* in,
+				 cuNDArray<doubled3>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<doubled4>* in,
+				 cuNDArray<doubled4>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+				   
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<float_complext>* in,
+				 cuNDArray<float_complext>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+
+template EXPORTGPUCORE void cuNDArray_permute<>(cuNDArray<double_complext>* in,
+				 cuNDArray<double_complext>* out,
+				 std::vector<unsigned int> *order,
+				 int shift_mode);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_math.h b/toolboxes/core/gpu/cuNDArray_math.h
new file mode 100644
index 0000000..3141f5f
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_math.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cuNDArray_utils.h"
diff --git a/toolboxes/core/gpu/cuNDArray_operators.cu b/toolboxes/core/gpu/cuNDArray_operators.cu
new file mode 100644
index 0000000..b84cf28
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_operators.cu
@@ -0,0 +1,238 @@
+#include "cuNDArray_operators.h"
+#include "complext.h"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <complex>
+
+namespace Gadgetron{
+
+  // Private utility to verify array dimensions.
+  // It "replaces" NDArray::dimensions_equal() to support batch mode: x is accepted when its element
+  // count is a whole multiple of y's. An empty y is rejected up front, since x % 0 is undefined behaviour.
+  // There is an identical function for all array instances (currently hoNDArray, cuNDArray, hoCuNDArray)
+  // !!! Remember to fix any bugs in all versions !!!
+  template<class T,class S> static bool compatible_dimensions( const cuNDArray<T> &x, const cuNDArray<S> &y )
+  {
+    return (y.get_number_of_elements()!=0) && ((x.get_number_of_elements()%y.get_number_of_elements())==0);
+  }
+
+  template<typename T>
+  class cuNDA_modulus : public thrust::unary_function<T,T> // unary functor: y -> y % mod
+  {
+  public:
+    cuNDA_modulus(int x):mod(x) {}; // x is stored as the divisor; no check that it is non-zero
+    __host__ __device__ T operator()(const T &y) const {return y%mod;} // remainder of y divided by mod
+  private:
+    const int mod; // divisor, fixed at construction
+  };
+
+  // Applies the binary functor F to matching elements of x and y and writes the result back into x.
+  // This transform supports batch mode when the number of elements in x is a multiple of the number of elements in y:
+  // y is then read cyclically, i.e. element i of x is paired with element (i % number of elements in y) of y.
+  template<class T,class S,class F>  
+  void equals_transform(cuNDArray<T> &x, cuNDArray<S> &y){
+    if (x.dimensions_equal(&y)){ // identical dimensions: pair the elements one-to-one
+      thrust::transform(x.begin(), x.end(), y.begin(), x.begin(), F());
+    } else if (compatible_dimensions(x,y))
+      { // NOTE(review): indices below are int; confirm element counts never exceed INT_MAX
+        typedef thrust::transform_iterator<cuNDA_modulus<int>,thrust::counting_iterator<int>, int> transform_it; // counting sequence passed through cuNDA_modulus
+        transform_it indices = thrust::make_transform_iterator(thrust::make_counting_iterator(0),cuNDA_modulus<int>(y.get_number_of_elements())); // 0,1,2,... wrapped at y's element count
+        thrust::permutation_iterator<thrust::device_ptr<S>,transform_it> p = thrust::make_permutation_iterator(y.begin(),indices); // y addressed through the wrapped indices
+        thrust::transform(x.begin(),x.end(),p,x.begin(),F()); // same in-place update, with y repeated across x
+      } else {
+      throw std::runtime_error("The provided cuNDArrays have incompatible dimensions for operator {+=,-=,*=,/=}");
+    }
+  }
+
+  template<typename T>
+  struct cuNDA_plus : public thrust::binary_function<complext<T>, T, complext<T> > // functor: (complext<T> x, T y) -> x+y
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x+y;} // declared __device__ only (no __host__)
+  };
+
+  template<typename T>
+  struct cuNDA_minus : public thrust::binary_function<complext<T>, T, complext<T> > // functor: (complext<T> x, T y) -> x-y
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x-y;} // declared __device__ only (no __host__)
+  };
+
+  template<typename T>
+  struct cuNDA_multiply : public thrust::binary_function<complext<T>, T, complext<T> > // functor: (complext<T> x, T y) -> x*y
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x*y;} // declared __device__ only (no __host__)
+  };
+
+  template<typename T>
+  struct cuNDA_divide : public thrust::binary_function<complext<T>, T, complext<T> > // functor: (complext<T> x, T y) -> x/y
+  {
+    __device__ complext<T> operator()(const complext<T> &x, const T &y) const {return x/y;} // declared __device__ only (no __host__); y is not checked for zero
+  };
+
+  template<class T> cuNDArray<T>& operator+= (cuNDArray<T> &x, cuNDArray<T> &y){ // element-wise x += y (batched over y); returns x
+    equals_transform< T,T,thrust::plus<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator+= (cuNDArray<T> &x , T y){ // adds the scalar y to every element of x; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the second operand for every element
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::plus<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator+= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){ // complext<T> array += T array (batched over y); returns x
+    equals_transform< complext<T>,T,cuNDA_plus<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator+= (cuNDArray<complext<T> > &x , T y){ // adds the T scalar y to every complext<T> element of x; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the second operand for every element
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_plus<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator-= (cuNDArray<T> & x , cuNDArray<T> & y){ // element-wise x -= y (batched over y); returns x
+    equals_transform< T,T,thrust::minus<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator-= (cuNDArray<T> &x , T y){ // subtracts the scalar y from every element of x; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the second operand for every element
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::minus<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator-= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){ // complext<T> array -= T array (batched over y); returns x
+    equals_transform< complext<T>,T,cuNDA_minus<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator-= (cuNDArray<complext<T> > &x , T y){ // subtracts the T scalar y from every complext<T> element of x; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the second operand for every element
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_minus<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator*= (cuNDArray<T> &x , cuNDArray<T> &y){ // element-wise x *= y (batched over y); returns x
+    equals_transform< T,T,thrust::multiplies<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator*= (cuNDArray<T> &x , T y){ // multiplies every element of x by the scalar y; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the second operand for every element
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::multiplies<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator*= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){ // complext<T> array *= T array (batched over y); returns x
+    equals_transform< complext<T>,T,cuNDA_multiply<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator*= (cuNDArray<complext<T> > &x , T y){ // multiplies every complext<T> element of x by the T scalar y; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the second operand for every element
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_multiply<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator/= (cuNDArray<T> &x , cuNDArray<T> &y){ // element-wise x /= y (batched over y); returns x
+    equals_transform< T,T,thrust::divides<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible; divisors are not checked for zero
+    return x;
+  }
+
+  template<class T> cuNDArray<T>& operator/= (cuNDArray<T> &x , T y){ // divides every element of x by the scalar y; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the divisor for every element; y is not checked for zero
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), thrust::divides<T>()); // result is written back into x
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator/= (cuNDArray< complext<T> > &x , cuNDArray<T> &y){ // complext<T> array /= T array (batched over y); returns x
+    equals_transform< complext<T>,T,cuNDA_divide<T> >(x,y); // throws std::runtime_error when the dimensions are incompatible; divisors are not checked for zero
+    return x;
+  }
+
+  template<class T> cuNDArray< complext<T> >& operator/= (cuNDArray<complext<T> > &x , T y){ // divides every complext<T> element of x by the T scalar y; returns x
+    thrust::constant_iterator<T> iter(y); // supplies y as the divisor for every element; y is not checked for zero
+    thrust::transform(x.begin(), x.end(), iter, x.begin(), cuNDA_divide<T>()); // result is written back into x
+    return x;
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE cuNDArray<float>& operator+=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator+=<float>(cuNDArray<float>&, float);
+  template EXPORTGPUCORE cuNDArray<float>& operator-=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator-=<float>(cuNDArray<float>&, float);
+  template EXPORTGPUCORE cuNDArray<float>& operator*=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator*=<float>(cuNDArray<float>&, float);
+  template EXPORTGPUCORE cuNDArray<float>& operator/=<float>(cuNDArray<float>&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray<float>& operator/=<float>(cuNDArray<float>&, float);
+
+  template EXPORTGPUCORE cuNDArray<double>& operator+=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator+=<double>(cuNDArray<double>&, double);
+  template EXPORTGPUCORE cuNDArray<double>& operator-=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator-=<double>(cuNDArray<double>&, double);
+  template EXPORTGPUCORE cuNDArray<double>& operator*=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator*=<double>(cuNDArray<double>&, double);
+  template EXPORTGPUCORE cuNDArray<double>& operator/=<double>(cuNDArray<double>&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray<double>& operator/=<double>(cuNDArray<double>&, double);
+
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=< complext<float> > 
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=< complext<float> > 
+  (cuNDArray< complext<float> >&, complext<float>);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=< complext<float> > 
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=< complext<float> > 
+  (cuNDArray< complext<float> >&, complext<float>);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=< complext<float> >
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=< complext<float> >
+  (cuNDArray< complext<float> >&, complext<float>);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=< complext<float> > 
+  (cuNDArray< complext<float> >&, cuNDArray< complext<float> >&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=< complext<float> > 
+  (cuNDArray< complext<float> >&, complext<float>);
+
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=<float>(cuNDArray< complext<float> >&, cuNDArray<float>&);
+
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator+=<float>(cuNDArray< complext<float> >&, float);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator-=<float>(cuNDArray< complext<float> >&, float);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator*=<float>(cuNDArray< complext<float> >&, float);
+  template EXPORTGPUCORE cuNDArray< complext<float> >& operator/=<float>(cuNDArray< complext<float> >&, float);
+
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=< complext<double> > 
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=< complext<double> > 
+  (cuNDArray< complext<double> >&, complext<double>);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=< complext<double> > 
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=< complext<double> > 
+  (cuNDArray< complext<double> >&, complext<double>);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=< complext<double> >
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=< complext<double> >
+  (cuNDArray< complext<double> >&, complext<double>);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=< complext<double> > 
+  (cuNDArray< complext<double> >&, cuNDArray< complext<double> >&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=< complext<double> > 
+  (cuNDArray< complext<double> >&, complext<double>);
+
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=<double>(cuNDArray< complext<double> >&, cuNDArray<double>&);
+
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator+=<double>(cuNDArray< complext<double> >&, double);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator-=<double>(cuNDArray< complext<double> >&, double);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator*=<double>(cuNDArray< complext<double> >&, double);
+  template EXPORTGPUCORE cuNDArray< complext<double> >& operator/=<double>(cuNDArray< complext<double> >&, double);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_operators.h b/toolboxes/core/gpu/cuNDArray_operators.h
new file mode 100644
index 0000000..356c1a0
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_operators.h
@@ -0,0 +1,167 @@
+/** \file cuNDArray_operators.h
+    \brief Common element-wise arithmetic operators on the cuNDArray class.
+    
+    cuNDArray_operators.h defines element-wise arithmetic array operations on the cuNDArray class.
+    We define the common operators +=, -=, *= and /= for both array-array and array-constant operations.
+    We have deliberately omitted to define operator+, operator- etc. since this would require returning a cuNDArray,
+    in turn invoking an explicit memcpy by the assignment operator.
+    Batch mode functionality is provided.
+    The implementation is based on Thrust.
+    This code is purposely split into a header and underlying implementation (.cu) 
+    as this allows specific instantiation of the supported template types. 
+    Furthermore thrust code can only be compiled by nvcc.
+    The supported types are float, double, Gadgetron::complext<float> and Gadgetron::complext<double>. 
+    Scalars can be applied to complex numbers of corresponding precision.
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron {
+
+  /**
+   * @brief Implementation of element-wise operator+= on two cuNDArrays of the same element type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator+= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator+= on a cuNDArray with a scalar value of the same type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator+= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator+= on a complex cuNDArray with a real cuNDArray of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator+= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator+= on a complex cuNDArray with a real scalar of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator+= (cuNDArray<complext<T> > &x, T y );
+
+  /**
+   * @brief Implementation of element-wise operator-= on two cuNDArrays of the same element type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator-= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator-= on a cuNDArray with a scalar value of the same type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator-= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator-= on a complex cuNDArray with a real cuNDArray of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator-= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator-= on a complex cuNDArray with a real scalar of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator-= (cuNDArray<complext<T> > &x, T y );
+
+  /**
+   * @brief Implementation of element-wise operator*= on two cuNDArrays of the same element type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator*= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator*= on a cuNDArray with a scalar value of the same type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator*= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator*= on a complex cuNDArray with a real cuNDArray of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator*= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator*= on a complex cuNDArray with a real scalar of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator*= (cuNDArray<complext<T> > &x, T y );
+
+  /**
+   * @brief Implementation of element-wise operator/= on two cuNDArrays of the same element type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator/= (cuNDArray<T> &x, cuNDArray<T> &y);
+  
+  /**
+   * @brief Implementation of element-wise operator/= on a cuNDArray with a scalar value of the same type.
+   * @param[in,out] x Input and output array.
+   * @param[in] y Input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray<T>& operator/= (cuNDArray<T> &x, T y );
+    
+  /**
+   * @brief Implementation of element-wise operator/= on a complex cuNDArray with a real cuNDArray of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input array.
+   *
+   * Let y be an n-dimensional array. 
+   * Then the sizes of the first n array dimensions must match between x and y.
+   * If x contains further dimensions the operator is batched across those dimensions.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator/= (cuNDArray<complext<T> > &x, cuNDArray<T> &y);
+
+  /**
+   * @brief Implementation of element-wise operator/= on a complex cuNDArray with a real scalar of the same precision.
+   * @param[in,out] x Complex input and output array.
+   * @param[in] y Real input scalar.
+   */
+  template<class T> EXPORTGPUCORE cuNDArray< complext<T> >& operator/= (cuNDArray<complext<T> > &x, T y );
+}
diff --git a/toolboxes/core/gpu/cuNDArray_reductions.cu b/toolboxes/core/gpu/cuNDArray_reductions.cu
new file mode 100644
index 0000000..50158fa
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_reductions.cu
@@ -0,0 +1,102 @@
+#include "cuNDArray_reductions.h"
+#include "setup_grid.h"
+
+namespace Gadgetron {
+
+  // Appends the sizes of all dimensions of `in` except `dim` to `dims`, and
+  // stores in `stride` the product of the sizes of the dimensions preceding `dim`.
+  template<class T> static void 
+  find_stride( cuNDArray<T> *in, size_t dim, size_t *stride, std::vector<size_t> *dims )
+  {
+    size_t leading_product = 1;
+    for( unsigned int d=0; d<in->get_number_of_dimensions(); d++ ){
+      if( d == dim )
+        continue; // the reduced dimension contributes to neither output
+      dims->push_back(in->get_size(d));
+      if( d < dim )
+        leading_product *= in->get_size(d);
+    }
+    *stride = leading_product;
+  }
+  
+  // Sum kernel
+  //
+  // Each thread with idx < number_of_elements writes one output element: the
+  // sum of number_of_batches input elements spaced `stride` elements apart.
+  template<class T> 
+  __global__ void sum_kernel( T *in, T *out, 
+                              unsigned int stride, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx >= number_of_elements )
+      return;
+
+    // Index of the first input element contributing to out[idx]
+    const unsigned int first = (idx/stride)*stride*number_of_batches+(idx%stride);
+
+    T accumulated = in[first];
+
+    for( unsigned int batch=1; batch<number_of_batches; batch++ ) 
+      accumulated += in[batch*stride+first];
+
+    out[idx] = accumulated; 
+  }
+
+  // Sum
+  //
+  // Sums the array along dimension `dim` and returns the result in a newly
+  // allocated array holding the remaining dimensions of `in`.
+  // Throws std::runtime_error if `in` is a null pointer, if the array has
+  // fewer than two dimensions, or if `dim` is not a valid dimension index.
+  template<class T>  boost::shared_ptr< cuNDArray<T> > sum( cuNDArray<T> *in, unsigned int dim )
+  {
+    // Some validity checks
+    if( in == 0x0 ){
+      throw std::runtime_error("sum: illegal input pointer.");
+    }
+
+    if( !(in->get_number_of_dimensions()>1) ){
+      throw std::runtime_error("sum: underdimensioned.");
+    }
+
+    if( dim > in->get_number_of_dimensions()-1 ){
+      throw std::runtime_error( "sum: dimension out of range.");
+    }
+
+    // The summed dimension provides the "batches"; every other element maps to one output element
+    unsigned int number_of_batches = in->get_size(dim);
+    unsigned int number_of_elements = in->get_number_of_elements()/number_of_batches;
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( number_of_elements, &blockDim, &gridDim );
+
+    // Find element stride and the dimensions of the output array
+    size_t stride; std::vector<size_t> dims;
+    find_stride<T>( in, dim, &stride, &dims );
+
+    // Allocate output and invoke kernel
+    boost::shared_ptr< cuNDArray<T> > out(new cuNDArray<T>());
+    out->create(&dims);
+
+    sum_kernel<T><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), stride, number_of_batches, number_of_elements );
+
+    CHECK_FOR_CUDA_ERROR();
+    return out;
+  }
+
+  // Mean of all array elements: the thrust::reduce sum divided by the element count.
+  template<class T> T mean(cuNDArray<T>* in)
+  {
+    const T total = thrust::reduce(in->begin(),in->end(),T(0),thrust::plus<T>());
+    const T count = T(in->get_number_of_elements());
+    return total/count;
+  }
+
+  // Returns the smallest element of the array (located with thrust::min_element).
+  // Throws std::runtime_error for a null pointer or an empty array: for an empty
+  // range thrust::min_element returns the end iterator, which must not be dereferenced.
+  template<class T> T min(cuNDArray<T>* in)
+  {
+    if( in == 0x0 || in->get_number_of_elements() == 0 ){
+      throw std::runtime_error("min: empty or illegal input array.");
+    }
+    return *thrust::min_element(in->begin(),in->end());
+  }
+
+  // Returns the largest element of the array (located with thrust::max_element).
+  // Throws std::runtime_error for a null pointer or an empty array: for an empty
+  // range thrust::max_element returns the end iterator, which must not be dereferenced.
+  template<class T> T max(cuNDArray<T>* in)
+  {
+    if( in == 0x0 || in->get_number_of_elements() == 0 ){
+      throw std::runtime_error("max: empty or illegal input array.");
+    }
+    return *thrust::max_element(in->begin(),in->end());
+  }
+
+  // Explicit instantiations of sum() for real and complex element types
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > sum<float>( cuNDArray<float>*, unsigned int);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > sum<double>( cuNDArray<double>*, unsigned int);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > sum<float_complext>( cuNDArray<float_complext>*, unsigned int);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > sum<double_complext>( cuNDArray<double_complext>*, unsigned int);  
+
+  // Explicit instantiations of mean() for real and complex element types
+  template EXPORTGPUCORE float mean<float>(cuNDArray<float>*);
+  template EXPORTGPUCORE float_complext mean<float_complext>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE double mean<double>(cuNDArray<double>*);
+  template EXPORTGPUCORE double_complext mean<double_complext>(cuNDArray<double_complext>*);
+
+  // Explicit instantiations of min()/max(); only the real types float and double are instantiated here
+  template EXPORTGPUCORE float min<float>(cuNDArray<float>*);
+  template EXPORTGPUCORE float max<float>(cuNDArray<float>*);
+  template EXPORTGPUCORE double min<double>(cuNDArray<double>*);
+	template EXPORTGPUCORE double max<double>(cuNDArray<double>*);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_reductions.h b/toolboxes/core/gpu/cuNDArray_reductions.h
new file mode 100644
index 0000000..6d9867b
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_reductions.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+  /**
+   * @brief Sum an array along one dimension.
+   * @param[in] data Input array.
+   * @param[in] dim Index of the dimension to sum over.
+   * @return Newly allocated array with the dimensions of the input except dimension dim.
+   */
+  template<class T> EXPORTGPUCORE boost::shared_ptr<cuNDArray<T> > sum(cuNDArray<T> *data, unsigned int dim );
+  
+  /**
+   * @brief Mean value over all elements of an array.
+   * @param[in] data Input array.
+   * @return Sum of all elements divided by the number of elements.
+   */
+  template<class T> EXPORTGPUCORE T mean(cuNDArray<T>* data);
+  
+  /**
+   * @brief Smallest element of an array.
+   * @param[in] data Input array.
+   */
+  template<class T> EXPORTGPUCORE T min(cuNDArray<T>* data);
+
+  /**
+   * @brief Largest element of an array.
+   * @param[in] data Input array.
+   */
+  template<class T> EXPORTGPUCORE T max(cuNDArray<T>* data);
+}
diff --git a/toolboxes/core/gpu/cuNDArray_utils.cu b/toolboxes/core/gpu/cuNDArray_utils.cu
new file mode 100644
index 0000000..735dafb
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_utils.cu
@@ -0,0 +1,936 @@
+#include "cuNDArray_utils.h"
+#include "vector_td_utilities.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+
+#include <math_functions.h>
+#include <cmath>
+
+namespace Gadgetron {
+
+  // Permutation kernel, optionally combined with an fftshift/ifftshift.
+  //
+  // The thread's linear index is decomposed into per-dimension coordinates using
+  // `dims`; the coordinates (shifted according to shift_mode) are recombined with
+  // `strides_out` into the index that is read from `in`. The value is written to
+  // `out` at the thread's linear index.
+  // shift_mode < 0 : ifftshift, shift_mode > 0 : fftshift, shift_mode == 0 : no shift.
+  template <class T> 
+  __global__ void cuNDArray_permute_kernel( T* in, T* out, 
+                                            unsigned int ndim,
+                                            unsigned int* dims,
+                                            unsigned int* strides_out,
+                                            unsigned int elements,
+                                            int shift_mode)
+  {
+    const unsigned int write_idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (write_idx >= elements)
+      return;
+
+    unsigned int read_idx = 0;
+    unsigned int remaining = write_idx;
+
+    for (unsigned int d = 0; d < ndim; d++) {
+      const unsigned int extent = dims[d];
+      const unsigned int quotient = remaining / extent;
+      unsigned int coord = remaining-(quotient*extent); // equals remaining%extent
+
+      if (shift_mode < 0) { //IFFTSHIFT
+        coord = (coord+(extent>>1))%extent;
+      } else if (shift_mode > 0) { //FFTSHIFT
+        coord = (coord+((extent+1)>>1))%extent;
+      }
+
+      read_idx += coord*strides_out[d];
+      remaining = quotient;
+    }
+    out[write_idx] = in[read_idx];
+  }
+
+  // Internal driver for permute(): uploads the permuted dimension sizes and the
+  // matching input strides to the device and launches cuNDArray_permute_kernel.
+  //
+  // in         : input array
+  // out        : output array (must be non-null and already allocated)
+  // order      : complete dimension order, one entry per dimension of `in`
+  // shift_mode : forwarded to the kernel (<0 ifftshift, >0 fftshift, 0 none)
+  //
+  // Throws cuda_error on a null output pointer or on any CUDA failure. The
+  // temporary host tables are owned by std::vector and the device tables are
+  // released before every throw, so nothing leaks on an error path.
+  template <class T> void cuNDArray_permute( cuNDArray<T>* in,
+                                             cuNDArray<T>* out,
+                                             std::vector<size_t> *order,
+                                             int shift_mode)
+  {    
+    if( out == 0x0 ){
+      throw cuda_error("cuNDArray_permute(internal): 0x0 output");
+    }
+
+    cudaError_t err;
+
+    T* in_ptr = in->get_data_ptr();
+    T* out_ptr = out->get_data_ptr();
+
+    // Take one copy of the dimensions instead of querying them in the inner loops
+    const std::vector<size_t> in_dims = *in->get_dimensions();
+    const unsigned int ndim = in->get_number_of_dimensions();
+    const size_t table_bytes = ndim*sizeof(unsigned int);
+
+    // Host side tables: permuted sizes and, for each permuted dimension, its stride in the input
+    std::vector<unsigned int> dims(ndim);
+    std::vector<unsigned int> strides_out(ndim);
+
+    for (unsigned int i = 0; i < ndim; i++) {
+      dims[i] = in_dims[(*order)[i]];
+      strides_out[i] = 1;    
+      for (unsigned int j = 0; j < (*order)[i]; j++) {
+        strides_out[i] *= in_dims[j];
+      }
+    }
+
+    // &v[0] is only valid for non-empty vectors
+    const unsigned int* dims_host        = dims.empty() ? 0 : &dims[0];
+    const unsigned int* strides_out_host = strides_out.empty() ? 0 : &strides_out[0];
+
+    unsigned int* dims_dev        = 0;
+    unsigned int* strides_out_dev = 0;
+
+    if (cudaMalloc((void**) &dims_dev, table_bytes) != cudaSuccess) {
+      throw cuda_error("cuNDArray_permute : Error allocating CUDA dims memory");
+    }
+
+    if (cudaMalloc((void**) &strides_out_dev, table_bytes) != cudaSuccess) {
+      cudaFree(dims_dev);
+      throw cuda_error("cuNDArray_permute : Error allocating CUDA strides_out memory");
+    }
+
+    if (cudaMemcpy(dims_dev, dims_host, table_bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
+      err = cudaGetLastError();
+      cudaFree(dims_dev);
+      cudaFree(strides_out_dev);
+      std::stringstream ss;
+      ss << "cuNDArray_permute : Error uploading dimensions to device, " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+
+    if (cudaMemcpy(strides_out_dev, strides_out_host, table_bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
+      cudaFree(dims_dev);
+      cudaFree(strides_out_dev);
+      throw cuda_error("cuNDArray_permute : Error uploading strides to device");
+    }
+
+    // Grid setup: x covers the two first dimensions, y the remaining ones
+    dim3 blockDim(512,1,1);
+    dim3 gridDim;
+    if( ndim > 2 ){
+      gridDim = dim3((unsigned int) std::ceil((double)in->get_size(0)*in->get_size(1)/blockDim.x), 1, 1 );
+      for( unsigned int d=2; d<ndim; d++ )
+        gridDim.y *= in->get_size(d);
+    }
+    else
+      gridDim = dim3((unsigned int) std::ceil((double)in->get_number_of_elements()/blockDim.x), 1, 1 );
+
+    cuNDArray_permute_kernel<<< gridDim, blockDim >>>( in_ptr, out_ptr, ndim, 
+                                                       dims_dev, strides_out_dev, in->get_number_of_elements(), shift_mode);
+
+    err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      cudaFree(dims_dev);
+      cudaFree(strides_out_dev);
+      std::stringstream ss;
+      ss <<"cuNDArray_permute : Error during kernel call: " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+
+    if (cudaFree(dims_dev) != cudaSuccess) {
+      err = cudaGetLastError();
+      cudaFree(strides_out_dev); // best effort, the first failure is the one reported
+      std::stringstream ss;
+      ss << "cuNDArray_permute: failed to delete device memory (dims_dev) " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+
+    if (cudaFree(strides_out_dev) != cudaSuccess) {
+      err = cudaGetLastError();
+      std::stringstream ss;
+      ss << "cuNDArray_permute: failed to delete device memory (strides_out_dev) "<< cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }    
+  }  
+
+  // Permutes the dimensions of `in` and returns the result in a new array.
+  //
+  // dim_order may list fewer dimensions than the input has. The in-place
+  // overload pads the order with the unlisted dimensions in increasing index
+  // order, so the output is allocated with those dimensions appended too.
+  // Sizing it from dim_order alone would leave it smaller than the number of
+  // elements the permutation writes.
+  // Throws std::runtime_error on null pointers; invalid orders are rejected by
+  // the in-place overload (or by std::vector::at for out-of-range indices).
+  template <class T> boost::shared_ptr< cuNDArray<T> >
+  permute( cuNDArray<T> *in, std::vector<size_t> *dim_order, int shift_mode )
+  {
+    if( in == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");
+    }    
+
+    const std::vector<size_t> in_dims = *in->get_dimensions();
+    std::vector<bool> listed(in_dims.size(), false);
+
+    // Dimensions explicitly requested by the caller
+    std::vector<size_t> dims;
+    for (unsigned int i = 0; i < dim_order->size(); i++) {
+      const size_t d = dim_order->at(i);
+      dims.push_back(in_dims.at(d));
+      listed[d] = true;
+    }
+
+    // Remaining dimensions, in the order used by the padding of the in-place overload
+    for (size_t d = 0; d < in_dims.size(); d++) {
+      if (!listed[d])
+        dims.push_back(in_dims[d]);
+    }
+
+    boost::shared_ptr< cuNDArray<T> > out( new cuNDArray<T>() );    
+    out->create(&dims);
+    permute( in, out.get(), dim_order, shift_mode );
+    return out;
+  }
+
+  // Permutes the dimensions of `in` into the preallocated array `out`.
+  //
+  // dim_order is validated (length, index range, duplicates) and the sizes of
+  // the listed dimensions are compared against `out`. Dimensions that are not
+  // listed are appended in increasing index order before the internal
+  // permutation routine is invoked. Throws std::runtime_error on invalid input.
+  template <class T> void
+  permute( cuNDArray<T> *in, cuNDArray<T> *out, std::vector<size_t> *dim_order, int shift_mode )
+  {
+    if( in == 0x0 || out == 0x0 || dim_order == 0x0 ) {
+      throw std::runtime_error("permute(): invalid pointer provided");
+    }    
+
+    const size_t num_listed = dim_order->size();
+
+    // The order array cannot mention more dimensions than the input has
+    if (num_listed > in->get_number_of_dimensions()) {
+      throw std::runtime_error("permute(): invalid length of dimension ordering array");
+    }
+
+    // Count how often each input dimension is mentioned
+    std::vector<size_t> occurrences(in->get_number_of_dimensions(),0);
+    for (size_t i = 0; i < num_listed; i++) {
+      const size_t d = (*dim_order)[i];
+      if (d >= in->get_number_of_dimensions()) {
+        throw std::runtime_error("permute(): invalid dimension order array");
+      }
+      occurrences[d]++;
+    }
+
+    // Every mentioned dimension must occur exactly once
+    for (size_t i = 0; i < num_listed; i++) {
+      if (occurrences[(*dim_order)[i]] != 1) {
+        throw std::runtime_error("permute(): invalid dimension order array (duplicates)");
+      }
+    }
+
+    // Internal (complete) order, starting out as a copy of the caller's order
+    std::vector<size_t> full_order(dim_order->begin(), dim_order->end());
+
+    for (size_t i = 0; i < full_order.size(); i++) {
+      if ((*in->get_dimensions())[full_order[i]] != out->get_size(i)) {
+        throw std::runtime_error("permute(): dimensions of output array do not match the input array");
+      }
+    }
+
+    // Append the dimensions the caller did not mention
+    if (full_order.size() < in->get_number_of_dimensions()) {
+      for (size_t d = 0; d < occurrences.size(); d++) {
+        if (occurrences[d] == 0) {
+          full_order.push_back(d);
+        }
+      }
+    }    
+    cuNDArray_permute(in, out, &full_order, shift_mode);
+  }
+
+  // Cyclically shifts the dimension order of `in` by `shift` and returns the
+  // permuted array: output dimension i is input dimension (i+shift) modulo the
+  // number of dimensions. Negative shifts are supported.
+  // Throws std::runtime_error on a null input pointer.
+  template<class T> boost::shared_ptr< cuNDArray<T> >
+  shift_dim( cuNDArray<T> *in, int shift )
+  {
+    if( in == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid input pointer provided");
+    }    
+
+    // Do the modulo in signed arithmetic. Mixing a negative (i+shift) with the
+    // unsigned dimension count would wrap around before the modulo is taken
+    // and yield a wrong dimension index.
+    const int ndims = static_cast<int>(in->get_number_of_dimensions());
+
+    std::vector<size_t> order;
+    for (int i = 0; i < ndims; i++) {
+      const int wrapped = (((i+shift)%ndims)+ndims)%ndims;
+      order.push_back(static_cast<size_t>(wrapped));
+    }
+    return permute(in, &order);
+  }
+
+  // Cyclically shifts the dimension order of `in` by `shift` and writes the
+  // permuted array to `out`: output dimension i is input dimension (i+shift)
+  // modulo the number of dimensions. Negative shifts are supported.
+  // Throws std::runtime_error on null pointers.
+  template<class T> 
+  void shift_dim( cuNDArray<T> *in, cuNDArray<T> *out, int shift )
+  {
+    if( in == 0x0 || out == 0x0 ) {
+      throw std::runtime_error("shift_dim(): invalid pointer provided");
+    }    
+
+    // Do the modulo in signed arithmetic. Mixing a negative (i+shift) with the
+    // unsigned dimension count would wrap around before the modulo is taken
+    // and yield a wrong dimension index.
+    const int ndims = static_cast<int>(in->get_number_of_dimensions());
+
+    std::vector<size_t> order;
+    for (int i = 0; i < ndims; i++) {
+      const int wrapped = (((i+shift)%ndims)+ndims)%ndims;
+      order.push_back(static_cast<size_t>(wrapped));
+    }
+    permute(in,out,&order);
+  }
+
+  // Expand kernel
+  //
+  // Fills the output by repeating the input: output element idx is a copy of
+  // input element idx modulo number_of_elements_in. new_dim_size is not used
+  // by the kernel body.
+  template<class T> 
+  __global__ void expand_kernel( 
+                                T *in, T *out, 
+                                unsigned int number_of_elements_in, unsigned int number_of_elements_out, unsigned int new_dim_size )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;    
+    if( idx >= number_of_elements_out )
+      return;
+
+    const unsigned int source = idx%number_of_elements_in;
+    out[idx] = in[source];
+  }
+
+  // Expand
+  //
+  // Returns a new array with one extra trailing dimension of size new_dim_size;
+  // the input data is repeated along that new dimension.
+  // Throws std::runtime_error on a null input pointer.
+  template<class T> boost::shared_ptr< cuNDArray<T> > 
+  expand( cuNDArray<T> *in, size_t new_dim_size )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("expand: illegal input pointer");
+    }
+
+    unsigned int number_of_elements_out = in->get_number_of_elements()*new_dim_size;
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( number_of_elements_out, &blockDim, &gridDim );
+
+    // Output dimensions: the input dimensions followed by the new dimension
+    std::vector<size_t> dims = *in->get_dimensions();
+    dims.push_back(new_dim_size);
+
+    // Allocate output and invoke kernel
+    boost::shared_ptr< cuNDArray<T> > out( new cuNDArray<T>());
+    out->create(&dims);
+
+    expand_kernel<T><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), 
+                                               in->get_number_of_elements(), number_of_elements_out, new_dim_size );
+
+    CHECK_FOR_CUDA_ERROR();    
+    return out;
+  }
+
+  // Crop kernel
+  //
+  // Each thread copies one output element. The element's coordinate within its
+  // frame is translated by `offset` to find the corresponding input element.
+  template<class T, unsigned int D> __global__ void crop_kernel
+  ( vector_td<unsigned int,D> offset, vector_td<unsigned int,D> matrix_size_in, vector_td<unsigned int,D> matrix_size_out,
+    T *in, T *out, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx >= num_elements*num_batches )
+      return;
+
+    // Frame (batch) this element belongs to and its index within that frame
+    const unsigned int frame = idx/num_elements;
+    const unsigned int idx_in_frame = idx-frame*num_elements;
+
+    const typename uintd<D>::Type co_out = idx_to_co<D>( idx_in_frame, matrix_size_out );
+    const typename uintd<D>::Type co_in = offset + co_out;
+    const unsigned int source = co_to_idx<D>(co_in, matrix_size_in)+frame*prod(matrix_size_in);
+
+    out[idx] = in[source];
+  }
+
+  // Crop
+  //
+  // Copies the region of `in` starting at `offset` (in the first D dimensions)
+  // and having the size of `out` into `out`. Dimensions beyond D are treated
+  // as batches. Throws std::runtime_error on invalid input.
+  template<class T, unsigned int D>
+  void crop( typename uint64d<D>::Type offset, cuNDArray<T> *in, cuNDArray<T> *out )
+  {
+    if( in == 0x0 || out == 0x0 ){
+      throw std::runtime_error("crop: 0x0 ndarray provided");
+    }
+
+    if( in->get_number_of_dimensions() != out->get_number_of_dimensions() ){
+      throw std::runtime_error("crop: image dimensions mismatch");
+    }
+
+    if( in->get_number_of_dimensions() < D ){
+      std::stringstream msg;
+      msg << "crop: number of image dimensions should be at least " << D;
+      throw std::runtime_error(msg.str());
+    }
+
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    // All dimensions beyond the first D are processed as batches
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      number_of_batches *= in->get_size(d);
+    }
+
+    // The cropped region must lie within the input
+    if( weak_greater(offset+matrix_size_out, matrix_size_in) ){
+      throw std::runtime_error( "crop: cropping size mismatch");
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    crop_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(offset), vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        in->get_data_ptr(), out->get_data_ptr(), number_of_batches, prod(matrix_size_out) );
+    
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  // Crop (allocating version)
+  //
+  // Allocates an array of the given `size` in the first D dimensions (keeping
+  // the remaining dimensions of `in`) and fills it by cropping `in` at `offset`.
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> > 
+  crop( typename uint64d<D>::Type offset, typename uint64d<D>::Type size, cuNDArray<T> *in )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("crop: 0x0 array provided");
+    }
+
+    // Result dimensions: requested size followed by the batch dimensions of the input
+    std::vector<size_t> result_dims = to_std_vector(size);
+    const unsigned int rank = in->get_number_of_dimensions();
+    for( unsigned int d=D; d<rank; d++ ){
+      result_dims.push_back(in->get_size(d));
+    }
+
+    boost::shared_ptr< cuNDArray<T> > cropped( new cuNDArray<T>(&result_dims) );
+    crop<T,D>(offset, in, cropped.get());
+    return cropped;
+  }  
+
+  // Pad kernel (expand and fill)
+  //
+  // Each thread writes one output element. Output elements that fall inside the
+  // region occupied by the input, placed at offset (matrix_size_out-matrix_size_in)>>1,
+  // are copied from `in`; all other elements are set to `val`.
+  template<class T, unsigned int D> 
+  __global__ void pad_kernel( vector_td<unsigned int,D> matrix_size_in, vector_td<unsigned int,D> matrix_size_out,
+                              T *in, T *out, unsigned int number_of_batches, unsigned int num_elements, T val )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx >= num_elements*number_of_batches )
+      return;
+
+    const unsigned int frame = idx/num_elements;
+
+    const typename uintd<D>::Type co_out = idx_to_co<D>( idx-frame*num_elements, matrix_size_out );
+    const typename uintd<D>::Type offset = (matrix_size_out-matrix_size_in)>>1;
+    const bool inside = (co_out>=offset) && (co_out<(matrix_size_in+offset));
+
+    T result;
+    if( inside ){
+      result = in[co_to_idx<D>(co_out-offset, matrix_size_in)+frame*prod(matrix_size_in)];
+    }
+    else{
+      result = val;
+    }
+
+    out[idx] = result;
+  }
+
+  // Pad
+  //
+  // Copies `in` into the larger array `out` (in the first D dimensions) and
+  // sets the elements of `out` outside the copied region to `val`. Dimensions
+  // beyond D are treated as batches. Throws std::runtime_error on invalid input.
+  template<class T, unsigned int D> 
+  void pad( cuNDArray<T> *in, cuNDArray<T> *out, T val )
+  { 
+    if( in == 0x0 || out == 0x0 ){
+      throw std::runtime_error("pad: 0x0 ndarray provided");
+    }
+
+    if( in->get_number_of_dimensions() != out->get_number_of_dimensions() ){
+      throw std::runtime_error("pad: image dimensions mismatch");
+    }
+
+    if( in->get_number_of_dimensions() < D ){
+      std::stringstream msg;
+      msg << "pad: number of image dimensions should be at least " << D;
+      throw std::runtime_error(msg.str());
+    }
+
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    // All dimensions beyond the first D are processed as batches
+    unsigned int batches = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      batches *= in->get_size(d);
+    }
+
+    // The output must be able to hold the input
+    if( weak_greater(matrix_size_in,matrix_size_out) ){
+      throw std::runtime_error("pad: size mismatch, cannot expand");
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, batches );
+
+    // Invoke kernel
+    pad_kernel<T,D><<< gridDim, blockDim >>> 
+      ( vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        in->get_data_ptr(), out->get_data_ptr(), batches, prod(matrix_size_out), val );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  // Pad (allocating version)
+  //
+  // Allocates an array of the given `size` in the first D dimensions (keeping
+  // the remaining dimensions of `in`) and fills it by padding `in` with `val`.
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> >
+  pad( typename uint64d<D>::Type size, cuNDArray<T> *in, T val )
+  {
+    if( in == 0x0 ){
+      throw std::runtime_error("pad: 0x0 array provided");
+    }
+
+    // Result dimensions: requested size followed by the batch dimensions of the input
+    std::vector<size_t> result_dims = to_std_vector(size);
+    const unsigned int rank = in->get_number_of_dimensions();
+    for( unsigned int d=D; d<rank; d++ ){
+      result_dims.push_back(in->get_size(d));
+    }
+
+    boost::shared_ptr< cuNDArray<T> > padded( new cuNDArray<T>(&result_dims) );
+    pad<T,D>(in, padded.get(), val);
+    return padded;
+  }
+
+  // Fill border kernel (rectangular)
+  //
+  // Each thread handles one coordinate of the image. If the coordinate lies
+  // outside the region of size matrix_size_in placed at offset
+  // (matrix_size_out-matrix_size_in)>>1, the element is set to `val` in every batch.
+  template<class T, unsigned int D> 
+  __global__ void fill_border_kernel( vector_td<unsigned int,D> matrix_size_in, vector_td<unsigned int,D> matrix_size_out,
+                                      T *image, unsigned int number_of_batches, unsigned int number_of_elements, T val )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( idx >= number_of_elements )
+      return;
+
+    const vector_td<unsigned int,D> co_out = idx_to_co<D>( idx, matrix_size_out );
+    const vector_td<unsigned int,D> offset = (matrix_size_out-matrix_size_in)>>1;
+
+    const bool on_border = weak_less( co_out, offset ) || weak_greater_equal( co_out, matrix_size_in+offset );
+    if( !on_border )
+      return; // elements inside the region are left untouched
+
+    for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+      image[idx+batch*number_of_elements] = val;
+    }
+  }
+
+  // Fill border (rectangular)
+  //
+  // Sets the elements of in_out that lie outside a region of size
+  // matrix_size_in (in the first D dimensions) to `val`. Dimensions beyond D
+  // are treated as batches. Throws std::runtime_error if matrix_size_in does
+  // not fit within the array.
+  template<class T, unsigned int D> 
+  void fill_border( typename uint64d<D>::Type matrix_size_in, cuNDArray<T> *in_out, T val )
+  { 
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *in_out->get_dimensions() );
+
+    if( weak_greater(matrix_size_in, matrix_size_out) ){
+      throw std::runtime_error("fill_border: size mismatch, cannot zero fill");
+    }
+
+    // All dimensions beyond the first D are processed as batches
+    unsigned int batches = 1;
+    for( unsigned int d=D; d<in_out->get_number_of_dimensions(); d++ ){
+      batches *= in_out->get_size(d);
+    }
+
+    // Setup block/grid dimensions (one thread per coordinate, batches are looped in the kernel)
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim );
+
+    // Invoke kernel
+    fill_border_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        in_out->get_data_ptr(), batches, prod(matrix_size_out), val );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+
+  // Fill border kernel (radial)
+  //
+  // Each thread handles one coordinate of the image. If the norm of the
+  // coordinate relative to the image centre (matrix_size>>1) exceeds `radius`,
+  // the element is set to `val` in every batch.
+  template<class T, unsigned int D>
+  __global__ void fill_border_kernel( typename realType<T>::Type radius, vector_td<int,D> matrix_size,
+                                      T *image, unsigned int number_of_batches, unsigned int number_of_elements, T val )
+  {
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    if( !(idx < number_of_elements) )
+      return;
+
+    const vector_td<typename realType<T>::Type,D> from_centre( (matrix_size>>1) - idx_to_co<D>( idx, matrix_size ));
+
+    if( !(norm(from_centre) > radius) )
+      return; // elements within the radius are left untouched
+
+    for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+      image[idx+batch*number_of_elements] = val;
+    }
+  }
+
+  // Fill border (radial)
+  //
+  // Sets the elements of in_out whose distance to the image centre (in the
+  // first D dimensions) exceeds `radius` to `val`. Dimensions beyond D are
+  // treated as batches.
+  template<class T, unsigned int D>
+  void fill_border( typename realType<T>::Type radius, cuNDArray<T> *in_out, T val )
+  {
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *in_out->get_dimensions() );
+
+    // All dimensions beyond the first D are processed as batches
+    unsigned int batches = 1;
+    for( unsigned int d=D; d<in_out->get_number_of_dimensions(); d++ ){
+      batches *= in_out->get_size(d);
+    }
+
+    // Setup block/grid dimensions (one thread per coordinate, batches are looped in the kernel)
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim );
+
+    // Invoke kernel
+    fill_border_kernel<T,D><<< gridDim, blockDim >>>
+      (radius, vector_td<int,D>(matrix_size_out),
+        in_out->get_data_ptr(), batches, prod(matrix_size_out), val );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+  // Upsampling kernel: each thread with idx < num_elements_out*num_batches
+  // computes one element of image_out as the average of up to 2^D elements
+  // of image_in.
+  template<class T, unsigned int D> __global__ void 
+  upsample_kernel( typename uintd<D>::Type matrix_size_in,
+                   typename uintd<D>::Type matrix_size_out,
+                   unsigned int num_batches,
+                   T *image_in,
+                   T *image_out )
+  {
+    typedef typename realType<T>::Type REAL;
+    
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int num_elements_out = prod(matrix_size_out);
+    
+    if( idx < num_elements_out*num_batches ){
+      
+      // Batch index of this element and start of that batch in the input
+      const unsigned int batch = idx/num_elements_out;
+      const unsigned int batch_offset_in = batch*prod(matrix_size_in);
+      
+      // Output coordinate, the input coordinate it maps to (halved), and the
+      // per-dimension remainder (0 or 1) of the output coordinate
+      const typename uintd<D>::Type co_out = idx_to_co<D>( idx-batch*num_elements_out, matrix_size_out );
+      const typename uintd<D>::Type co_in = co_out >> 1;
+      const typename uintd<D>::Type ones(1);
+      const typename uintd<D>::Type twos(2);
+      const typename uintd<D>::Type offset = co_out%twos;
+      
+      // Number of candidate input cells: one per corner of a 2^D neighbourhood
+      const unsigned int num_cells = 1 << D;
+      
+      T cellsum(0);
+      unsigned int count = 0;
+      
+      for( unsigned int i=0; i<num_cells; i++ ){
+        
+        // Coordinate of the i'th cell on a grid of size 2 in every dimension
+        const typename uintd<D>::Type stride = idx_to_co<D>( i, twos );
+        
+        // NOTE(review): presumably the vector comparison holds only when it holds
+        // in every dimension, so a neighbour is included only along dimensions
+        // with an odd output coordinate - confirm against vector_td's operator>=.
+        if( offset >= stride ){
+          // amin with matrix_size_in-ones limits the neighbour coordinate to the last valid input index
+          cellsum += image_in[batch_offset_in+co_to_idx(amin(co_in+stride, matrix_size_in-ones), matrix_size_in)];
+          count++;
+        }
+      }
+
+      // Average over the cells that contributed
+      image_out[idx] = cellsum / REAL(count);
+    }
+  }
+
+  //
+  // Linear upsampling by a factor of two (on a D-dimensional grid) 
+  // Note that this operator is the transpose of the downsampling operator below by design
+  // - based on Briggs et al, A Multigrid Tutorial 2nd edition, pp. 34-35
+  // 
+  
+  /**
+   * Allocating overload: creates an output array whose first D dimensions are
+   * twice those of `in` (remaining dimensions are copied unchanged) and fills
+   * it through the in/out overload of upsample.
+   *
+   * @param in  input array
+   * @return    newly allocated upsampled array
+   * @throws std::runtime_error if `in` is null or has fewer than D dimensions
+   */
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> > upsample( cuNDArray<T>* in )
+  {
+    if( in == 0x0 )
+      throw std::runtime_error("upsample: illegal input pointer");
+
+    std::vector<size_t> dims_out = *in->get_dimensions();
+
+    // The loop below indexes the first D entries; make sure they exist
+    if( dims_out.size() < D )
+      throw std::runtime_error("upsample: the number of array dimensions should be at least D");
+
+    for( unsigned int i=0; i<D; i++ ) dims_out[i] <<= 1;
+    boost::shared_ptr< cuNDArray<T> > out(new cuNDArray<T>(&dims_out));
+    upsample<T,D>( in, out.get() );
+    return out;
+  }
+
+  /**
+   * Upsample `in` by a factor of two along its first D dimensions and write
+   * the result to `out`. Dimensions beyond D are treated as batches.
+   *
+   * @param in   input array
+   * @param out  output array; its first D dimensions must be exactly twice those of `in`
+   * @throws std::runtime_error if a pointer is null, if the sizes do not
+   *         correspond to upsampling by two, or if `in` holds fewer batches
+   *         than `out` (the kernel would then read past the end of `in`)
+   */
+  template<class T, unsigned int D> void upsample( cuNDArray<T> *in, cuNDArray<T> *out )
+  {
+    if( in == 0x0 || out == 0x0 )
+      throw std::runtime_error("upsample: illegal input pointer");
+
+    typename uint64d<D>::Type matrix_size_in  = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    if( (matrix_size_in<<1) != matrix_size_out ){
+      throw std::runtime_error("upsample: arrays do not correspond to upsampling by a factor of two");
+    }
+
+    // The number of batches to process is dictated by the output array
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<out->get_number_of_dimensions(); d++ ){
+      number_of_batches *= out->get_size(d);
+    }
+
+    // The kernel reads batch b of the input for every batch b of the output,
+    // so the input has to provide at least as many batches
+    unsigned int number_of_batches_in = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      number_of_batches_in *= in->get_size(d);
+    }
+
+    if( number_of_batches_in < number_of_batches ){
+      throw std::runtime_error("upsample: input array has fewer batches than the output array");
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    upsample_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<unsigned int,D>(matrix_size_in), vector_td<unsigned int,D>(matrix_size_out),
+        number_of_batches, in->get_data_ptr(), out->get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  //
+  // Linear downsampling by a factor of two (on a D-dimensional grid)
+  // Note that this operator is the transpose of the upsampling operator above by design
+  // - based on Briggs et al, A Multigrid Tutorial 2nd edition, pp. 36.
+  // 
+
+  /**
+   * Kernel for downsampling by a factor of two. Each thread produces one
+   * output element (the flat thread index spans all batches) as a weighted
+   * sum over the 3^D input neighbours of co_out<<1. A neighbour whose offset
+   * has k non-zero components is weighted 2^(D-k); the sum is normalised
+   * by 4^D. Neighbour coordinates are clamped to the input image.
+   */
+  template<class T, unsigned int D> __global__ void 
+  downsample_kernel( typename intd<D>::Type matrix_size_in,
+                     typename intd<D>::Type matrix_size_out,
+                     int num_batches,
+                     T *image_in,
+                     T *image_out )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const int num_elements_out = prod(matrix_size_out);
+
+    if( idx < num_elements_out*num_batches ){
+
+      const int batch = idx/num_elements_out;
+      const int batch_offset_in = batch*prod(matrix_size_in);
+
+      const typename intd<D>::Type co_out = idx_to_co<D>( idx-batch*num_elements_out, matrix_size_out );
+      const typename intd<D>::Type co_in = co_out << 1;
+
+      // Loop-invariant helper vectors (built once instead of once per neighbour)
+      const typename intd<D>::Type zeros(0);
+      const typename intd<D>::Type ones(1);
+      const typename intd<D>::Type threes(3);
+      const typename intd<D>::Type last_in = matrix_size_in-ones; // largest valid input coordinate
+
+      // cellsum[k] accumulates the neighbours with k non-zero offset components
+      T cellsum[D+1];
+      for( unsigned int d=0; d<D+1; d++ ){
+        cellsum[d] = T(0);
+      }
+
+      // 3^D neighbours; no pow for integers on device
+      int num_cells = 1;
+      for( unsigned int d=0; d<D; d++ ) num_cells *= 3;
+
+      // 4^D == 2^(2D), computed exactly with an integer shift instead of a
+      // floating point pow() whose device implementation is not guaranteed exact
+      const REAL denominator = REAL(1 << (2*D));
+
+      for( int i=0; i<num_cells; i++ ){
+
+        const typename intd<D>::Type stride = idx_to_co<D>(i,threes)-ones; // in the range [-1;1]^D
+
+        int distance = 0;
+        for( int d=0; d<static_cast<int>(D); d++ ){
+          if( stride[d] != 0 )
+            distance++;
+        }
+
+        cellsum[distance] += image_in[batch_offset_in+co_to_idx(amax(zeros, amin(last_in,co_in+stride)), matrix_size_in)];
+      }
+
+      T res = T(0);
+
+      for( unsigned int d=0; d<D+1; d++ ){
+        res += (REAL(1<<(D-d))*cellsum[d]);
+      }
+
+      image_out[idx] = res / denominator;
+    }
+  }
+
+  /**
+   * Allocating overload: creates an output array whose first D dimensions are
+   * half those of `in` (size>>1; remaining dimensions are copied unchanged)
+   * and fills it through the in/out overload of downsample.
+   *
+   * @param in  input array
+   * @return    newly allocated downsampled array
+   * @throws std::runtime_error if `in` is null or has fewer than D dimensions
+   */
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> > downsample( cuNDArray<T>* in )
+  {
+    if( in == 0x0 )
+      throw std::runtime_error("downsample: illegal input pointer");
+
+    std::vector<size_t> dims_out = *in->get_dimensions();
+
+    // The loop below indexes the first D entries; make sure they exist
+    if( dims_out.size() < D )
+      throw std::runtime_error("downsample: the number of array dimensions should be at least D");
+
+    for( unsigned int i=0; i<D; i++ ) dims_out[i] >>= 1;
+    boost::shared_ptr< cuNDArray<T> > out(new cuNDArray<T>(&dims_out));
+    downsample<T,D>( in, out.get() );
+    return out;
+  }
+
+  /**
+   * Downsample `in` by a factor of two along its first D dimensions and write
+   * the result to `out`. Dimensions beyond D are treated as batches.
+   *
+   * @param in   input array
+   * @param out  output array; its first D dimensions must equal those of `in` shifted right by one
+   * @throws std::runtime_error if a pointer is null, if the sizes do not
+   *         correspond to downsampling by two, or if `in` holds fewer batches
+   *         than `out` (the kernel would then read past the end of `in`)
+   */
+  template<class T, unsigned int D> void downsample( cuNDArray<T> *in, cuNDArray<T> *out )
+  {
+    if( in == 0x0 || out == 0x0 )
+      throw std::runtime_error("downsample: illegal input pointer");
+
+    typename uint64d<D>::Type matrix_size_in  = from_std_vector<size_t,D>( *in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = from_std_vector<size_t,D>( *out->get_dimensions() );
+
+    if( (matrix_size_in>>1) != matrix_size_out ){
+      throw std::runtime_error("downsample: arrays do not correspond to downsampling by a factor of two");
+    }
+
+    // The number of batches to process is dictated by the output array
+    unsigned int number_of_batches = 1;
+    for( unsigned int d=D; d<out->get_number_of_dimensions(); d++ ){
+      number_of_batches *= out->get_size(d);
+    }
+
+    // The kernel reads batch b of the input for every batch b of the output,
+    // so the input has to provide at least as many batches
+    unsigned int number_of_batches_in = 1;
+    for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ ){
+      number_of_batches_in *= in->get_size(d);
+    }
+
+    if( number_of_batches_in < number_of_batches ){
+      throw std::runtime_error("downsample: input array has fewer batches than the output array");
+    }
+
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( prod(matrix_size_out), &blockDim, &gridDim, number_of_batches );
+
+    // Invoke kernel
+    downsample_kernel<T,D><<< gridDim, blockDim >>>
+      ( vector_td<int,D>(matrix_size_in), vector_td<int,D>(matrix_size_out),
+        (int)number_of_batches, in->get_data_ptr(), out->get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > permute( cuNDArray<float>*, std::vector<size_t>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > permute( cuNDArray<double>*, std::vector<size_t>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > permute( cuNDArray<float_complext>*, std::vector<size_t>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > permute( cuNDArray<double_complext>*, std::vector<size_t>*, int );  
+
+  template EXPORTGPUCORE void permute( cuNDArray<float>*, cuNDArray<float>*, std::vector<size_t>*, int);
+  template EXPORTGPUCORE void permute( cuNDArray<double>*, cuNDArray<double>*, std::vector<size_t>*, int);
+  template EXPORTGPUCORE void permute( cuNDArray<float_complext>*, cuNDArray<float_complext>*, std::vector<size_t>*, int);
+  template EXPORTGPUCORE void permute( cuNDArray<double_complext>*, cuNDArray<double_complext>*, std::vector<size_t>*, int);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > shift_dim( cuNDArray<float>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > shift_dim( cuNDArray<double>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > shift_dim( cuNDArray<float_complext>*, int );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > shift_dim( cuNDArray<double_complext>*, int );
+
+  template EXPORTGPUCORE void shift_dim( cuNDArray<float>*, cuNDArray<float>*, int shift );
+  template EXPORTGPUCORE void shift_dim( cuNDArray<double>*, cuNDArray<double>*, int shift );
+  template EXPORTGPUCORE void shift_dim( cuNDArray<float_complext>*, cuNDArray<float_complext>*, int shift );
+  template EXPORTGPUCORE void shift_dim( cuNDArray<double_complext>*, cuNDArray<double_complext>*, int shift );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > expand<float>( cuNDArray<float>*, size_t);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > expand<double>( cuNDArray<double>*, size_t);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > expand<float_complext>( cuNDArray<float_complext>*, size_t);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > expand<double_complext>( cuNDArray<double_complext>*, size_t);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > crop<float,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<float>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > crop<float_complext,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE void crop<float,1>( uint64d1, cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void crop<float,2>( uint64d2, cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void crop<float,3>( uint64d3, cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void crop<float,4>( uint64d4, cuNDArray<float>*, cuNDArray<float>*);
+
+  template EXPORTGPUCORE void crop<complext<float>,1>( uint64d1, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUCORE void crop<complext<float>,2>( uint64d2, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUCORE void crop<complext<float>,3>( uint64d3, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUCORE void crop<complext<float>,4>( uint64d4, cuNDArray<complext<float> >*, cuNDArray< complext<float> >*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,1>( typename uint64d<1>::Type, cuNDArray<float>*, float );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,2>( typename uint64d<2>::Type, cuNDArray<float>*, float );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,3>( typename uint64d<3>::Type, cuNDArray<float>*, float );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > pad<float,4>( typename uint64d<4>::Type, cuNDArray<float>*, float );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,1>( typename uint64d<1>::Type, cuNDArray<float_complext>*, float_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,2>( typename uint64d<2>::Type, cuNDArray<float_complext>*, float_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,3>( typename uint64d<3>::Type, cuNDArray<float_complext>*, float_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > pad<float_complext,4>( typename uint64d<4>::Type, cuNDArray<float_complext>*, float_complext );
+
+  template EXPORTGPUCORE void pad<float,1>( cuNDArray<float>*, cuNDArray<float>*, float);
+  template EXPORTGPUCORE void pad<float,2>( cuNDArray<float>*, cuNDArray<float>*, float);
+  template EXPORTGPUCORE void pad<float,3>( cuNDArray<float>*, cuNDArray<float>*, float);
+  template EXPORTGPUCORE void pad<float,4>( cuNDArray<float>*, cuNDArray<float>*, float);
+
+  template EXPORTGPUCORE void pad<float_complext,1>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);
+  template EXPORTGPUCORE void pad<float_complext,2>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);  
+  template EXPORTGPUCORE void pad<float_complext,3>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);
+  template EXPORTGPUCORE void pad<float_complext,4>( cuNDArray<float_complext>*, cuNDArray<float_complext>*, float_complext);
+
+  template EXPORTGPUCORE void fill_border<float,1>(uint64d1, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,2>(uint64d2, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,3>(uint64d3, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,4>(uint64d4, cuNDArray<float>*,float);
+  template EXPORTGPUCORE void fill_border<float,1>(float, cuNDArray<float>*,float);
+	template EXPORTGPUCORE void fill_border<float,2>(float, cuNDArray<float>*,float);
+	template EXPORTGPUCORE void fill_border<float,3>(float, cuNDArray<float>*,float);
+	template EXPORTGPUCORE void fill_border<float,4>(float, cuNDArray<float>*,float);
+
+  template EXPORTGPUCORE void fill_border<float_complext,1>(uint64d1, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,2>(uint64d2, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,3>(uint64d3, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,4>(uint64d4, cuNDArray<float_complext>*,float_complext);
+  template EXPORTGPUCORE void fill_border<float_complext,1>(float, cuNDArray<float_complext>*,float_complext);
+	template EXPORTGPUCORE void fill_border<float_complext,2>(float, cuNDArray<float_complext>*,float_complext);
+	template EXPORTGPUCORE void fill_border<float_complext,3>(float, cuNDArray<float_complext>*,float_complext);
+	template EXPORTGPUCORE void fill_border<float_complext,4>(float, cuNDArray<float_complext>*,float_complext);
+
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > crop<double,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<double>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,1>( typename uint64d<1>::Type, typename uint64d<1>::Type, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,2>( typename uint64d<2>::Type, typename uint64d<2>::Type, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,3>( typename uint64d<3>::Type, typename uint64d<3>::Type, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > crop<double_complext,4>( typename uint64d<4>::Type, typename uint64d<4>::Type, cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE void crop<double,1>( uint64d1, cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void crop<double,2>( uint64d2, cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void crop<double,3>( uint64d3, cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void crop<double,4>( uint64d4, cuNDArray<double>*, cuNDArray<double>*);
+
+  template EXPORTGPUCORE void crop<complext<double>,1>( uint64d1, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUCORE void crop<complext<double>,2>( uint64d2, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUCORE void crop<complext<double>,3>( uint64d3, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUCORE void crop<complext<double>,4>( uint64d4, cuNDArray<complext<double> >*, cuNDArray< complext<double> >*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,1>( typename uint64d<1>::Type, cuNDArray<double>*, double );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,2>( typename uint64d<2>::Type, cuNDArray<double>*, double );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,3>( typename uint64d<3>::Type, cuNDArray<double>*, double );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > pad<double,4>( typename uint64d<4>::Type, cuNDArray<double>*, double );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,1>( typename uint64d<1>::Type, cuNDArray<double_complext>*, double_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,2>( typename uint64d<2>::Type, cuNDArray<double_complext>*, double_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,3>( typename uint64d<3>::Type, cuNDArray<double_complext>*, double_complext );
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > pad<double_complext,4>( typename uint64d<4>::Type, cuNDArray<double_complext>*, double_complext );
+
+  template EXPORTGPUCORE void pad<double,1>( cuNDArray<double>*, cuNDArray<double>*, double);
+  template EXPORTGPUCORE void pad<double,2>( cuNDArray<double>*, cuNDArray<double>*, double);
+  template EXPORTGPUCORE void pad<double,3>( cuNDArray<double>*, cuNDArray<double>*, double);
+  template EXPORTGPUCORE void pad<double,4>( cuNDArray<double>*, cuNDArray<double>*, double);
+
+  template EXPORTGPUCORE void pad<double_complext,1>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);
+  template EXPORTGPUCORE void pad<double_complext,2>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);  
+  template EXPORTGPUCORE void pad<double_complext,3>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);
+  template EXPORTGPUCORE void pad<double_complext,4>( cuNDArray<double_complext>*, cuNDArray<double_complext>*, double_complext);
+
+  template EXPORTGPUCORE void fill_border<double,1>(uint64d1, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,2>(uint64d2, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,3>(uint64d3, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,4>(uint64d4, cuNDArray<double>*,double);
+  template EXPORTGPUCORE void fill_border<double,1>(double, cuNDArray<double>*,double);
+	template EXPORTGPUCORE void fill_border<double,2>(double, cuNDArray<double>*,double);
+	template EXPORTGPUCORE void fill_border<double,3>(double, cuNDArray<double>*,double);
+	template EXPORTGPUCORE void fill_border<double,4>(double, cuNDArray<double>*,double);
+
+  template EXPORTGPUCORE void fill_border<double_complext,1>(uint64d1, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,2>(uint64d2, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,3>(uint64d3, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,4>(uint64d4, cuNDArray<double_complext>*,double_complext);
+  template EXPORTGPUCORE void fill_border<double_complext,1>(double, cuNDArray<double_complext>*,double_complext);
+	template EXPORTGPUCORE void fill_border<double_complext,2>(double, cuNDArray<double_complext>*,double_complext);
+	template EXPORTGPUCORE void fill_border<double_complext,3>(double, cuNDArray<double_complext>*,double_complext);
+	template EXPORTGPUCORE void fill_border<double_complext,4>(double, cuNDArray<double_complext>*,double_complext);
+
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,1>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,2>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,3>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > upsample<float,4>(cuNDArray<float>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,1>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,2>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,3>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > upsample<float_complext,4>(cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,1>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,2>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,3>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > upsample<double,4>(cuNDArray<double>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,1>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,2>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,3>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > upsample<double_complext,4>(cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE void upsample<float,1>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void upsample<float,2>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void upsample<float,3>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void upsample<float,4>(cuNDArray<float>*, cuNDArray<float>*);
+
+  template EXPORTGPUCORE void upsample<float_complext,1>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void upsample<float_complext,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void upsample<float_complext,3>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void upsample<float_complext,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE void upsample<double,1>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void upsample<double,2>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void upsample<double,3>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void upsample<double,4>(cuNDArray<double>*, cuNDArray<double>*);
+
+  template EXPORTGPUCORE void upsample<double_complext,1>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void upsample<double_complext,2>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void upsample<double_complext,3>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void upsample<double_complext,4>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,1>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,2>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,3>(cuNDArray<float>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> > downsample<float,4>(cuNDArray<float>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,1>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,2>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,3>(cuNDArray<float_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float_complext> > downsample<float_complext,4>(cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,1>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,2>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,3>(cuNDArray<double>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> > downsample<double,4>(cuNDArray<double>*);
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,1>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,2>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,3>(cuNDArray<double_complext>*);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double_complext> > downsample<double_complext,4>(cuNDArray<double_complext>*);
+
+  template EXPORTGPUCORE void downsample<float,1>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void downsample<float,2>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void downsample<float,3>(cuNDArray<float>*, cuNDArray<float>*);
+  template EXPORTGPUCORE void downsample<float,4>(cuNDArray<float>*, cuNDArray<float>*);
+
+  template EXPORTGPUCORE void downsample<float_complext,1>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void downsample<float_complext,2>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void downsample<float_complext,3>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+  template EXPORTGPUCORE void downsample<float_complext,4>(cuNDArray<float_complext>*, cuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE void downsample<double,1>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void downsample<double,2>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void downsample<double,3>(cuNDArray<double>*, cuNDArray<double>*);
+  template EXPORTGPUCORE void downsample<double,4>(cuNDArray<double>*, cuNDArray<double>*);
+
+  template EXPORTGPUCORE void downsample<double_complext,1>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void downsample<double_complext,2>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void downsample<double_complext,3>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+  template EXPORTGPUCORE void downsample<double_complext,4>(cuNDArray<double_complext>*, cuNDArray<double_complext>*);
+
+
+  // The functions below could probably be instantiated for many more types, e.g. arrays of floatd2.
+  // For now we only instantiate what has been needed so far...
+  //
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<floatd2> > expand<floatd2>( cuNDArray<floatd2>*, size_t);  
+}
diff --git a/toolboxes/core/gpu/cuNDArray_utils.h b/toolboxes/core/gpu/cuNDArray_utils.h
new file mode 100644
index 0000000..ebe49e9
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDArray_utils.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+  //
+  // Utilities operating on cuNDArray.
+  // Most of them come in two flavours: one returning a newly allocated array
+  // (boost::shared_ptr) and one writing into an array provided by the caller.
+  // The template parameter D, where present, is the number of leading
+  // dimensions the operation acts on.
+  //
+
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+  shift_dim( cuNDArray<T> *in, int shift );
+
+  template<class T> EXPORTGPUCORE void
+  shift_dim( cuNDArray<T> *in, cuNDArray<T> *out, int shift );
+  
+  template<class T> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+  permute( cuNDArray<T> *in, std::vector<size_t> *dim_order, int shift_mode = 0 );
+  
+  template<class T> EXPORTGPUCORE void
+  permute( cuNDArray<T> *in, cuNDArray<T> *out, std::vector<size_t> *dim_order, int shift_mode = 0 );
+
+  template<class T, unsigned int D> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+  crop( typename uint64d<D>::Type crop_offset, typename uint64d<D>::Type crop_size, cuNDArray<T> *in );
+
+  template<class T, unsigned int D> EXPORTGPUCORE
+  void crop( typename uint64d<D>::Type crop_offset, cuNDArray<T> *in, cuNDArray<T> *out );
+  
+  template<class T, unsigned int D> EXPORTGPUCORE boost::shared_ptr< cuNDArray<T> >
+  pad( typename uint64d<D>::Type size, cuNDArray<T> *in, T val = T(0) );
+
+  template<class T, unsigned int D> EXPORTGPUCORE
+  void pad( cuNDArray<T> *in, cuNDArray<T> *out, T val = T(0) );
+  
+  template<class T, unsigned int D> EXPORTGPUCORE
+  void fill_border( typename uint64d<D>::Type matrix_size, cuNDArray<T> *image, T val = T(0) );
+
+  /**
+   * @brief Fills the image with a given value outside a radius from the center
+   * @param radius  Elements whose offset from the center (size>>1) of the first D
+   *                dimensions has a norm larger than this are overwritten
+   * @param in_out  Array modified in place; dimensions beyond D are treated as batches
+   * @param val     Fill value
+   */
+  template<class T, unsigned int D> EXPORTGPUCORE
+  void fill_border( typename realType<T>::Type radius, cuNDArray<T> *in_out, T val= T(0) );
+
+  // Expand array to new dimension
+  template<class T> EXPORTGPUCORE boost::shared_ptr<cuNDArray<T> > 
+  expand(cuNDArray<T> *data, size_t added_dim_size );
+  
+  /**
+   * @brief Upsampling by a factor of two along the first D dimensions
+   * @param in  Input array; dimensions beyond D are treated as batches
+   * @return    Newly allocated array whose first D dimensions are doubled
+   * @throws std::runtime_error if in is a null pointer
+   */
+  template<class T, unsigned int D> EXPORTGPUCORE 
+  boost::shared_ptr< cuNDArray<T> > upsample( cuNDArray<T>* in );
+
+  /**
+   * @brief Upsampling by a factor of two along the first D dimensions
+   * @param in   Input array
+   * @param out  Output array; its first D dimensions must be twice those of in
+   * @throws std::runtime_error if a pointer is null or the sizes do not match
+   */
+  template<class T, unsigned int D> EXPORTGPUCORE
+  void upsample( cuNDArray<T> *in, cuNDArray<T> *out );
+
+  /**
+   * @brief Downsampling by a factor of two along the first D dimensions
+   * @param in  Input array; dimensions beyond D are treated as batches
+   * @return    Newly allocated array whose first D dimensions are halved (size>>1)
+   * @throws std::runtime_error if in is a null pointer
+   */
+  template<class T, unsigned int D> EXPORTGPUCORE 
+  boost::shared_ptr< cuNDArray<T> > downsample( cuNDArray<T>* in );
+
+  /**
+   * @brief Downsampling by a factor of two along the first D dimensions
+   * @param in   Input array
+   * @param out  Output array; its first D dimensions must equal those of in shifted right by one
+   * @throws std::runtime_error if a pointer is null or the sizes do not match
+   */
+  template<class T, unsigned int D> EXPORTGPUCORE
+  void downsample( cuNDArray<T> *in, cuNDArray<T> *out );
+}
diff --git a/toolboxes/core/gpu/cuNDFFT.cpp b/toolboxes/core/gpu/cuNDFFT.cpp
new file mode 100644
index 0000000..c09b9dd
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDFFT.cpp
@@ -0,0 +1,156 @@
+#include "cuNDFFT.h"
+#include "vector_td.h"
+#include "cuNDArray.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_operators.h"
+
+#include <cufft.h>
+#include <cuComplex.h>
+#include <sstream>
+
+namespace Gadgetron{
+
+
+	/**
+	 * Returns the singleton for this template type, creating it on first use.
+	 * No locking is performed here, and the object is never deleted by this file.
+	 */
+	template<class T> cuNDFFT<T>* cuNDFFT<T>::instance(){
+		if (__instance == NULL) {
+			__instance = new cuNDFFT<T>;
+		}
+		return __instance;
+	}
+
+	// Storage for the per-type singleton pointer; stays NULL until instance() is first called.
+	template<class T> cuNDFFT<T>* cuNDFFT<T>::__instance = NULL;
+  // Selects the cuFFT transform type matching the real type T.
+  // Only the float and double specializations below are defined.
+  template<class T> cufftType_t get_transform_type();
+
+  template<> cufftType_t get_transform_type<float>()
+  {
+    return CUFFT_C2C;
+  }
+
+  template<> cufftType_t get_transform_type<double>()
+  {
+    return CUFFT_Z2Z;
+  }
+  
+  // Executes `plan` in place on the data of `in_out` (the same buffer is passed
+  // as input and output) and returns the cuFFT status unchanged.
+  template<class T> cufftResult_t cuNDA_FFT_execute( cufftHandle plan, cuNDArray< complext<T> > *in_out, int direction );
+
+  // Single precision: forwards to cufftExecC2C.
+  template<> cufftResult_t cuNDA_FFT_execute<float>( cufftHandle plan, cuNDArray<float_complext> *in_out, int direction )
+  {
+    cuFloatComplex *buffer = (cuFloatComplex*)in_out->get_data_ptr();
+    return cufftExecC2C(plan, buffer, buffer, direction);
+  }
+
+  // Double precision: forwards to cufftExecZ2Z.
+  template<> cufftResult_t cuNDA_FFT_execute<double>( cufftHandle plan, cuNDArray<double_complext> *in_out, int direction )
+  {
+    cuDoubleComplex *buffer = (cuDoubleComplex*)in_out->get_data_ptr();
+    return cufftExecZ2Z(plan, buffer, buffer, direction);
+  }
+  
+  namespace {
+
+    /**
+     * Scope guard that owns a cuFFT plan.
+     * If destroy() has not been called by the time the guard goes out of scope
+     * (i.e. an exception is propagating), the plan is destroyed in the destructor
+     * so it cannot leak.
+     */
+    class cufft_plan_guard
+    {
+    public:
+      explicit cufft_plan_guard( cufftHandle plan ) : plan_(plan), owns_(true) {}
+      ~cufft_plan_guard() { if( owns_ ) cufftDestroy(plan_); }
+
+      // Destroys the plan now and hands the cuFFT status back to the caller.
+      cufftResult destroy() { owns_ = false; return cufftDestroy(plan_); }
+
+    private:
+      cufft_plan_guard( const cufft_plan_guard& );            // not copyable
+      cufft_plan_guard& operator=( const cufft_plan_guard& ); // not copyable
+
+      cufftHandle plan_;
+      bool owns_;
+    };
+  }
+
+  /**
+   * @brief Shared implementation of all fft()/ifft() overloads (in-place transform).
+   *
+   * The array is permuted so the dimensions to transform come first, a batched
+   * cuFFT plan is executed on it, the result is optionally divided by the
+   * number of elements in one transform, and the array is permuted back.
+   *
+   * @param input Array to transform; overwritten with the result.
+   * @param dims_to_transform Indices of the dimensions to transform (no duplicates,
+   *        each smaller than the number of array dimensions, at least one entry).
+   * @param direction CUFFT_FORWARD or CUFFT_INVERSE, passed on to cuFFT.
+   * @param do_scale When true the result is divided by the transform size.
+   * @throws std::runtime_error on invalid arguments or when a cuFFT call fails.
+   */
+  template<class T> void
+  cuNDFFT<T>::fft_int( cuNDArray< complext<T> > *input, std::vector<size_t> *dims_to_transform, int direction, bool do_scale )
+  {
+    if( !input || !dims_to_transform ) {
+      throw std::runtime_error("cuNDFFT::fft 0x0 pointer provided");
+    }
+    if( dims_to_transform->empty() ) {
+      // Without this check &int_dims[0] below would index an empty vector.
+      throw std::runtime_error("cuNDFFT::fft No dimensions specified for transform");
+    }
+
+    const size_t array_ndim = input->get_number_of_dimensions();
+    const size_t num_ft_dims = dims_to_transform->size();
+    boost::shared_ptr< std::vector<size_t> > array_dims = input->get_dimensions();
+
+    // Validate the request and collect the transform sizes. The sizes are stored
+    // in reverse order of dims_to_transform, which is the order handed to cuFFT.
+    std::vector<size_t> dims(num_ft_dims,0);
+    std::vector<size_t> dim_count(array_ndim,0);
+    for (size_t i = 0; i < num_ft_dims; i++) {
+      const size_t d = (*dims_to_transform)[i];
+      if (d >= array_ndim) {
+        std::stringstream ss;
+        ss << "cuNDFFT::fft Invalid dimensions specified for transform " << d << " max " << array_ndim;
+        throw std::runtime_error(ss.str());
+      }
+      if (dim_count[d] > 0) {
+        throw std::runtime_error("cuNDFFT::fft Invalid dimensions (duplicates) specified for transform");
+      }
+      dim_count[d]++;
+      dims[num_ft_dims-1-i] = (*array_dims)[d];
+    }
+
+    // Permutation placing the transformed dimensions first, followed by the
+    // untouched dimensions in their original order.
+    std::vector<size_t> new_dim_order = *dims_to_transform;
+    for (size_t i = 0; i < array_ndim; i++) {
+      if (!dim_count[i]) new_dim_order.push_back(i);
+    }
+
+    // Inverse permutation, used to restore the original layout afterwards.
+    std::vector<size_t> reverse_dim_order(array_ndim,0);
+    for (size_t i = 0; i < array_ndim; i++) {
+      reverse_dim_order[new_dim_order[i]] = i;
+    }
+
+    size_t elements_in_ft = 1;
+    for (size_t i = 0; i < dims.size(); i++)
+      elements_in_ft *= dims[i];
+    if( elements_in_ft == 0 ) {
+      // Avoids the division by zero below.
+      throw std::runtime_error("cuNDFFT::fft Cannot transform a dimension of size 0");
+    }
+    const size_t batches = input->get_number_of_elements() / elements_in_ft;
+
+    // cuFFT takes its sizes as int.
+    std::vector<int> int_dims;
+    int_dims.reserve(dims.size());
+    for( size_t i=0; i<dims.size(); i++ )
+      int_dims.push_back(static_cast<int>(dims[i]));
+
+    cufftHandle plan;
+    cufftResult ftres = cufftPlanMany( &plan, static_cast<int>(int_dims.size()), &int_dims[0],
+                                       &int_dims[0], 1, static_cast<int>(elements_in_ft),
+                                       &int_dims[0], 1, static_cast<int>(elements_in_ft),
+                                       get_transform_type<T>(), static_cast<int>(batches) );
+    if (ftres != CUFFT_SUCCESS) {
+      std::stringstream ss;
+      ss << "cuNDFFT FFT plan failed: " << ftres;
+      throw std::runtime_error(ss.str());
+    }
+
+    // From here on the plan is released even if permute() or the execution throws.
+    cufft_plan_guard plan_guard(plan);
+
+    //IFFTSHIFT
+    *input = *permute(input,&new_dim_order,-1);
+
+    if( cuNDA_FFT_execute<T>( plan, input, direction ) != CUFFT_SUCCESS ) {
+      throw std::runtime_error("cuNDFFT FFT execute failed");
+    }
+
+    ftres = plan_guard.destroy();
+    if (ftres != CUFFT_SUCCESS) {
+      std::stringstream ss;
+      ss << "cuNDFFT FFT plan destroy failed: " << ftres;
+      throw std::runtime_error(ss.str());
+    }
+
+    if (do_scale) {
+      *input /= T(elements_in_ft);
+    }
+
+    //FFTSHIFT
+    *input = *permute(input,&reverse_dim_order,1);
+  }
+  
+  // Forward transform over the listed dimensions; the result is never scaled.
+  template<class T> void
+  cuNDFFT<T>::fft( cuNDArray< complext<T> > *input, std::vector<size_t> *dims_to_transform )
+  {
+    const bool scale_result = false;
+    fft_int(input, dims_to_transform, CUFFT_FORWARD, scale_result);
+  }
+  
+  // Inverse transform over the listed dimensions; scaling is left to do_scale.
+  template<class T> void
+  cuNDFFT<T>::ifft( cuNDArray< complext<T> > *input, std::vector<size_t> *dims_to_transform, bool do_scale )
+  {
+    const int direction = CUFFT_INVERSE;
+    fft_int(input, dims_to_transform, direction, do_scale);
+  }
+  
+  // Forward, unscaled transform along a single dimension.
+  template<class T> void
+  cuNDFFT<T>::fft( cuNDArray< complext<T> > *input, unsigned int dim_to_transform )
+  {
+    std::vector<size_t> single_dim;
+    single_dim.push_back(dim_to_transform);
+    fft_int(input, &single_dim, CUFFT_FORWARD, false);
+  }
+  
+  // Inverse transform along a single dimension; scaling is left to do_scale.
+  template<class T> void
+  cuNDFFT<T>::ifft( cuNDArray< complext<T> > *input, unsigned int dim_to_transform, bool do_scale )
+  {
+    std::vector<size_t> single_dim;
+    single_dim.push_back(dim_to_transform);
+    fft_int(input, &single_dim, CUFFT_INVERSE, do_scale);
+  }
+  
+  // Forward, unscaled transform over every dimension of the array.
+  template<class T> void
+  cuNDFFT<T>::fft( cuNDArray< complext<T> > *input )
+  {
+    const size_t ndim = input->get_number_of_dimensions();
+    std::vector<size_t> all_dims;
+    all_dims.reserve(ndim);
+    for (size_t d = 0; d < ndim; ++d) {
+      all_dims.push_back(d); // 0, 1, ..., ndim-1
+    }
+    fft_int(input, &all_dims, CUFFT_FORWARD, false);
+  }
+  
+  // Inverse transform over every dimension of the array; scaling is left to do_scale.
+  template<class T> void
+  cuNDFFT<T>::ifft( cuNDArray<complext<T> > *input, bool do_scale )
+  {
+    const size_t ndim = input->get_number_of_dimensions();
+    std::vector<size_t> all_dims;
+    all_dims.reserve(ndim);
+    for (size_t d = 0; d < ndim; ++d) {
+      all_dims.push_back(d); // 0, 1, ..., ndim-1
+    }
+    fft_int(input, &all_dims, CUFFT_INVERSE, do_scale);
+  }
+  
+  // Instantiation
+  template class EXPORTGPUCORE cuNDFFT<float>;
+  template class EXPORTGPUCORE cuNDFFT<double>;
+}
diff --git a/toolboxes/core/gpu/cuNDFFT.h b/toolboxes/core/gpu/cuNDFFT.h
new file mode 100644
index 0000000..71a9bfd
--- /dev/null
+++ b/toolboxes/core/gpu/cuNDFFT.h
@@ -0,0 +1,49 @@
+/** \file cuNDFFT.h
+    \brief Wrapper of the CUFFT library for ndarrays of type Gadgetron::complext.
+ */
+
+#ifndef CUFFT_H
+#define CUFFT_H
+#pragma once
+
+#include "cuNDArray.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+
+/** \class cuNDFFT
+      \brief Wrapper of the CUFFT library for ndarrays of type complext.
+
+      Wrapper of the CUFFT library for ndarrays of type complext<REAL>.
+      The class' template type is a REAL, ie. float or double.
+
+      The class is used as a singleton: the constructor is protected and
+      objects are obtained through instance(). All transforms operate in
+      place on the array that is passed in.
+ */
+template<class T> class EXPORTGPUCORE cuNDFFT
+{
+
+public:
+
+
+
+	/** Returns the singleton for this template type, creating it on first use. */
+	static cuNDFFT<T>* instance();
+
+
+	/** Forward transform over the dimensions listed in dims_to_transform (never scaled). */
+	void fft ( cuNDArray<complext<T> > *input, std::vector<size_t> *dims_to_transform );
+	/** Inverse transform over the listed dimensions. With do_scale the result is
+	    divided by the number of elements in one transform. */
+	void ifft( cuNDArray<complext<T> > *input, std::vector<size_t> *dims_to_transform, bool do_scale = true );
+
+	/** Forward transform along the single dimension dim_to_transform (never scaled). */
+	void fft ( cuNDArray<complext<T> > *input, unsigned int dim_to_transform);
+	/** Inverse transform along the single dimension dim_to_transform. */
+	void ifft( cuNDArray<complext<T> > *input, unsigned int dim_to_transform, bool do_scale = true );
+
+	/** Forward transform over all dimensions of the array (never scaled). */
+	void fft ( cuNDArray<complext<T> > *input );
+	/** Inverse transform over all dimensions of the array. */
+	void ifft( cuNDArray<complext<T> > *input, bool do_scale = true );
+
+protected:
+	/** Shared implementation of the public overloads; direction is the value passed on to CUFFT. */
+	void fft_int( cuNDArray<complext<T> > *input, std::vector<size_t> *dims_to_transform, int direction, bool do_scale = true );
+
+	// Construction and destruction are restricted; use instance().
+	cuNDFFT() {}
+	virtual ~cuNDFFT() {}
+	// The singleton pointer returned by instance().
+	static cuNDFFT<T>* __instance;
+
+};
+}
+
+#endif
diff --git a/toolboxes/core/gpu/cudaDeviceManager.cpp b/toolboxes/core/gpu/cudaDeviceManager.cpp
new file mode 100644
index 0000000..6fda207
--- /dev/null
+++ b/toolboxes/core/gpu/cudaDeviceManager.cpp
@@ -0,0 +1,223 @@
+#include "cudaDeviceManager.h"
+#include "check_CUDA.h"
+#include "cuNDArray_blas.h"
+
+#include <boost/thread/mutex.hpp>
+#include <boost/shared_array.hpp>
+#include <cuda_runtime_api.h>
+#include <stdlib.h>
+#include <sstream>
+
+namespace Gadgetron{
+
+  // One mutex per CUDA device. The array is allocated in the cudaDeviceManager
+  // constructor and locked/unlocked by lockHandle()/unlockHandle().
+  static boost::shared_array<boost::mutex> _mutex;
+  // The singleton instance: created by Instance(), deleted by CleanUp().
+  cudaDeviceManager* cudaDeviceManager::_instance = 0;
+
+  cudaDeviceManager::cudaDeviceManager() {
+
+    // The class is a singleton, so this body runs once per successful construction.
+    // CleanUp() is registered up front; it deletes the instance at process exit.
+    atexit(&CleanUp);
+
+    if( cudaSuccess != cudaGetDeviceCount( &_num_devices ) ) {
+      _num_devices = 0;
+      throw cuda_error( "Error: no Cuda devices present.");
+    }
+
+    // One mutex per device, used by lockHandle()/unlockHandle().
+    _mutex = boost::shared_array<boost::mutex>(new boost::mutex[_num_devices]);
+
+    // Remember the device that is current now; it is restored after the scan below.
+    int previous_device;
+    if( cudaSuccess != cudaGetDevice(&previous_device) ) {
+      throw std::runtime_error( "Error: unable to get device no");
+    }
+
+    // Size all per-device tables, zero-initialized.
+    const size_t table_size = static_cast<size_t>(_num_devices);
+    _total_global_mem.assign(table_size, size_t(0));
+    _shared_mem_per_block.assign(table_size, size_t(0));
+    _warp_size.assign(table_size, 0);
+    _max_blockdim.assign(table_size, 0);
+    _max_griddim.assign(table_size, 0);
+    _major.assign(table_size, 0);
+    _minor.assign(table_size, 0);
+    _handle.assign(table_size, static_cast<cublasHandle_t>(0));
+
+    // Query every device and cache the properties the accessors expose.
+    for( int dev=0; dev<_num_devices; ++dev ){
+
+      if( cudaSuccess != cudaSetDevice(dev) ) {
+        throw cuda_error( "Error: unable to set device no");
+      }
+
+      cudaDeviceProp props;
+      if( cudaSuccess != cudaGetDeviceProperties( &props, dev ) ) {
+        throw cuda_error("Error: unable to determine device properties.");
+      }
+
+      _total_global_mem[dev] = props.totalGlobalMem;
+      _shared_mem_per_block[dev] = props.sharedMemPerBlock;
+      _warp_size[dev] = props.warpSize;
+      _max_blockdim[dev] = props.maxThreadsDim[0];
+      _max_griddim[dev] = props.maxGridSize[0];
+      _major[dev] = props.major;
+      _minor[dev] = props.minor;
+    }
+
+    if( cudaSuccess != cudaSetDevice(previous_device) ) {
+      throw cuda_error( "Error: unable to restore device no");
+    }
+  }
+
+  cudaDeviceManager::~cudaDeviceManager() 
+  {
+    // Destroy the cublas handles that lockHandle() created; devices whose
+    // handle was never requested still hold NULL and are skipped.
+    for (int dev = 0; dev < _num_devices; ++dev){
+      if (_handle[dev] == NULL)
+        continue;
+      cublasDestroy(_handle[dev]);
+    }
+  }
+
+  // Cached total global memory (bytes) of the device that is current on this thread.
+  size_t cudaDeviceManager::total_global_mem()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return total_global_mem(current);
+  }
+
+  // Cached shared memory per block (bytes) of the current device.
+  size_t cudaDeviceManager::shared_mem_per_block()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return shared_mem_per_block(current);
+  }
+
+  // Cached maxThreadsDim[0] of the current device.
+  int cudaDeviceManager::max_blockdim()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return max_blockdim(current);
+  }
+
+  // Cached maxGridSize[0] of the current device.
+  int cudaDeviceManager::max_griddim()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return max_griddim(current);
+  }
+
+  // Cached warp size of the current device.
+  int cudaDeviceManager::warp_size()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return warp_size(current);
+  }
+
+  // Cached major compute-capability number of the current device.
+  int cudaDeviceManager::major_version()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return major_version(current);
+  }
+
+  // Cached minor compute-capability number of the current device.
+  int cudaDeviceManager::minor_version()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return minor_version(current);
+  }
+
+  // Free memory on the current device, as reported by cudaMemGetInfo right now.
+  size_t cudaDeviceManager::getFreeMemory()
+  {
+    size_t free_bytes;
+    size_t total_bytes; // reported by the same call, not used here
+    CUDA_CALL(cudaMemGetInfo(&free_bytes,&total_bytes));
+    return free_bytes;
+  }
+
+  // Total memory on the current device, as reported by cudaMemGetInfo.
+  size_t cudaDeviceManager::getTotalMemory()
+  {
+    size_t free_bytes; // reported by the same call, not used here
+    size_t total_bytes;
+    CUDA_CALL(cudaMemGetInfo(&free_bytes,&total_bytes));
+    return total_bytes;
+  }
+
+  /**
+   * @brief Free memory on the given device.
+   *
+   * Temporarily makes `device` the current device, queries it, and switches
+   * back to the device that was current on entry.
+   *
+   * @param device CUDA device id to query.
+   * @return Free memory as reported by cudaMemGetInfo.
+   * @throws Whatever CUDA_CALL throws when a CUDA runtime call fails.
+   */
+  size_t cudaDeviceManager::getFreeMemory(int device)
+  {
+    int oldDevice;
+    CUDA_CALL(cudaGetDevice(&oldDevice));
+    CUDA_CALL(cudaSetDevice(device));
+
+    size_t ret;
+    try {
+      ret = getFreeMemory();
+    }
+    catch (...) {
+      // Do not leave the caller on a different device when the query fails.
+      // Best effort only: the original error is the one that is reported.
+      cudaSetDevice(oldDevice);
+      throw;
+    }
+
+    CUDA_CALL(cudaSetDevice(oldDevice));
+    return ret;
+  }
+
+  /**
+   * @brief Total memory on the given device.
+   *
+   * Temporarily makes `device` the current device, queries it, and switches
+   * back to the device that was current on entry.
+   *
+   * @param device CUDA device id to query.
+   * @return Total memory as reported by cudaMemGetInfo.
+   * @throws Whatever CUDA_CALL throws when a CUDA runtime call fails.
+   */
+  size_t cudaDeviceManager::getTotalMemory(int device)
+  {
+    int oldDevice;
+    CUDA_CALL(cudaGetDevice(&oldDevice));
+    CUDA_CALL(cudaSetDevice(device));
+
+    size_t ret;
+    try {
+      ret = getTotalMemory();
+    }
+    catch (...) {
+      // Do not leave the caller on a different device when the query fails.
+      // Best effort only: the original error is the one that is reported.
+      cudaSetDevice(oldDevice);
+      throw;
+    }
+
+    CUDA_CALL(cudaSetDevice(oldDevice));
+    return ret;
+  }
+
+  // Returns the singleton, constructing it on the first call.
+  cudaDeviceManager* cudaDeviceManager::Instance()
+  {
+    if (!_instance) {
+      _instance = new cudaDeviceManager;
+    }
+    return _instance;
+  }
+
+  // Locks and returns the cublas handle of the device that is current on this thread.
+  cublasHandle_t cudaDeviceManager::lockHandle()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    cublasHandle_t handle = lockHandle(current);
+    return handle;
+  }
+
+  /**
+   * @brief Locks the mutex of `device` and returns that device's cublas handle.
+   *
+   * The handle is created on first use and configured for host pointer mode.
+   * The caller must release the lock again with unlockHandle(device).
+   * When this function throws, the mutex is NOT held on return.
+   *
+   * @param device CUDA device id, in the range [0, number of devices).
+   * @return The cublas handle belonging to `device`.
+   * @throws cuda_error if `device` is out of range or the handle cannot be created.
+   */
+  cublasHandle_t cudaDeviceManager::lockHandle(int device)
+  {
+    // _mutex and _handle both hold exactly _num_devices entries.
+    if (device < 0 || device >= _num_devices) {
+      std::stringstream ss;
+      ss << "Error: invalid device id " << device << " requested for cublas handle";
+      throw cuda_error(ss.str());
+    }
+
+    _mutex[device].lock();
+    if (_handle[device] == NULL){
+      cublasStatus_t ret = cublasCreate(&_handle[device]);
+      if (ret != CUBLAS_STATUS_SUCCESS) {
+        // Leave the manager usable: forget the failed handle and release the
+        // lock, otherwise every later lockHandle(device) would block forever.
+        _handle[device] = NULL;
+        _mutex[device].unlock();
+      	std::stringstream ss;
+      	ss << "Error: unable to create cublas handle for device " << device << " : ";
+        ss << gadgetron_getCublasErrorString(ret) << std::endl;
+      	throw cuda_error(ss.str());
+      }
+      cublasSetPointerMode( _handle[device], CUBLAS_POINTER_MODE_HOST );
+    }
+    return _handle[device];
+  }
+
+  // Releases the handle lock of the device that is current on this thread.
+  void cudaDeviceManager::unlockHandle()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    unlockHandle(current);
+  }
+
+  // Releases the handle lock of the given device (counterpart of lockHandle(device)).
+  void cudaDeviceManager::unlockHandle(int device)
+  {
+    boost::mutex &device_mutex = _mutex[device];
+    device_mutex.unlock();
+  }
+
+  // Id of the device that is current on the calling thread (cudaGetDevice).
+  int cudaDeviceManager::getCurrentDevice()
+  {
+    int current;
+    CUDA_CALL(cudaGetDevice(&current));
+    return current;
+  }
+
+  // Number of CUDA devices; asks the runtime on every call rather than
+  // returning the count cached by the constructor.
+  int cudaDeviceManager::getTotalNumberOfDevice()
+  {
+    int device_count;
+    CUDA_CALL(cudaGetDeviceCount(&device_count));
+    return device_count;
+  }
+
+  // atexit hook registered by the constructor: destroys the singleton and
+  // clears the pointer (deleting a null pointer is a no-op).
+  void cudaDeviceManager::CleanUp()
+  {
+    delete _instance;
+    _instance = 0;
+  }
+}
diff --git a/toolboxes/core/gpu/cudaDeviceManager.h b/toolboxes/core/gpu/cudaDeviceManager.h
new file mode 100644
index 0000000..9b4af06
--- /dev/null
+++ b/toolboxes/core/gpu/cudaDeviceManager.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "gpucore_export.h"
+
+#include <vector>
+#include <cublas_v2.h>
+
+namespace Gadgetron{
+
+  /**
+   * Singleton that caches per-device CUDA properties and hands out one
+   * mutex-protected cublas handle per device.
+   */
+  class EXPORTGPUCORE cudaDeviceManager 
+  {
+  public:
+
+    // This class is used as a singleton.
+    // Use Instance() to access the public member functions.
+    //
+
+    static cudaDeviceManager* Instance();
+    
+    // Public member functions.
+    // If the function does not take a device id, it will use the current device.
+    //
+
+    // Cached device properties, looked up by device id.
+    // The id is used as a plain vector index and is not range-checked.
+    inline size_t total_global_mem(int device){ return _total_global_mem[device]; }
+    inline size_t shared_mem_per_block(int device){ return _shared_mem_per_block[device]; }
+    inline int warp_size(int device){ return _warp_size[device]; }
+    inline int max_blockdim(int device){ return _max_blockdim[device]; }
+    inline int max_griddim(int device){ return _max_griddim[device]; }
+    inline int major_version(int device){ return _major[device]; }
+    inline int minor_version(int device){ return _minor[device]; }
+
+    // The same properties for the current device.
+    size_t total_global_mem();
+    size_t shared_mem_per_block();
+    int major_version();
+    int minor_version();
+    int warp_size();
+    int max_blockdim();
+    int max_griddim();
+
+    // Id of the current CUDA device.
+    int getCurrentDevice();
+
+    // Number of CUDA devices reported by the runtime.
+    int getTotalNumberOfDevice();
+
+    // Free device memory, for the current or the given device.
+    size_t getFreeMemory();
+    size_t getFreeMemory(int device);
+
+    // Total device memory, for the current or the given device.
+    size_t getTotalMemory();
+    size_t getTotalMemory(int device);
+
+    // Access to Cublas is protected by a mutex
+    // Despite what the Cublas manual claims, we have not found it thread safe.
+
+    // Locks the device's mutex and returns its cublas handle.
+    // Every successful call must be paired with unlockHandle() for the same device.
+    cublasHandle_t lockHandle();
+    cublasHandle_t lockHandle(int device);
+
+    // Releases the mutex taken by lockHandle().
+    void unlockHandle();
+    void unlockHandle(int device);
+
+  private:
+
+    // Use the Instance() method to access the singleton
+    //
+
+    cudaDeviceManager();
+    ~cudaDeviceManager();
+
+    // atexit hook that deletes the singleton.
+    static void CleanUp();
+    
+    // Per-device tables; every vector has _num_devices entries indexed by device id.
+    int _num_devices;
+    std::vector<size_t> _total_global_mem; // in bytes
+    std::vector<size_t> _shared_mem_per_block; // in bytes
+    std::vector<int> _warp_size;
+    std::vector<int> _max_blockdim;
+    std::vector<int> _max_griddim;
+    std::vector<int> _major;
+    std::vector<int> _minor;
+    std::vector<cublasHandle_t> _handle; // created lazily by lockHandle()
+    static cudaDeviceManager * _instance;
+  };
+}
diff --git a/toolboxes/core/gpu/gpucore_export.h b/toolboxes/core/gpu/gpucore_export.h
new file mode 100644
index 0000000..c6d72f3
--- /dev/null
+++ b/toolboxes/core/gpu/gpucore_export.h
@@ -0,0 +1,18 @@
+/** \file gpucore_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUCORE_EXPORT_H_
+#define GPUCORE_EXPORT_H_
+
+// EXPORTGPUCORE decorates the public symbols of the gpucore library.
+// On Windows it expands to dllexport while the library itself is being built
+// and to dllimport for its users; on all other platforms it expands to nothing.
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUCORE__) || defined (gpucore_EXPORTS)
+#define EXPORTGPUCORE __declspec(dllexport)
+#else
+#define EXPORTGPUCORE __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUCORE
+#endif
+
+#endif /* GPUCORE_EXPORT_H_ */
diff --git a/toolboxes/core/gpu/hoCuNDArray.h b/toolboxes/core/gpu/hoCuNDArray.h
new file mode 100644
index 0000000..66cead8
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray.h
@@ -0,0 +1,121 @@
+/** \file hoCuNDArray.h
+    \brief CPU-based N-dimensional array (data container) for cpu->gpu->cpu (hoCu) solvers.
+
+    The existence of this class is mainly due to providing unique array type for the hoCu based math in
+    hoCuNDArray_operators.h, hoCuNDArray_elemwise.h, and hoCuNDArray_blas.h.
+    Unfortunately C++ does not let a derived class inherit its base class's constructors, which consequently need redefinition.
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+  /**
+   * Host-side N-dimensional array whose storage is obtained with
+   * cudaMallocHost and released with cudaFreeHost (see allocate_memory()
+   * and deallocate_memory() below). Everything else is inherited from hoNDArray<T>.
+   */
+  template<class T> class hoCuNDArray: public hoNDArray<T>
+  {
+  public:
+
+    // Creates an empty array.
+    hoCuNDArray() : hoNDArray<T>::hoNDArray() {}
+
+    // The constructors below forward to create(), mirroring the hoNDArray ones.
+    hoCuNDArray(std::vector<size_t> *dimensions) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions);
+    }
+
+    hoCuNDArray(std::vector<size_t> &dimensions) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions);
+    }
+  
+    hoCuNDArray(boost::shared_ptr< std::vector<size_t> > dimensions) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions.get());
+    }
+  
+    // Variants that receive an existing data pointer.
+    // NOTE(review): if delete_data_on_destruct is true the destructor releases the
+    // pointer through cudaFreeHost - presumably such data must come from
+    // cudaMallocHost; confirm against hoNDArray::create().
+    hoCuNDArray(std::vector<size_t> *dimensions, T* data, bool delete_data_on_destruct = false) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions, data, delete_data_on_destruct);
+    }
+
+    hoCuNDArray(std::vector<size_t> &dimensions, T* data, bool delete_data_on_destruct = false) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions, data, delete_data_on_destruct);
+    }
+  
+    hoCuNDArray(boost::shared_ptr< std::vector<size_t> > dimensions, T* data, bool delete_data_on_destruct = false) : hoNDArray<T>::hoNDArray() {
+      this->create(dimensions.get(), data, delete_data_on_destruct);
+    }
+
+    // Copy constructors
+    // Each one copies the dimensions, allocates fresh storage and memcpy's the
+    // elements (deep copy). delete_data_on_destruct_ is not assigned here and
+    // keeps whatever the hoNDArray default constructor set - NOTE(review): confirm.
+    hoCuNDArray(const hoNDArray<T> &a): hoNDArray<T>(){
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a.get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a.get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    // Pointer variant; throws std::runtime_error when a is null.
+    hoCuNDArray(const hoNDArray<T> *a): hoNDArray<T>(){
+      if(!a) throw std::runtime_error("hoCuNDArray::hoCuNDArray(): 0x0 pointer provided.");
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a->get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a->get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    hoCuNDArray(const hoCuNDArray<T> &a): hoNDArray<T>(){
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a.get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a.get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    // Pointer variant; throws std::runtime_error when a is null.
+    hoCuNDArray(const hoCuNDArray<T> *a): hoNDArray<T>(){
+      if(!a) throw std::runtime_error("hoCuNDArray::hoCuNDArray(): 0x0 pointer provided.");
+      this->data_ = 0;
+      this->dimensions_ = boost::shared_ptr< std::vector<size_t> >(new std::vector<size_t>(*a->get_dimensions()));
+      this->allocate_memory();
+      memcpy(this->data_, a->get_data_ptr(), this->elements_*sizeof(T));
+    }
+
+    // Releases the buffer here, while this class's deallocate_memory() is still
+    // the active override. deallocate_memory() resets data_ to 0 afterwards.
+    virtual ~hoCuNDArray() {
+      if (this->delete_data_on_destruct_) {
+        this->deallocate_memory();
+      }
+    }
+
+    // Bounds-checked element access; throws std::runtime_error when idx is out of range.
+    T& at( size_t idx ){
+      if( idx >= this->get_number_of_elements() ){
+        throw std::runtime_error("hoCuNDArray::at(): index out of range.");
+      }
+      return this->data_[idx];
+    }
+  
+    // Same bounds check as at().
+    T& operator[]( size_t idx ){
+      if( idx >= this->get_number_of_elements() ){
+        throw std::runtime_error("hoCuNDArray::operator[]: index out of range.");
+      }
+      return this->data_[idx];
+    }
+
+  protected:
+
+    // Frees any current buffer, recomputes elements_ as the product of the
+    // dimensions and allocates elements_*sizeof(T) bytes with cudaMallocHost.
+    // Throws std::runtime_error when the dimension list is empty.
+    virtual void allocate_memory()
+    {
+      this->deallocate_memory();
+      this->elements_ = 1;
+      if (this->dimensions_->empty())
+        throw std::runtime_error("hoCuNDArray::allocate_memory() : dimensions is empty.");
+      for (size_t i = 0; i < this->dimensions_->size(); i++) {
+        this->elements_ *= (*this->dimensions_)[i];
+      }
+
+      size_t size = this->elements_ * sizeof(T);
+      CUDA_CALL(cudaMallocHost((void**)&this->data_,size));
+    }
+
+    // Frees the buffer with cudaFreeHost and resets data_; does nothing when data_ is already 0.
+    virtual void deallocate_memory()
+    {
+      if (this->data_) {
+        CUDA_CALL(cudaFreeHost(this->data_));
+        this->data_ = 0;
+      }
+    }
+  };
+}
diff --git a/toolboxes/core/gpu/hoCuNDArray_blas.cu b/toolboxes/core/gpu/hoCuNDArray_blas.cu
new file mode 100644
index 0000000..51947cf
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_blas.cu
@@ -0,0 +1,260 @@
+#include "hoCuNDArray_blas.h"
+#include "cuNDArray_blas.h"
+#include "complext.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+// Evaluates a cublas call once and throws cuda_error, carrying the text from
+// gadgetron_getCublasErrorString, unless it returned CUBLAS_STATUS_SUCCESS.
+#define CUBLAS_CALL(fun) {cublasStatus_t err = fun; if (err != CUBLAS_STATUS_SUCCESS) {throw cuda_error(gadgetron_getCublasErrorString(err));}}
+
+  // These are defined in cuNDArray_blas.cu
+  //
+
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_axpy(cublasHandle_t hndl, int n, const T* a , const T* x , int incx,  T* y, int incy);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_dot(cublasHandle_t, int, const T*, int, const  T*, int, T*, bool cc = true);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_nrm2(cublasHandle_t, int, const T*, int, typename realType<T>::Type *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amax(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_amin(cublasHandle_t handle, int n,const T *x, int incx, int *result);
+  template<class T> EXPORTGPUCORE cublasStatus_t cublas_asum(cublasHandle_t handle, int n,const T *x, int incx, typename realType<T>::Type *result);
+
+  /**
+   * @brief Computes y = a*x + y for host arrays, streaming them through the GPU in batches.
+   *
+   * The batch size is derived from the free device memory so that two device
+   * buffers (one for x, one for y) fit.
+   *
+   * @param a Scalar factor.
+   * @param x Input array.
+   * @param y Array that is updated in place; must hold at least as many elements as x.
+   * @throws std::runtime_error on null pointers, a too small y or insufficient device memory.
+   * @throws cuda_error when a cublas call fails.
+   */
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray<T>* x, hoCuNDArray<T>* y )
+  {
+    if( !x || !y )
+      throw std::runtime_error("axpy: 0x0 pointer provided.");
+
+    const size_t num_elements = x->get_number_of_elements();
+    if( y->get_number_of_elements() < num_elements )
+      throw std::runtime_error("axpy: y holds fewer elements than x.");
+    if( num_elements == 0 )
+      return; // nothing to update
+
+    cudaDeviceManager *manager = cudaDeviceManager::Instance();
+    int device = manager->getCurrentDevice();
+    size_t free = manager->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*2*1024*1024)); //Ensure 1Mb allocations
+    batchSize = std::min(batchSize,num_elements);
+    if( batchSize == 0 )
+      throw std::runtime_error("axpy: not enough free device memory.");
+    // NOTE(review): cublas takes the element count as int; batches larger than
+    // INT_MAX elements are not handled here.
+
+    T* x_ptr = x->get_data_ptr();
+    T* y_ptr = y->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    cuNDArray<T> cuY(&dims);
+
+    for (size_t offset = 0; offset < num_elements; offset += batchSize){
+
+      // The last batch may be shorter than batchSize.
+      const size_t curSize = std::min(batchSize,num_elements-offset);
+
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+      CUDA_CALL(cudaMemcpy(cuY.get_data_ptr(),y_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      // Release the handle before checking the status so that a failing call
+      // cannot leave the device mutex locked.
+      cublasHandle_t handle = manager->lockHandle(device);
+      cublasStatus_t status = cublas_axpy(handle, static_cast<int>(curSize),
+					  &a, cuX.get_data_ptr(), 1, cuY.get_data_ptr(), 1);
+      manager->unlockHandle(device);
+      CUBLAS_CALL(status);
+
+      // Write the result back to the matching slice of y (not to its start).
+      CUDA_CALL(cudaMemcpy(y_ptr+offset,cuY.get_data_ptr(),curSize*sizeof(T),cudaMemcpyDeviceToHost));
+    }
+  }
+
+  // Real scalar applied to complex arrays: the scalar is converted to
+  // complext<T> and the generic axpy above is used.
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray< complext<T> >*x, hoCuNDArray< complext<T> > *y )
+  {
+    const complext<T> complex_a(a);
+    axpy( complex_a, x, y );
+  }
+
+  /**
+   * @brief Dot product of two host arrays, computed on the GPU in batches.
+   *
+   * @param x First array.
+   * @param y Second array; must hold at least as many elements as x.
+   * @param cc Forwarded unchanged to cublas_dot.
+   * @return Sum of the per-batch dot products; T(0) for empty input.
+   * @throws std::runtime_error on null pointers, a too small y or insufficient device memory.
+   * @throws cuda_error when a cublas call fails.
+   */
+  template<class T> EXPORTGPUCORE T dot( hoCuNDArray<T> *x, hoCuNDArray<T> *y, bool cc )
+  {
+    if( !x || !y )
+      throw std::runtime_error("dot: 0x0 pointer provided.");
+
+    const size_t num_elements = x->get_number_of_elements();
+    if( y->get_number_of_elements() < num_elements )
+      throw std::runtime_error("dot: y holds fewer elements than x.");
+
+    T ret = T(0);
+    if( num_elements == 0 )
+      return ret;
+
+    cudaDeviceManager *manager = cudaDeviceManager::Instance();
+    int device = manager->getCurrentDevice();
+    size_t free = manager->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*2*1024*1024)); //Ensure 1Mb allocations
+    batchSize = std::min(batchSize,num_elements);
+    if( batchSize == 0 )
+      throw std::runtime_error("dot: not enough free device memory.");
+
+    T* x_ptr = x->get_data_ptr();
+    T* y_ptr = y->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    cuNDArray<T> cuY(&dims);
+
+    for (size_t offset = 0; offset < num_elements; offset += batchSize){
+
+      // The last batch may be shorter than batchSize.
+      const size_t curSize = std::min(batchSize,num_elements-offset);
+
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+      CUDA_CALL(cudaMemcpy(cuY.get_data_ptr(),y_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      // Release the handle before checking the status so that a failing call
+      // cannot leave the device mutex locked.
+      T cur_ret;
+      cublasHandle_t handle = manager->lockHandle(device);
+      cublasStatus_t status = cublas_dot( handle, static_cast<int>(curSize),
+					  cuX.get_data_ptr(), 1,
+					  cuY.get_data_ptr(), 1,
+					  &cur_ret, cc );
+      manager->unlockHandle(device);
+      CUBLAS_CALL(status);
+
+      ret += cur_ret;
+    }
+    return ret;
+  }
+
+  /**
+   * @brief 2-norm of a host array, computed on the GPU in batches.
+   *
+   * The per-batch norms are combined as sqrt(sum of their squares).
+   *
+   * @param x Input array.
+   * @return The norm; 0 for empty input.
+   * @throws std::runtime_error on a null pointer or insufficient device memory.
+   * @throws cuda_error when a cublas call fails.
+   */
+  template<class T> EXPORTGPUCORE typename realType<T>::Type nrm2( hoCuNDArray<T>* x )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    if( !x )
+      throw std::runtime_error("nrm2: 0x0 pointer provided.");
+
+    const size_t num_elements = x->get_number_of_elements();
+    REAL ret = 0;
+    if( num_elements == 0 )
+      return ret;
+
+    cudaDeviceManager *manager = cudaDeviceManager::Instance();
+    int device = manager->getCurrentDevice();
+    size_t free = manager->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); //Ensure 1Mb allocations
+    batchSize = std::min(batchSize,num_elements);
+    if( batchSize == 0 )
+      throw std::runtime_error("nrm2: not enough free device memory.");
+
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+
+    for (size_t offset = 0; offset < num_elements; offset += batchSize){
+
+      // The last batch may be shorter than batchSize; only curSize elements
+      // of the device buffer are valid and only those are passed to cublas.
+      const size_t curSize = std::min(batchSize,num_elements-offset);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      // Release the handle before checking the status so that a failing call
+      // cannot leave the device mutex locked.
+      REAL cur_ret;
+      cublasHandle_t handle = manager->lockHandle(device);
+      cublasStatus_t status = cublas_nrm2<T>( handle, static_cast<int>(curSize),
+					      cuX.get_data_ptr(), 1, &cur_ret );
+      manager->unlockHandle(device);
+      CUBLAS_CALL(status);
+
+      ret += cur_ret*cur_ret;
+    }
+    return std::sqrt(ret);
+  }
+
+  /**
+   * @brief Sum of absolute values (cublas asum) of a host array, computed on the GPU in batches.
+   *
+   * @param x Input array.
+   * @return Sum of the per-batch results; 0 for empty input.
+   * @throws std::runtime_error on a null pointer or insufficient device memory.
+   * @throws cuda_error when a cublas call fails.
+   */
+  template<class T> EXPORTGPUCORE typename realType<T>::Type asum( hoCuNDArray<T>* x )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    if( !x )
+      throw std::runtime_error("asum: 0x0 pointer provided.");
+
+    const size_t num_elements = x->get_number_of_elements();
+    REAL ret = 0;
+    if( num_elements == 0 )
+      return ret;
+
+    cudaDeviceManager *manager = cudaDeviceManager::Instance();
+    int device = manager->getCurrentDevice();
+    size_t free = manager->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); //Ensure 1Mb allocations
+    batchSize = std::min(batchSize,num_elements);
+    if( batchSize == 0 )
+      throw std::runtime_error("asum: not enough free device memory.");
+
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+
+    for (size_t offset = 0; offset < num_elements; offset += batchSize){
+
+      // The last batch may be shorter than batchSize; only curSize elements
+      // of the device buffer are valid and only those are passed to cublas.
+      const size_t curSize = std::min(batchSize,num_elements-offset);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      // Release the handle before checking the status so that a failing call
+      // cannot leave the device mutex locked.
+      REAL cur_ret;
+      cublasHandle_t handle = manager->lockHandle(device);
+      cublasStatus_t status = cublas_asum( handle, static_cast<int>(curSize),
+					   cuX.get_data_ptr(), 1,
+					   &cur_ret );
+      manager->unlockHandle(device);
+      CUBLAS_CALL(status);
+
+      ret += cur_ret;
+    }
+    return ret;
+  }
+
+  /**
+   * @brief Index of the element with the smallest absolute value, computed on the GPU in batches.
+   *
+   * Each batch yields one candidate (cublas reports a 1-based position, which
+   * is converted to a 0-based index into x); the candidates are then compared
+   * on the host with abs().
+   *
+   * @param x Input array; must not be empty.
+   * @return 0-based index into x.
+   * @throws std::runtime_error on a null pointer, empty input or insufficient device memory.
+   * @throws cuda_error when a cublas call fails.
+   */
+  template<class T> EXPORTGPUCORE size_t amin( hoCuNDArray<T>* x )
+  {
+    if( !x )
+      throw std::runtime_error("amin: 0x0 pointer provided.");
+
+    const size_t num_elements = x->get_number_of_elements();
+    if( num_elements == 0 )
+      throw std::runtime_error("amin: array is empty.");
+
+    cudaDeviceManager *manager = cudaDeviceManager::Instance();
+    int device = manager->getCurrentDevice();
+    size_t free = manager->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); //Ensure 1Mb allocations
+    batchSize = std::min(batchSize,num_elements);
+    if( batchSize == 0 )
+      throw std::runtime_error("amin: not enough free device memory.");
+
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    std::vector<size_t> results;
+
+    for (size_t offset = 0; offset < num_elements; offset += batchSize){
+
+      // The last batch may be shorter than batchSize; only curSize elements
+      // of the device buffer are valid and only those are passed to cublas.
+      const size_t curSize = std::min(batchSize,num_elements-offset);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      // Release the handle before checking the status so that a failing call
+      // cannot leave the device mutex locked.
+      int cur_ret;
+      cublasHandle_t handle = manager->lockHandle(device);
+      cublasStatus_t status = cublas_amin( handle, static_cast<int>(curSize),
+					   cuX.get_data_ptr(), 1,
+					   &cur_ret );
+      manager->unlockHandle(device);
+      CUBLAS_CALL(status);
+
+      // 1-based position within the batch -> 0-based index into x.
+      results.push_back(offset+static_cast<size_t>(cur_ret-1));
+    }
+
+    // Pick the best candidate; element 0 is the starting point.
+    size_t res =0;
+    for (size_t i =0; i < results.size(); i++){
+      if (abs(x_ptr[results[i]]) < abs(x_ptr[res])) res = results[i];
+    }
+    return res;
+  }
+
+  /**
+   * @brief Index of the element with the largest absolute value, computed on the GPU in batches.
+   *
+   * Each batch yields one candidate (cublas reports a 1-based position, which
+   * is converted to a 0-based index into x); the candidates are then compared
+   * on the host with abs().
+   *
+   * @param x Input array; must not be empty.
+   * @return 0-based index into x.
+   * @throws std::runtime_error on a null pointer, empty input or insufficient device memory.
+   * @throws cuda_error when a cublas call fails.
+   */
+  template<class T> EXPORTGPUCORE size_t amax( hoCuNDArray<T>* x )
+  {
+    if( !x )
+      throw std::runtime_error("amax: 0x0 pointer provided.");
+
+    const size_t num_elements = x->get_number_of_elements();
+    if( num_elements == 0 )
+      throw std::runtime_error("amax: array is empty.");
+
+    cudaDeviceManager *manager = cudaDeviceManager::Instance();
+    int device = manager->getCurrentDevice();
+    size_t free = manager->getFreeMemory(device);
+    size_t batchSize = 1024*1024*(free/(sizeof(T)*1024*1024)); //Ensure 1Mb allocations
+    batchSize = std::min(batchSize,num_elements);
+    if( batchSize == 0 )
+      throw std::runtime_error("amax: not enough free device memory.");
+
+    T* x_ptr = x->get_data_ptr();
+    std::vector<size_t> dims;
+    dims.push_back(batchSize);
+    cuNDArray<T> cuX(&dims);
+    std::vector<size_t> results;
+
+    for (size_t offset = 0; offset < num_elements; offset += batchSize){
+
+      // The last batch may be shorter than batchSize; only curSize elements
+      // of the device buffer are valid and only those are passed to cublas.
+      const size_t curSize = std::min(batchSize,num_elements-offset);
+      CUDA_CALL(cudaMemcpy(cuX.get_data_ptr(),x_ptr+offset,curSize*sizeof(T),cudaMemcpyHostToDevice));
+
+      // Release the handle before checking the status so that a failing call
+      // cannot leave the device mutex locked.
+      int cur_ret;
+      cublasHandle_t handle = manager->lockHandle(device);
+      cublasStatus_t status = cublas_amax( handle, static_cast<int>(curSize),
+					   cuX.get_data_ptr(), 1,
+					   &cur_ret );
+      manager->unlockHandle(device);
+      CUBLAS_CALL(status);
+
+      // 1-based position within the batch -> 0-based index into x.
+      results.push_back(offset+static_cast<size_t>(cur_ret-1));
+    }
+
+    // Pick the best candidate; element 0 is the starting point.
+    size_t res =0;
+    for (size_t i =0; i < results.size(); i++){
+      if (abs(x_ptr[results[i]]) > abs(x_ptr[res])) res = results[i];
+    }
+    return res;
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE float dot(hoCuNDArray<float>*,hoCuNDArray<float>*,bool);
+  template EXPORTGPUCORE float nrm2(hoCuNDArray<float>*);
+  template EXPORTGPUCORE void axpy(float,hoCuNDArray<float>*,hoCuNDArray<float>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<float>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<float>*);
+  template EXPORTGPUCORE float asum(hoCuNDArray<float>*);
+
+  template EXPORTGPUCORE double dot(hoCuNDArray<double>*,hoCuNDArray<double>*,bool);
+  template EXPORTGPUCORE double nrm2(hoCuNDArray<double>*);
+  template EXPORTGPUCORE void axpy(double,hoCuNDArray<double>*,hoCuNDArray<double>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<double>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<double>*);
+  template EXPORTGPUCORE double asum(hoCuNDArray<double>*);
+
+  template EXPORTGPUCORE float_complext dot(hoCuNDArray<float_complext>*,hoCuNDArray<float_complext>*,bool);
+  template EXPORTGPUCORE float nrm2(hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float_complext,hoCuNDArray<float_complext>*,hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE void axpy(float,hoCuNDArray<float_complext>*,hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<float_complext>*);
+  template EXPORTGPUCORE float asum(hoCuNDArray<float_complext>*);
+
+  template EXPORTGPUCORE double_complext dot(hoCuNDArray<double_complext>*,hoCuNDArray<double_complext>*,bool);
+  template EXPORTGPUCORE double nrm2(hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double_complext,hoCuNDArray<double_complext>*,hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE void axpy(double,hoCuNDArray<double_complext>*,hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amin(hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE size_t amax(hoCuNDArray<double_complext>*);
+  template EXPORTGPUCORE double asum(hoCuNDArray<double_complext>*);
+}
diff --git a/toolboxes/core/gpu/hoCuNDArray_blas.h b/toolboxes/core/gpu/hoCuNDArray_blas.h
new file mode 100644
index 0000000..e8e44b0
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_blas.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "cudaDeviceManager.h"
+#include "gpucore_export.h"
+
+namespace Gadgetron{
+  // BLAS level-1 style routines for hoCuNDArray. NOTE(review): 'cc' in dot() presumably selects complex conjugation - confirm.
+  template<class T> EXPORTGPUCORE T dot( hoCuNDArray<T> *x, hoCuNDArray<T> *y, bool cc = true );
+  
+  template<class T> EXPORTGPUCORE typename realType<T>::Type nrm2( hoCuNDArray<T> *x );
+  
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray<T> *x, hoCuNDArray<T> *y );
+  template<class T> EXPORTGPUCORE void axpy( T a, hoCuNDArray< complext<T> > *x, hoCuNDArray< complext<T> > *y );
+  
+  /**
+   * @brief Gets the index of the element with minimum absolute value
+   * @param x Input data
+   * @return index of the element with minimum absolute value
+   */
+  template<class T> EXPORTGPUCORE size_t amin( hoCuNDArray<T> *x);
+  
+  /**
+   * @brief Gets the index of the element with maximum absolute value
+   * @param x Input data
+   * @return index of the element with maximum absolute value
+   * @details Note that this returns the C-style index and NOT the Fortran index.
+   */
+  template<class T> EXPORTGPUCORE size_t amax( hoCuNDArray<T> *x );
+  
+  template<class T> EXPORTGPUCORE typename realType<T>::Type asum( hoCuNDArray<T> *x );
+}
diff --git a/toolboxes/core/gpu/hoCuNDArray_elemwise.h b/toolboxes/core/gpu/hoCuNDArray_elemwise.h
new file mode 100644
index 0000000..ba3aaa8
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_elemwise.h
@@ -0,0 +1,8 @@
+/**
+ * \file hoCuNDArray_elemwise.h
+ * \brief Element-wise math operations on the hoCuNDArray class. For now just delegates everything to hoNDArray operators.
+ */
+
+#pragma once
+
+#include "hoNDArray_elemwise.h"
diff --git a/toolboxes/core/gpu/hoCuNDArray_math.h b/toolboxes/core/gpu/hoCuNDArray_math.h
new file mode 100644
index 0000000..36ae858
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_math.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "hoCuNDArray_blas.h"
+#include "hoCuNDArray_elemwise.h"
+#include "hoCuNDArray_operators.h"
+#include "hoCuNDArray_utils.h"
diff --git a/toolboxes/core/gpu/hoCuNDArray_operators.h b/toolboxes/core/gpu/hoCuNDArray_operators.h
new file mode 100644
index 0000000..d2daa11
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_operators.h
@@ -0,0 +1,9 @@
+/**
+ * \file hoCuNDArray_operators.h
+ * \brief Operators on the hoCuNDArray class. For now just delegates everything to hoNDArray operators.
+ */
+
+#pragma once
+
+#include "hoNDArray_operators.h"
+
diff --git a/toolboxes/core/gpu/hoCuNDArray_utils.h b/toolboxes/core/gpu/hoCuNDArray_utils.h
new file mode 100644
index 0000000..310a805
--- /dev/null
+++ b/toolboxes/core/gpu/hoCuNDArray_utils.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "hoNDArray_utils.h"
+#include "complext.h"
+
+namespace Gadgetron{
+
+  /**
+   * @brief Calculates the elementwise absolute value of the array by delegating to the hoNDArray overload of abs()
+   * @param[in] data Input data
+   * @return A new array containing the elementwise absolute value of data (the hoNDArray result pointer, cast to hoCuNDArray)
+   */
+  template<class T>
+  boost::shared_ptr<hoCuNDArray<typename realType<T>::Type> > abs(hoCuNDArray<T> *data){
+    return boost::static_pointer_cast<hoCuNDArray<typename realType<T>::Type> >(abs(static_cast<hoNDArray<T>* >(data)));
+  }
+}
diff --git a/toolboxes/core/gpu/radial_utilities.cu b/toolboxes/core/gpu/radial_utilities.cu
new file mode 100644
index 0000000..945dce4
--- /dev/null
+++ b/toolboxes/core/gpu/radial_utilities.cu
@@ -0,0 +1,427 @@
+#include "radial_utilities.h"
+#include "vector_td_operators.h"
+#include "vector_td_utilities.h"
+#include "real_utilities.h"
+#include "real_utilities_device.h"
+#include "check_CUDA.h"
+
+#include <math_constants.h>
+#include <vector>
+#include <iostream>
+
+using namespace std;
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE> __inline__ __device__ REAL get_angle_step_GR();
+  // Angular step (in radians) per golden ratio profile; the second template argument is 0 for GR_SMALLEST and 1 for GR_ORIGINAL.
+  template<> __inline__ __device__ float get_angle_step_GR<float,0>(){ return CUDART_PI_F*(3.0f-::sqrtf(5.0f))*0.5f; }   // GR_SMALLEST: pi*(3-sqrt(5))/2
+  template<> __inline__ __device__ float get_angle_step_GR<float,1>(){ return CUDART_PI_F/((::sqrtf(5.0f)+1.0f)*0.5f); } // GR_ORIGINAL: pi/((sqrt(5)+1)/2)
+  template<> __inline__ __device__ double get_angle_step_GR<double,0>(){ return CUDART_PI*(3.0-::sqrt(5.0))*0.5; }       // GR_SMALLEST: pi*(3-sqrt(5))/2
+  template<> __inline__ __device__ double get_angle_step_GR<double,1>(){ return CUDART_PI/((::sqrt(5.0)+1.0)*0.5); }     // GR_ORIGINAL: pi/((sqrt(5)+1)/2)
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE> __global__ void
+  compute_radial_trajectory_golden_ratio_2d_kernel( typename reald<REAL,2>::Type *co, REAL angular_offset )
+  {
+    const unsigned int index = blockIdx.x*blockDim.x + threadIdx.x;              
+    // Each block handles one profile (blockIdx.x) and each thread one sample on that profile (threadIdx.x).
+    const REAL samples_per_profile = (REAL) blockDim.x;
+    const REAL bias = samples_per_profile * REAL(0.5);
+    const REAL sample_idx_on_profile = (REAL)threadIdx.x;
+    const REAL profile = (REAL)blockIdx.x;
+    const REAL angle_step = get_angle_step_GR<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE>();
+    // Profile angle: (profile + angular_offset) golden ratio steps, shifted by pi.
+    REAL cos_angle, sin_angle;
+    gad_sincos<REAL>( (profile+angular_offset)*angle_step+get_pi<REAL>(), &sin_angle, &cos_angle );
+    // Position along the profile direction; dividing by samples_per_profile puts the signed radius in [-1/2;1/2).
+    typename reald<REAL,2>::Type sample_pos; 
+    sample_pos.vec[0] = (sample_idx_on_profile-bias)*cos_angle/samples_per_profile;
+    sample_pos.vec[1] = (sample_idx_on_profile-bias)*sin_angle/samples_per_profile;
+  
+    co[index] = sample_pos;
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_golden_ratio_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, 
+                                             unsigned int num_frames, unsigned int profile_offset, GOLDEN_RATIO_ANGULAR_STEP_SIZE mode )
+  {
+    typedef typename reald<REAL,2>::Type T;
+  
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+  
+    if( num_samples_per_profile%warp_size ){
+      cout << endl << "compute_radial_trajectory_golden_ratio_2d: #samples/profile is not a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+
+    unsigned int number_of_samples_per_frame = num_samples_per_profile * num_profiles_per_frame;
+
+    // Allocate space for result
+    vector<size_t> dims; dims.push_back( number_of_samples_per_frame ); dims.push_back( num_frames );
+    boost::shared_ptr< cuNDArray<T> > co( new cuNDArray<T>(&dims) );
+  
+    if(!co.get()){
+      cout << endl << "Error:: compute_radial_trajectory_golden_ratio_2d: memory allocation failed." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+  
+    // Set dimensions of grid/blocks.
+    dim3 dimBlock( num_samples_per_profile );
+    dim3 dimGrid( num_profiles_per_frame*num_frames );
+  
+    // Invoke kernel (nvcc has been protesting heavily on various other ways to do this...)
+    if( mode == GR_SMALLEST )
+      compute_radial_trajectory_golden_ratio_2d_kernel<REAL,0><<< dimGrid, dimBlock >>> 
+        ( co->get_data_ptr(), (REAL)profile_offset );
+    else
+      compute_radial_trajectory_golden_ratio_2d_kernel<REAL,1><<< dimGrid, dimBlock >>> 
+        ( co->get_data_ptr(), (REAL)profile_offset );
+    
+    CHECK_FOR_CUDA_ERROR();
+  
+    return co;
+  }
+
+  template<class REAL> __global__ void
+  compute_radial_trajectory_fixed_angle_2d_kernel( typename reald<REAL,2>::Type *co, REAL one_over_num_profiles_per_frame, REAL one_over_num_frames, REAL angular_offset )
+  {
+    const unsigned int index = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+    // Grid layout: blockIdx.x = profile within the frame, blockIdx.y = frame, threadIdx.x = sample on the profile.
+    const REAL samples_per_profile = (REAL) blockDim.x;
+    const REAL bias = samples_per_profile * REAL(0.5);
+    const REAL sample_idx_on_profile = (REAL)threadIdx.x;
+    const REAL lprofile = (REAL)blockIdx.x;
+    const REAL frame = (REAL)blockIdx.y;
+    // Angle: (lprofile + frame*one_over_num_frames) * one_over_num_profiles_per_frame * pi, plus angular_offset, plus pi.
+    REAL cos_angle, sin_angle;
+    gad_sincos<REAL>( (lprofile+frame*one_over_num_frames)*one_over_num_profiles_per_frame*get_pi<REAL>()+angular_offset+get_pi<REAL>(), &sin_angle, &cos_angle );
+    // Position along the profile direction; dividing by samples_per_profile puts the signed radius in [-1/2;1/2).
+    typename reald<REAL,2>::Type sample_pos; 
+    sample_pos.vec[0] = (sample_idx_on_profile-bias)*cos_angle/samples_per_profile;
+    sample_pos.vec[1] = (sample_idx_on_profile-bias)*sin_angle/samples_per_profile;
+  
+    co[index] = sample_pos;
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > > 
+  compute_radial_trajectory_fixed_angle_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, unsigned int num_frames, REAL angular_offset )
+  {
+    typedef typename reald<REAL,2>::Type T;
+  
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+  
+    if( num_samples_per_profile%warp_size ){
+      cout << endl << "Error:: compute_radial_trajectory_fixed_angle_2d: #samples/profile is not a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<T> >();
+    }
+
+    unsigned int number_of_samples_per_frame = num_samples_per_profile * num_profiles_per_frame;
+
+    // Allocate space for result
+    vector<size_t> dims; 
+    dims.push_back( number_of_samples_per_frame ); 
+    dims.push_back( num_frames );
+  
+    boost::shared_ptr< cuNDArray<T> > co( new cuNDArray<T>(&dims) );
+  
+    // Set dimensions of grid/blocks.
+    dim3 dimBlock( num_samples_per_profile );
+    dim3 dimGrid( num_profiles_per_frame, num_frames );
+  
+    // Invoke kernel
+    compute_radial_trajectory_fixed_angle_2d_kernel<REAL><<< dimGrid, dimBlock >>> ( co->get_data_ptr(), REAL(1)/(REAL)num_profiles_per_frame, REAL(1)/(REAL)num_frames, angular_offset );
+  
+    CHECK_FOR_CUDA_ERROR();
+  
+    return co;
+  }
+
+  // Find the (eight) neighbors to a given radial sample index
+  // Returns the sample's own position. p1/p2 receive the previous/next sample on the same profile; p3..p5 and p6..p8 receive the previous/center/next samples on the closest profile found on either side of it.
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE, bool GR> 
+  __inline__ __device__ typename reald<REAL,2>::Type
+  compute_radial_neighbors( REAL sample_idx_on_profile, REAL angular_offset, REAL alpha, 
+                            REAL one_over_radial_oversampling_factor, REAL one_over_num_profiles,
+                            REAL bias, REAL samples_per_profile, REAL profile, REAL num_profiles,
+                            typename reald<REAL,2>::Type *p1, typename reald<REAL,2>::Type *p2, 
+                            typename reald<REAL,2>::Type *p3, typename reald<REAL,2>::Type *p4,
+                            typename reald<REAL,2>::Type *p5, typename reald<REAL,2>::Type *p6, 
+                            typename reald<REAL,2>::Type *p7, typename reald<REAL,2>::Type *p8  )
+  {
+    // The sample positions (scales) can be either of the _local_ indices 'sample_idx_on_profile' or 'samples_per_profile'-'sample_idx_on_profile'
+    // Beware of "skewness" around the origin, i.e. +1 sample on one side
+    const REAL ctr_scale       = alpha*((sample_idx_on_profile-bias)*one_over_radial_oversampling_factor);
+    const REAL ctr_scale_inv   = alpha*((samples_per_profile-sample_idx_on_profile-bias)*one_over_radial_oversampling_factor);
+    const REAL prev_scale      = alpha*((sample_idx_on_profile-bias-1)*one_over_radial_oversampling_factor);
+    const REAL prev_scale_inv  = alpha*((samples_per_profile-(sample_idx_on_profile-1)-bias)*one_over_radial_oversampling_factor);
+    const REAL next_scale      = alpha*((sample_idx_on_profile-bias+1)*one_over_radial_oversampling_factor);
+    const REAL next_scale_inv  = alpha*((samples_per_profile-(sample_idx_on_profile+1)-bias)*one_over_radial_oversampling_factor);
+  
+    // Unit circle position for current projection
+    REAL cos_angle, sin_angle;
+  
+    switch(GR){
+    
+    case true: // golden ratio
+      {
+        const REAL angle_step = get_angle_step_GR<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE>();
+        gad_sincos<REAL>( (profile+angular_offset)*angle_step, &sin_angle, &cos_angle );
+      }
+      break;	  
+    case false: // fixed angle (angular_offset is not used in this mode)
+      {
+        gad_sincos<REAL>( profile*one_over_num_profiles*get_pi<REAL>(), &sin_angle, &cos_angle );	}
+      break;
+    }
+  
+    // Find the normal to the current projection direction
+    typename reald<REAL,2>::Type normal; normal.vec[0] = -sin_angle; normal.vec[1] = cos_angle;
+  
+    // The position of the idx itself
+    typename reald<REAL,2>::Type sample_pos; sample_pos.vec[0] = ctr_scale*cos_angle; sample_pos.vec[1] = ctr_scale*sin_angle;
+  
+    // The positions of the previous and next sample
+    (*p1).vec[0] = prev_scale*cos_angle; (*p1).vec[1] = prev_scale*sin_angle;
+    (*p2).vec[0] = next_scale*cos_angle; (*p2).vec[1] = next_scale*sin_angle;
+  
+    // Initialize remaining points;
+    (*p3).vec[0] = (*p4).vec[0] = (*p5).vec[0] = (*p6).vec[0] = (*p7).vec[0] = (*p8).vec[0] = 
+      (*p3).vec[1] = (*p4).vec[1] = (*p5).vec[1] = (*p6).vec[1] = (*p7).vec[1] = (*p8).vec[1] = get_max<REAL>(); // far away...
+  
+    // Run through all projections to find the closest neighbors
+  
+    for( unsigned int i=0; i<num_profiles; i++ ){
+    
+      if( i == profile )
+        continue;
+    
+      // Unit circle position projection 'i'
+      switch(GR)
+        {
+        case true:
+          {
+            const REAL angle_step = get_angle_step_GR<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE>();
+            gad_sincos<REAL>( ((REAL)i+angular_offset)*angle_step, &sin_angle, &cos_angle );
+          }
+          break;
+
+        case false:
+          {
+            gad_sincos<REAL>( (REAL)i*one_over_num_profiles*get_pi<REAL>(), &sin_angle, &cos_angle );
+          }
+          break;	
+        }
+
+      // Determine sample positions on projection ('_1' uses the same local index, '_2' the mirrored index on the opposite side of the origin)
+      typename reald<REAL,2>::Type prev_pos_1;  prev_pos_1.vec[0] = prev_scale*cos_angle;      prev_pos_1.vec[1] = prev_scale*sin_angle;
+      typename reald<REAL,2>::Type prev_pos_2;  prev_pos_2.vec[0] = prev_scale_inv*cos_angle;  prev_pos_2.vec[1] = prev_scale_inv*sin_angle;
+      typename reald<REAL,2>::Type ctr_pos_1;   ctr_pos_1.vec[0]  = ctr_scale*cos_angle;       ctr_pos_1.vec[1]  = ctr_scale*sin_angle;
+      typename reald<REAL,2>::Type ctr_pos_2;   ctr_pos_2.vec[0]  = ctr_scale_inv*cos_angle;   ctr_pos_2.vec[1]  = ctr_scale_inv*sin_angle;
+      typename reald<REAL,2>::Type next_pos_1;  next_pos_1.vec[0] = next_scale*cos_angle;      next_pos_1.vec[1] = next_scale*sin_angle;
+      typename reald<REAL,2>::Type next_pos_2;  next_pos_2.vec[0] = next_scale_inv*cos_angle;  next_pos_2.vec[1] = next_scale_inv*sin_angle;
+    
+      // The dot product is used to ensure we find a neighbor on each side
+      if( dot<REAL,2>(ctr_pos_1-sample_pos, normal) > REAL(0) ){
+    
+        if( norm_squared<REAL>(ctr_pos_1-sample_pos) < norm_squared<REAL>(*p4-sample_pos) ){
+          *p3 = prev_pos_1;
+          *p4 = ctr_pos_1;
+          *p5 = next_pos_1;
+        }
+      }
+      else{
+     
+        if( norm_squared<REAL>(ctr_pos_1-sample_pos) < norm_squared<REAL>(*p7-sample_pos) ){
+          *p6 = prev_pos_1;
+          *p7 = ctr_pos_1;
+          *p8 = next_pos_1;
+        }
+      }
+  
+      // The dot product is used to ensure we find a neighbor on each side
+      if( dot<REAL,2>(ctr_pos_2-sample_pos, normal) >  REAL(0) ){
+  
+        if( norm_squared<REAL>(ctr_pos_2-sample_pos) < norm_squared<REAL>(*p4-sample_pos) ){
+          *p3 = prev_pos_2;
+          *p4 = ctr_pos_2;
+          *p5 = next_pos_2;
+        }
+      }
+      else{
+      
+        if( norm_squared<REAL>(ctr_pos_2-sample_pos) < norm_squared<REAL>(*p7-sample_pos) ){
+          *p6 = prev_pos_2;
+          *p7 = ctr_pos_2;
+          *p8 = next_pos_2;
+        }
+      }
+    }
+  
+    return sample_pos;
+  }
+
+  // Computes one density compensation weight per sample; expects a (2, num_profiles) grid of samples_per_profile/2 threads.
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE, bool GR> __global__ void
+  compute_radial_dcw_2d_kernel( REAL alpha, REAL one_over_radial_oversampling_factor, REAL one_over_num_profiles, REAL angular_offset, REAL *dcw )
+  {
+    const REAL samples_per_profile = (REAL) (blockDim.x<<1);
+    const REAL sample_idx_on_profile = (REAL)(blockIdx.x*blockDim.x+threadIdx.x);
+    const REAL num_profiles = (REAL)gridDim.y;
+    const REAL profile = (REAL)blockIdx.y;
+    const REAL bias = samples_per_profile*REAL(0.5);
+    // Output index, computed in integer arithmetic so that it stays exact beyond 2^24 when REAL is float
+    const unsigned int index = blockIdx.y*(blockDim.x<<1) + blockIdx.x*blockDim.x + threadIdx.x;
+  
+    REAL weight;
+  
+    if( sample_idx_on_profile == blockDim.x ){
+      // Special case - center of profile/k-space
+      const REAL radius = (alpha*one_over_radial_oversampling_factor)*REAL(0.5);
+      const REAL area = radius*radius*get_pi<REAL>();
+      weight = area/num_profiles;
+    }
+    else{
+    
+      // General case - all neighbors exist
+    
+      // Compute sample positions for the current sample and all neighbors
+      // The ordering of p1..p8 in the call below follows the edge of the "Voronoi polygon"
+    
+      typename reald<REAL,2>::Type sample_pos;
+      typename reald<REAL,2>::Type p1, p2, p3, p4, p5, p6, p7, p8;
+    
+      sample_pos = compute_radial_neighbors<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE,GR>
+        ( sample_idx_on_profile, angular_offset, alpha, 
+          one_over_radial_oversampling_factor, one_over_num_profiles, bias, samples_per_profile, profile, num_profiles,
+          &p1, &p5, &p2, &p3, &p4, &p8, &p7, &p6 );
+    
+      // Find midpoints of lines from sample_pos to all other points.
+      p1 = REAL(0.5)*(sample_pos+p1); // computing "sample_pos+(p1-sample_pos)/2"
+      p2 = REAL(0.5)*(sample_pos+p2);
+      p3 = REAL(0.5)*(sample_pos+p3);
+      p4 = REAL(0.5)*(sample_pos+p4);
+      p5 = REAL(0.5)*(sample_pos+p5);
+      p6 = REAL(0.5)*(sample_pos+p6);
+      p7 = REAL(0.5)*(sample_pos+p7);
+      p8 = REAL(0.5)*(sample_pos+p8);
+    
+      // The weight is determined by the area of the polygon (http://local.wasp.uwa.edu.au/~pbourke/geometry/polyarea/)
+      weight = REAL(0.5)*
+        ((p1.vec[0]*p2.vec[1]-p2.vec[0]*p1.vec[1])+
+         (p2.vec[0]*p3.vec[1]-p3.vec[0]*p2.vec[1])+
+         (p3.vec[0]*p4.vec[1]-p4.vec[0]*p3.vec[1])+
+         (p4.vec[0]*p5.vec[1]-p5.vec[0]*p4.vec[1])+
+         (p5.vec[0]*p6.vec[1]-p6.vec[0]*p5.vec[1])+
+         (p6.vec[0]*p7.vec[1]-p7.vec[0]*p6.vec[1])+
+         (p7.vec[0]*p8.vec[1]-p8.vec[0]*p7.vec[1])+
+         (p8.vec[0]*p1.vec[1]-p1.vec[0]*p8.vec[1]));                        
+    
+      if( weight<REAL(0) ) weight *= -REAL(1);
+    }
+  
+    dcw[index] = weight;
+  }
+
+  template<class REAL, unsigned int GOLDEN_RATIO_ANGULAR_STEP_SIZE, bool GR> boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_2d( unsigned int samples_per_profile, unsigned int num_profiles, 
+                         REAL alpha, REAL one_over_radial_oversampling_factor, unsigned int profile_offset = 0 )
+  {
+    if( num_profiles < 4 ){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: use at least four profiles" << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+  
+    // Get device properties
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+    const unsigned int warp_size = deviceProp.warpSize;
+  
+    if( samples_per_profile%2 ){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: samples/profile must be even." << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+
+    if( samples_per_profile%warp_size ){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: samples/profile number a multiple of the device's warp size." << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+
+    unsigned int number_of_samples = samples_per_profile * num_profiles;
+  
+    // Allocate space for result
+    vector<size_t> dims; dims.push_back( number_of_samples );
+    boost::shared_ptr< cuNDArray<REAL> > dcw( new cuNDArray<REAL>(&dims) );
+  
+    if(!dcw.get()){
+      cout << endl << "Error:: compute_radial_dcw_<*>_2d: memory allocation failed." << endl;
+      return boost::shared_ptr< cuNDArray<REAL> >();
+    }
+  
+    // Set dimensions of grid/blocks. (division by two due to resource limitations)
+    dim3 dimBlock( samples_per_profile>>1 );
+    dim3 dimGrid( 2, num_profiles );
+  
+    // Invoke kernel
+    compute_radial_dcw_2d_kernel<REAL,GOLDEN_RATIO_ANGULAR_STEP_SIZE,GR><<< dimGrid, dimBlock >>> 
+      ( alpha, one_over_radial_oversampling_factor, REAL(1)/(REAL)num_profiles, (REAL)profile_offset, dcw->get_data_ptr() );
+  
+    CHECK_FOR_CUDA_ERROR();
+  
+    return dcw;
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_golden_ratio_2d( unsigned int samples_per_profile, unsigned int num_profiles, 
+                                      REAL alpha, REAL one_over_radial_oversampling_factor, unsigned int profile_offset,
+                                      GOLDEN_RATIO_ANGULAR_STEP_SIZE mode)
+  {
+    switch( mode ){ // map the runtime mode onto the compile-time step-size argument of the shared implementation
+    case GR_SMALLEST:
+      return compute_radial_dcw_2d<REAL,0,true>( samples_per_profile, num_profiles, alpha, one_over_radial_oversampling_factor, profile_offset );
+    case GR_ORIGINAL:
+      return compute_radial_dcw_2d<REAL,1,true>( samples_per_profile, num_profiles, alpha, one_over_radial_oversampling_factor, profile_offset );
+    default:
+      throw std::runtime_error("\ncompute_radial_dcw_golden_ratio_2d() :: unexpected mode\n");
+    }
+  }
+
+  template<class REAL> boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_fixed_angle_2d( unsigned int samples_per_profile, unsigned int num_profiles, 
+                                     REAL alpha, REAL one_over_radial_oversampling_factor )
+  {
+    // GR=false selects the fixed angle code path, in which the step-size template argument (GR_ORIGINAL here) is ignored;
+    // profile_offset is left at its default value of 0.
+    return compute_radial_dcw_2d<REAL,GR_ORIGINAL,false>( samples_per_profile, num_profiles, alpha, one_over_radial_oversampling_factor );
+  }
+
+  //
+  // Instantiation
+  //
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<float,2>::Type > > 
+  compute_radial_trajectory_fixed_angle_2d<float>( unsigned int, unsigned int, unsigned int, float );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<double,2>::Type > > 
+  compute_radial_trajectory_fixed_angle_2d<double>( unsigned int, unsigned int, unsigned int, double );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<float,2>::Type > > 
+  compute_radial_trajectory_golden_ratio_2d<float>( unsigned int, unsigned int, unsigned int, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<double,2>::Type > > 
+  compute_radial_trajectory_golden_ratio_2d<double>( unsigned int, unsigned int, unsigned int, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> >compute_radial_dcw_fixed_angle_2d<float>( unsigned int, unsigned int, float, float);
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> >compute_radial_dcw_fixed_angle_2d<double>( unsigned int, unsigned int, double, double );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<float> >
+  compute_radial_dcw_golden_ratio_2d<float>( unsigned int, unsigned int, float, float, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+
+  template EXPORTGPUCORE boost::shared_ptr< cuNDArray<double> >
+  compute_radial_dcw_golden_ratio_2d<double>( unsigned int, unsigned int, double, double, unsigned int, GOLDEN_RATIO_ANGULAR_STEP_SIZE );
+}
diff --git a/toolboxes/core/gpu/radial_utilities.h b/toolboxes/core/gpu/radial_utilities.h
new file mode 100644
index 0000000..f9b2c1f
--- /dev/null
+++ b/toolboxes/core/gpu/radial_utilities.h
@@ -0,0 +1,37 @@
+#pragma once
+#include "gpucore_export.h"
+
+#include "cuNDArray.h"
+#include "vector_td.h"
+
+#include <boost/smart_ptr.hpp>
+
+namespace Gadgetron{
+  // Selects the angular step between consecutive golden ratio profiles.
+  enum GOLDEN_RATIO_ANGULAR_STEP_SIZE {
+    GR_SMALLEST = 0, // 180*(3-sqrt(5.0))/2.0    = 68.7539 degrees
+    GR_ORIGINAL = 1  // 180/((sqrt(5.0)+1.0)/2.0) = 111.2461 degrees
+  };
+
+  // Compute fixed angle radial trajectory in the normalized range [-1/2;1/2]
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_fixed_angle_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, 
+                                            unsigned int num_frames, REAL angular_offset = REAL(0) );
+
+  // Compute golden ratio radial trajectory in the normalized range [-1/2;1/2]
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray< typename reald<REAL,2>::Type > >
+  compute_radial_trajectory_golden_ratio_2d( unsigned int num_samples_per_profile, unsigned int num_profiles_per_frame, 
+                                             unsigned int num_frames, 
+                                             unsigned int profile_offset = 0, GOLDEN_RATIO_ANGULAR_STEP_SIZE = GR_ORIGINAL );
+
+  // Compute fixed angle radial density compensation weights (a function of the chosen reconstruction settings: matrix_size and oversampling factor)
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_fixed_angle_2d( unsigned int num_samples_per_profile, unsigned int num_profiles, 
+                                     REAL alpha, REAL one_over_radial_oversampling_factor);
+
+  // Compute golden ratio radial density compensation weights (a function of the chosen reconstruction settings: matrix_size and oversampling factor)
+  template<class REAL> EXPORTGPUCORE boost::shared_ptr< cuNDArray<REAL> >
+  compute_radial_dcw_golden_ratio_2d( unsigned int num_samples_per_profile, unsigned int num_profiles, 
+                                      REAL alpha, REAL one_over_radial_oversampling_factor, 
+                                      unsigned int profile_offset = 0, GOLDEN_RATIO_ANGULAR_STEP_SIZE = GR_ORIGINAL );
+}
diff --git a/toolboxes/core/gpu/real_utilities_device.h b/toolboxes/core/gpu/real_utilities_device.h
new file mode 100644
index 0000000..57059e1
--- /dev/null
+++ b/toolboxes/core/gpu/real_utilities_device.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <math_constants.h>
+#include <math_functions.h>
+
+//
+// Math prototypes
+//
+
+template<class REAL> __inline__ __host__ __device__ void gad_sincos( REAL angle, REAL *a, REAL *b ); // *a = sin(angle), *b = cos(angle)
+template<class REAL> __inline__ __host__ __device__ REAL gad_rsqrt( REAL val );                      // reciprocal square root of val
+
+
+//
+// Implementation (thin wrappers selecting the single or double precision math function)
+//
+
+template<> __inline__ __host__ __device__ void gad_sincos<float>( float angle, float *a, float *b ){ sincosf(angle, a,b); }
+template<> __inline__ __host__ __device__ void gad_sincos<double>( double angle, double *a, double *b ){ sincos(angle, a,b); }
+
+template<> __inline__ __host__ __device__ float gad_rsqrt<float>( float val ){ return rsqrtf(val); }
+template<> __inline__ __host__ __device__ double gad_rsqrt<double>( double val ){ return rsqrt(val); }
diff --git a/toolboxes/core/gpu/setup_grid.h b/toolboxes/core/gpu/setup_grid.h
new file mode 100644
index 0000000..34b402c
--- /dev/null
+++ b/toolboxes/core/gpu/setup_grid.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "cudaDeviceManager.h"
+#include "GadgetronCuException.h"
+
+namespace Gadgetron {
+
+  inline 
+  void setup_grid( unsigned int number_of_elements, dim3 *blockDim, dim3* gridDim, unsigned int num_batches = 1 )
+  {    
+    const int device_id = cudaDeviceManager::Instance()->getCurrentDevice();
+    const int grid_limit = cudaDeviceManager::Instance()->max_griddim(device_id);
+    const int block_limit = cudaDeviceManager::Instance()->max_blockdim(device_id);
+
+    // Start from a one-dimensional launch of 256-thread blocks, with the batches along grid.y
+    *blockDim = dim3(256);
+    *gridDim = dim3((number_of_elements+blockDim->x-1)/blockDim->x, num_batches);
+
+    // Too many blocks for grid.x: first grow the block to the device maximum
+    if( gridDim->x > grid_limit ){
+      blockDim->x = block_limit;
+      gridDim->x = (number_of_elements+blockDim->x-1)/blockDim->x;
+    }
+    // Still too many: fold the blocks into a roughly square arrangement, spilling the remainder into grid.y
+    if( gridDim->x > grid_limit ){
+      const float blocks_needed = float(number_of_elements)/float(blockDim->x);
+      gridDim->x = (unsigned int)std::floor(std::sqrt(blocks_needed));
+      const unsigned int elements_per_row = blockDim->x*gridDim->x;
+      gridDim->y *= ((number_of_elements+elements_per_row-1)/elements_per_row);
+    }
+    if( gridDim->x > grid_limit || gridDim->y > grid_limit ){
+      // If this ever becomes an issue, there is an additional grid dimension to explore for compute models >= 2.0.
+      throw cuda_error("setup_grid(): too many elements requested.");
+    }
+  }
+}
diff --git a/toolboxes/core/real_utilities.h b/toolboxes/core/real_utilities.h
new file mode 100644
index 0000000..66aaf7d
--- /dev/null
+++ b/toolboxes/core/real_utilities.h
@@ -0,0 +1,72 @@
+/** \file real_utilities.h
+    \brief A simple template based interface to some common C float/double constants to ease writing of templated code.
+*/
+
+#pragma once
+
+#include "core_defines.h"
+
+#ifdef _USE_MATH_DEFINES
+#include <math.h>
+#else
+#define _USE_MATH_DEFINES
+#include <math.h>
+#undef _USE_MATH_DEFINES
+#endif
+
+#include <float.h>
+
+//
+// Get scalar limits of operation
+//
+
+template<class T> __inline__ __host__ __device__ T get_min();     // specialized below as FLT_MIN / DBL_MIN
+template<class T> __inline__ __host__ __device__ T get_max();     // specialized below as FLT_MAX / DBL_MAX
+template<class T> __inline__ __host__ __device__ T get_epsilon(); // specialized below as FLT_EPSILON / DBL_EPSILON
+
+//
+// Math prototypes
+//
+// get_pi<REAL>() returns pi in the requested precision; qualified __host__ __device__ to match its specializations below.
+template<class REAL> __inline__ __host__ __device__ REAL get_pi();
+
+//
+// Implementation
+//
+
+template<> __inline__ __host__ __device__ float get_min<float>()
+{
+  return FLT_MIN; // smallest positive normalized float - NOT the most negative value
+}
+
+template<> __inline__ __host__ __device__ double get_min<double>()
+{
+  return DBL_MIN; // smallest positive normalized double - NOT the most negative value
+}
+
+template<> __inline__ __host__ __device__ float get_max<float>()
+{
+  return FLT_MAX; // largest finite float
+}
+
+template<> __inline__ __host__ __device__ double get_max<double>()
+{
+  return DBL_MAX; // largest finite double
+}
+
+template<> __inline__ __host__ __device__ float get_epsilon<float>()
+{
+  return FLT_EPSILON; // difference between 1 and the next representable float
+}
+
+template<> __inline__ __host__ __device__ double get_epsilon<double>()
+{
+  return DBL_EPSILON; // difference between 1 and the next representable double
+}
+
+template<> __inline__ __host__ __device__ float get_pi(){ return (float)M_PI; } // M_PI narrowed to single precision
+template<> __inline__ __host__ __device__ double get_pi(){ return M_PI; }
+
+template <typename T> __inline__ __host__ __device__ int sgn(T val) {
+    return int(T(0) < val) - int(val < T(0)); // +1 for positive, -1 for negative, 0 otherwise
+}
diff --git a/toolboxes/core/vector_td.h b/toolboxes/core/vector_td.h
new file mode 100644
index 0000000..0205a85
--- /dev/null
+++ b/toolboxes/core/vector_td.h
@@ -0,0 +1,293 @@
+/** \file vector_td.h
+    \brief The class vector_td defines a D-dimensional vector of type T.
+
+    The class vector_td defines a D-dimensional vector of type T.
+    It is used in the Gadgetron to represent short vectors.
+    I.e. it is purposely templated with dimensionality D as type unsigned int instead of size_t.
+    For larger vectors consider using the NDArray class instead (or a std::vector).
+    The vector_td class can be used on both the cpu and gpu.
+    The accompanying headers vector_td_operators.h and vector_td_utilities.h define most of the functionality.
+    Note that vector_td should not be used to represent complex numbers. For that we provide the custom class complext instead.
+*/
+
+#pragma once
+
+#include "core_defines.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class vector_td
+  {
+  public:
+    // Fixed-size storage for the D components.
+    T vec[D];
+
+    // Default constructor; components are left uninitialized. The 1- to 4-dimensional
+    // specializations already provide one, and "vector_td<T,D> res;" needs it for any other D.
+    __inline__ __host__ __device__ vector_td(){}
+
+    // Copy constructor: copies all D components.
+    __inline__ __host__ __device__ vector_td(const vector_td & other){
+      for (unsigned int i = 0; i < D; i++) vec[i] = other[i];
+    }
+
+    // Explicit conversion from a vector with another component type (each component is cast to T).
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,D> & other){
+      for (unsigned int i = 0; i < D; i++) vec[i] = (T) other[i];
+    }
+
+    // Fill constructor: sets every component to x.
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      for (unsigned int i = 0; i < D; i++) vec[i] = x;
+    }
+
+    // Component access without bounds checking; operator() is a synonym for operator[].
+    __inline__ __host__ __device__ T& operator[](const unsigned int i){
+      return vec[i];
+    }
+    __inline__ __host__ __device__ const T& operator[](const unsigned int i) const {
+      return vec[i];
+    }
+    __inline__ __host__ __device__ T& operator()(const unsigned int i){
+      return vec[i];
+    }
+    __inline__ __host__ __device__ const T& operator()(const unsigned int i) const {
+      return vec[i];
+    }
+  };
+
+  //
+  // Some typedefs for convenience (templated typedefs are not (yet) available in C++)
+  //
+
+  // reald<REAL,D>::Type names vector_td<REAL,D>.
+  template<class REAL, unsigned int D>
+  struct reald { typedef vector_td<REAL,D> Type; };
+
+  // uintd<D>::Type names vector_td<unsigned int,D>.
+  template<unsigned int D>
+  struct uintd { typedef vector_td<unsigned int,D> Type; };
+
+  // uint64d<D>::Type names vector_td<size_t,D>.
+  template<unsigned int D>
+  struct uint64d { typedef vector_td<size_t,D> Type; };
+
+  // intd<D>::Type names vector_td<int,D>.
+  template<unsigned int D>
+  struct intd { typedef vector_td<int,D> Type; };
+
+  // int64d<D>::Type names vector_td<long long,D>.
+  template<unsigned int D>
+  struct int64d { typedef vector_td<long long,D> Type; };
+
+  // floatd<D>::Type is reald<float,D>::Type.
+  template<unsigned int D>
+  struct floatd { typedef typename reald<float,D>::Type Type; };
+
+  // doubled<D>::Type is reald<double,D>::Type.
+  template<unsigned int D>
+  struct doubled { typedef typename reald<double,D>::Type Type; };
+
+  template<class T> class vector_td<T,1>
+  {
+  public:
+    // Storage for the single component.
+    T vec[1];
+
+    // Default constructor; the component is left uninitialized.
+    __inline__ __host__ __device__ vector_td(){}
+
+    // Construction from a scalar; deliberately not explicit so that a T converts implicitly.
+    __inline__ __host__ __device__ vector_td(T x){ vec[0] = x; }
+
+    // Copy constructor.
+    __inline__ __host__ __device__ vector_td(const vector_td & rhs){ vec[0] = rhs[0]; }
+
+    // Explicit conversion from a 1D vector with another component type.
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,1> & rhs){
+      vec[0] = T(rhs[0]);
+    }
+
+    // Component access without bounds checking; operator() is a synonym for operator[].
+    __inline__ __host__ __device__ T& operator[](const unsigned int idx){
+      return vec[idx];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int idx) const {
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ T& operator()(const unsigned int idx){
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ const T& operator()(const unsigned int idx) const {
+      return vec[idx];
+    }
+  };
+
+  template<class T> class vector_td<T,2>
+  {
+  public:
+    // Storage for the two components.
+    T vec[2];
+
+    // Default constructor; components are left uninitialized.
+    __inline__ __host__ __device__ vector_td(){}
+
+    // Component-wise constructor.
+    __inline__ __host__ __device__ vector_td(T x, T y){
+      vec[0] = x; vec[1] = y;
+    }
+
+    // Fill constructor: both components are set to x.
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      vec[0] = x; vec[1] = x;
+    }
+
+    // Copy constructor.
+    __inline__ __host__ __device__ vector_td(const vector_td & rhs){
+      vec[0] = rhs[0]; vec[1] = rhs[1];
+    }
+
+    // Explicit conversion from a 2D vector with another component type.
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,2> & rhs){
+      vec[0] = T(rhs[0]); vec[1] = T(rhs[1]);
+    }
+
+    // Component access without bounds checking; operator() is a synonym for operator[].
+    __inline__ __host__ __device__ T& operator[](const unsigned int idx){
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ const T& operator[](const unsigned int idx) const {
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ T& operator()(const unsigned int idx){
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ const T& operator()(const unsigned int idx) const {
+      return vec[idx];
+    }
+  };
+
+  template<class T> class vector_td<T,3>
+  {
+  public:
+    // Storage for the three components.
+    T vec[3];
+
+    // Default constructor; components are left uninitialized.
+    __inline__ __host__ __device__ vector_td(){}
+
+    // Component-wise constructor.
+    __inline__ __host__ __device__ vector_td(T x, T y,T z){
+      vec[0] = x; vec[1] = y; vec[2] = z;
+    }
+
+    // Fill constructor: all three components are set to x.
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      vec[0] = x; vec[1] = x; vec[2] = x;
+    }
+
+    // Copy constructor.
+    __inline__ __host__ __device__ vector_td(const vector_td & rhs){
+      vec[0] = rhs[0]; vec[1] = rhs[1]; vec[2] = rhs[2];
+    }
+
+    // Explicit conversion from a 3D vector with another component type.
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,3> & rhs){
+      vec[0] = T(rhs[0]); vec[1] = T(rhs[1]); vec[2] = T(rhs[2]);
+    }
+
+    // Component access without bounds checking; operator() is a synonym for operator[].
+    __inline__ __host__ __device__ T& operator[](const unsigned int idx){
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ const T& operator[](const unsigned int idx) const {
+      return vec[idx];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int idx){
+      return vec[idx];
+    }
+    __inline__ __host__ __device__ const T& operator()(const unsigned int idx) const {
+      return vec[idx];
+    }
+  };
+
+  template<class T> class vector_td<T,4>
+  {
+  public:
+    // Storage for the four components.
+    T vec[4];
+
+    // Default constructor; components are left uninitialized.
+    __inline__ __host__ __device__ vector_td(){}
+
+    // Component-wise constructor.
+    __inline__ __host__ __device__ vector_td(T x, T y,T z,T w){
+      vec[0] = x; vec[1] = y;
+      vec[2] = z; vec[3] = w;
+    }
+
+    // Fill constructor: all four components are set to x.
+    __inline__ __host__ __device__ explicit vector_td(T x){
+      vec[0] = x; vec[1] = x; vec[2] = x; vec[3] = x;
+    }
+
+    // Copy constructor.
+    __inline__ __host__ __device__ vector_td(const vector_td & rhs){
+      vec[0] = rhs[0]; vec[1] = rhs[1]; vec[2] = rhs[2]; vec[3] = rhs[3];
+    }
+
+    // Explicit conversion from a 4D vector with another component type.
+    template<class T2> __inline__ __host__ __device__ explicit vector_td(const vector_td<T2,4> & rhs){
+      vec[0] = T(rhs[0]); vec[1] = T(rhs[1]); vec[2] = T(rhs[2]); vec[3] = T(rhs[3]);
+    }
+
+    // Component access without bounds checking; operator() is a synonym for operator[].
+    __inline__ __host__ __device__ T& operator[](const unsigned int idx){
+      return vec[idx];
+    }
+
+    __inline__ __host__ __device__ const T& operator[](const unsigned int idx) const {
+      return vec[idx];
+    }
+
+    __inline__ __host__ __device__ T& operator()(const unsigned int idx){
+      return vec[idx];
+    }
+
+    __inline__ __host__ __device__ const T& operator()(const unsigned int idx) const {
+      return vec[idx];
+    }
+  };
+
+  typedef vector_td<unsigned int,1> uintd1; // unsigned int vectors, 1 to 4 components
+  typedef vector_td<unsigned int,2> uintd2;
+  typedef vector_td<unsigned int,3> uintd3;
+  typedef vector_td<unsigned int,4> uintd4;
+
+  typedef vector_td<size_t,1> uint64d1; // size_t vectors, 1 to 4 components
+  typedef vector_td<size_t,2> uint64d2;
+  typedef vector_td<size_t,3> uint64d3;
+  typedef vector_td<size_t,4> uint64d4;
+
+  typedef vector_td<int,1> intd1; // int vectors, 1 to 4 components
+  typedef vector_td<int,2> intd2;
+  typedef vector_td<int,3> intd3;
+  typedef vector_td<int,4> intd4;
+
+  typedef vector_td<long long,1> int64d1; // long long vectors, 1 to 4 components
+  typedef vector_td<long long,2> int64d2;
+  typedef vector_td<long long,3> int64d3;
+  typedef vector_td<long long,4> int64d4;
+
+  typedef vector_td<float,1> floatd1; // float vectors, 1 to 4 components
+  typedef vector_td<float,2> floatd2;
+  typedef vector_td<float,3> floatd3;
+  typedef vector_td<float,4> floatd4;
+
+  typedef vector_td<double,1> doubled1; // double vectors, 1 to 4 components
+  typedef vector_td<double,2> doubled2;
+  typedef vector_td<double,3> doubled3;
+  typedef vector_td<double,4> doubled4;
+}
diff --git a/toolboxes/core/vector_td_io.h b/toolboxes/core/vector_td_io.h
new file mode 100644
index 0000000..a70cc94
--- /dev/null
+++ b/toolboxes/core/vector_td_io.h
@@ -0,0 +1,49 @@
+/** \file vector_td_io.h
+    \brief Basic iostream "communication" using the vector_td class
+*/
+
+#pragma once
+
+#include "vector_td.h"
+
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> ::std::ostream& operator<<(std::ostream& os, const vector_td<T,D>& vec) { // prints "[v0, v1, ..., vD-1]"
+    os <<'[' ;
+    for (unsigned int i = 0; i < D-1; i++) os << vec[i] << ", ";  // unsigned index matches D: no signed/unsigned comparison
+    return os << vec[D-1] <<']';
+  }
+
+  // Reads a vector written as "[v0, v1, ..., vD-1]"; sets failbit on a missing bracket/comma or a truncated stream.
+  template<class T, unsigned int D> std::istream& operator>>(std::istream& is, vector_td<T,D>& vec) {
+    char tmp = ' ';  // initialized so that a failed get() is rejected below instead of reading an indeterminate value
+    is.get(tmp);
+    if (tmp != '['){
+      is.setstate(std::ios::failbit);
+      return is;
+    }
+    for (unsigned int i = 0; i < D-1; i++){
+      T val;
+      tmp = ' ';
+      is >> val;
+      vec[i]=val;
+      while (tmp == ' ' && is.get(tmp)) {}  // skip blanks, but stop at EOF/error instead of looping forever
+      if (tmp != ','){
+	is.setstate(std::ios::failbit);
+	return is;
+      }
+    }
+    tmp = ' ';
+    is >> vec[D-1];
+    while (tmp == ' ' && is.get(tmp)) {}  // same guarded skip before the closing bracket
+    if (tmp != ']'){
+      is.setstate(std::ios::failbit);
+      return is;
+    }
+    return is;
+  }
+}
diff --git a/toolboxes/core/vector_td_operators.h b/toolboxes/core/vector_td_operators.h
new file mode 100644
index 0000000..5048050
--- /dev/null
+++ b/toolboxes/core/vector_td_operators.h
@@ -0,0 +1,435 @@
+/** \file vector_td_operators.h
+    \brief Common operators for the vector_td class
+*/
+
+#pragma once
+
+#include "vector_td.h"
+#include "core_defines.h"
+
+namespace Gadgetron{
+
+  //
+  // Return types
+  //
+
+  template <class T, class I> struct vectorTDReturnType {}; // primary has no ::type, so only the pairs listed here are usable
+  template <class T> struct vectorTDReturnType<T,T> { typedef T type; };
+  template<> struct vectorTDReturnType<unsigned int, int>    { typedef int type; };
+  template<> struct vectorTDReturnType<int, unsigned int>    { typedef int type; };
+  template<> struct vectorTDReturnType<int, bool>            { typedef int type; };
+  template<> struct vectorTDReturnType<bool, int>            { typedef int type; };
+  template<> struct vectorTDReturnType<unsigned int, bool>   { typedef int type; };
+  template<> struct vectorTDReturnType<bool, unsigned int>   { typedef int type; };
+  template<> struct vectorTDReturnType<float, unsigned int>  { typedef float type; };
+  template<> struct vectorTDReturnType<unsigned int, float>  { typedef float type; };
+  template<> struct vectorTDReturnType<float, int>           { typedef float type; };
+  template<> struct vectorTDReturnType<int, float>           { typedef float type; };
+  template<> struct vectorTDReturnType<float, bool>          { typedef float type; };
+  template<> struct vectorTDReturnType<bool, float>          { typedef float type; };
+  template<> struct vectorTDReturnType<double, unsigned int> { typedef double type; };
+  template<> struct vectorTDReturnType<unsigned int, double> { typedef double type; };
+  template<> struct vectorTDReturnType<double, int>          { typedef double type; };
+  template<> struct vectorTDReturnType<int, double>          { typedef double type; };
+  template<> struct vectorTDReturnType<double, bool>         { typedef double type; };
+  template<> struct vectorTDReturnType<bool, double>         { typedef double type; };
+  template<> struct vectorTDReturnType<double, float>        { typedef double type; };
+  template<> struct vectorTDReturnType<float, double>        { typedef double type; };
+
+  //
+  // Operators are defined as component wise operations.
+  //
+
+  //
+  // Arithmetic operators
+  //
+
+  // Adds v2 to v1 component by component, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator+= ( vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] += v2[d];
+  }
+
+  // Adds the scalar v2 to every component of v1, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator+= ( vector_td<T,D> &v1, const R &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] += v2;
+  }
+
+  // Subtracts v2 from v1 component by component, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator-= ( vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] -= v2[d];
+  }
+
+
+  // Multiplies every component of v1 by the scalar v2, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator*= ( vector_td<T,D> &v1, const R &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] *= v2;
+  }
+
+  // Multiplies v1 by v2 component by component, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator *=  ( vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] *= v2[d];
+  }
+
+  // Divides every component of v1 by the scalar v2, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator /= ( vector_td<T,D> &v1, const R &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] /= v2;
+  }
+
+  // Divides v1 by v2 component by component, in place.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void operator /=  ( vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] /= v2[d];
+  }
+
+  // Named form of the component-wise in-place division: v1[d] /= v2[d] for every d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  void component_wise_div_eq ( vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] /= v2[d];
+  }
+
+  // Component-wise sum of two vectors; the component type is vectorTDReturnType<T,R>::type.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator+ ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> sum;
+    for( unsigned int d=0; d<D; ++d ) sum[d] = v1[d] + v2[d];
+    return sum;
+  }
+
+  // Vector plus scalar: adds v2 to every component of v1.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator+ ( const vector_td<T,D> &v1, const R &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> sum;
+    for( unsigned int d=0; d<D; ++d ) sum[d] = v1[d] + v2;
+    return sum;
+  }
+
+  // Vector minus scalar: subtracts v2 from every component of v1.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator- ( const vector_td<T,D> &v1, const R &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> diff;
+    for( unsigned int d=0; d<D; ++d ) diff[d] = v1[d] - v2;
+    return diff;
+  }
+
+  // Scalar plus vector: forwards to the vector + scalar overload with the operands swapped.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator+ (const R &v2, const vector_td<T,D> &v1 ) {
+    return v1 + v2;
+  }
+
+  // Component-wise difference of two vectors; the component type is vectorTDReturnType<T,R>::type.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator- ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> diff;
+    for( unsigned int d=0; d<D; ++d ) diff[d] = v1[d] - v2[d];
+    return diff;
+  }
+
+  // Unary minus: negates every component.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> operator- ( const vector_td<T,D> &v1) {
+    vector_td<T,D> negated;
+    for( unsigned int d=0; d<D; ++d ) negated[d] = -v1[d];
+    return negated;
+  }
+
+  // Component-wise product for mixed component types; result type is vectorTDReturnType<T,R>::type.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> component_wise_mul ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> prod;
+    for( unsigned int d=0; d<D; ++d ) prod[d] = v1[d] * v2[d];
+    return prod;
+  }
+
+  // Component-wise product for operands that share the component type T.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> component_wise_mul ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    vector_td<T,D> prod;
+    for( unsigned int d=0; d<D; ++d ) prod[d] = v1[d] * v2[d];
+    return prod;
+  }
+
+  // Vector times vector, taken component by component (not a dot product).
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator* ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> prod;
+    for( unsigned int d=0; d<D; ++d ) prod[d] = v1[d] * v2[d];
+    return prod;
+  }
+
+  // Vector times scalar: scales every component of v1 by v2.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator* ( const vector_td<T,D> &v1, const R &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> scaled;
+    for( unsigned int d=0; d<D; ++d ) scaled[d] = v1[d] * v2;
+    return scaled;
+  }
+
+  // Scalar times vector: forwards to the vector * scalar overload with the operands swapped.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator* ( const R &v1, const vector_td<T,D> &v2 ) {
+    return v2 * v1;
+  }
+
+  // Vector divided by scalar: divides every component of v1 by v2.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator/ ( const vector_td<T,D> &v1, const R &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> quot;
+    for( unsigned int d=0; d<D; ++d ) quot[d] = v1[d] / v2;
+    return quot;
+  }
+
+  // Component-wise quotient of two vectors; the component type is vectorTDReturnType<T,R>::type.
+  template< class T, class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> operator/ ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<typename vectorTDReturnType<T,R>::type,D> res; // not "res = v1": that needs the explicit converting ctor when T is not the result type
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = v1.vec[i]/v2.vec[i];
+    return res;
+  }
+
+  // Named form of the component-wise division; forwards to operator/ on the two vectors.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<typename vectorTDReturnType<T,R>::type,D> component_wise_div ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    return v1 / v2;
+  }
+
+  // 
+  // "Strong" comparison operators
+  //
+
+  // True only if every pair of components compares equal.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  bool operator== ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] == v2[d]) ) return false; }
+    return true;
+  }
+
+  // True as soon as any pair of components differs.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  bool operator!= ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] != v2[d] ) return true; }
+    return false;
+  }
+
+  // True only if (v1[d] && v2[d]) holds for every component d.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  bool operator&& ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] && v2[d]) ) return false; }
+    return true;
+  }
+
+  // True only if (v1[d] || v2[d]) holds for every component d.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  bool operator|| ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] || v2[d]) ) return false; }
+    return true;
+  }
+
+  // True only if v1[d] < v2[d] for every component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool operator< ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] < v2[d]) ) return false; }
+    return true;
+  }
+
+  // True only if v1[d] <= v2[d] for every component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool operator<= ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] <= v2[d]) ) return false; }
+    return true;
+  }
+
+  // True only if v1[d] > v2[d] for every component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool operator> ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] > v2[d]) ) return false; }
+    return true;
+  }
+
+  // True only if v1[d] >= v2[d] for every component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool operator>= ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( !(v1[d] >= v2[d]) ) return false; }
+    return true;
+  }
+
+  //
+  // "Weak" comparison "operators"
+  //
+
+  // True if at least one pair of components compares equal.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] == v2[d] ) return true; }
+    return false;
+  }
+
+  // True if at least one pair of components differs.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_not_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] != v2[d] ) return true; }
+    return false;
+  }
+
+  // True if (v1[d] && v2[d]) holds for at least one component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_and ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] && v2[d] ) return true; }
+    return false;
+  }
+
+  // True if (v1[d] || v2[d]) holds for at least one component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_or ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] || v2[d] ) return true; }
+    return false;
+  }
+
+  // True if v1[d] < v2[d] for at least one component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_less ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] < v2[d] ) return true; }
+    return false;
+  }
+
+  // True if v1[d] <= v2[d] for at least one component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_less_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] <= v2[d] ) return true; }
+    return false;
+  }
+
+  // True if v1[d] > v2[d] for at least one component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_greater ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] > v2[d] ) return true; }
+    return false;
+  }
+
+  // True if v1[d] >= v2[d] for at least one component d.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  bool weak_greater_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ){ if( v1[d] >= v2[d] ) return true; }
+    return false;
+  }
+
+  //
+  // Vector comparison "operators"
+  //
+
+  // Per-component equality mask: res[i] is true where v1[i] == v2[i]. v2 uses R so that both template types are deducible.
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> res; // must be bool: vector_td<T,D> does not convert implicitly to the declared return type
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] == v2.vec[i]);
+    return res;
+  }
+
+  // Per-component inequality mask: res[i] is true where v1[i] != v2[i].
+  template< class T,class R, unsigned int D > __inline__ __host__ __device__
+  vector_td<bool,D> vector_not_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> res; // must be bool: vector_td<T,D> does not convert implicitly to the declared return type
+    for(unsigned int i=0; i<D; i++ ) res.vec[i] = (v1.vec[i] != v2.vec[i]);
+    return res;
+  }
+
+  // Per-component logical AND. NOTE(review): returns vector_td<T,D>, unlike the bool-valued vector_* siblings - confirm intent.
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> vector_and ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<T,D> mask;
+    for( unsigned int d=0; d<D; ++d ) mask[d] = (v1[d] && v2[d]);
+    return mask;
+  }
+
+  // Per-component logical OR mask: mask[d] is (v1[d] || v2[d]).
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<bool,D> vector_or ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> mask;
+    for( unsigned int d=0; d<D; ++d ) mask[d] = (v1[d] || v2[d]);
+    return mask;
+  }
+
+  // Per-component mask: mask[d] is (v1[d] < v2[d]).
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<bool,D> vector_less ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> mask;
+    for( unsigned int d=0; d<D; ++d ) mask[d] = (v1[d] < v2[d]);
+    return mask;
+  }
+
+  // Per-component mask: mask[d] is (v1[d] <= v2[d]).
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<bool,D> vector_less_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> mask;
+    for( unsigned int d=0; d<D; ++d ) mask[d] = (v1[d] <= v2[d]);
+    return mask;
+  }
+
+  // Per-component mask: mask[d] is (v1[d] > v2[d]).
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<bool,D> vector_greater ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> mask;
+    for( unsigned int d=0; d<D; ++d ) mask[d] = (v1[d] > v2[d]);
+    return mask;
+  }
+
+  // Per-component mask: mask[d] is (v1[d] >= v2[d]).
+  template<class T, class R, unsigned int D> __inline__ __host__ __device__
+  vector_td<bool,D> vector_greater_equal ( const vector_td<T,D> &v1, const vector_td<R,D> &v2 ) {
+    vector_td<bool,D> mask;
+    for( unsigned int d=0; d<D; ++d ) mask[d] = (v1[d] >= v2[d]);
+    return mask;
+  }
+
+  //
+  // Integer only operators
+  //
+
+  // Left-shifts every component of v1 by the same bit count, in place.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  void operator<<= ( vector_td<T,D> &v1, size_t shifts ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] <<= shifts;
+  }
+
+  // Right-shifts every component of v1 by the same bit count, in place.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  void operator>>= ( vector_td<T,D> &v1, size_t shifts ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] >>= shifts;
+  }
+
+  // Returns a copy of v1 with every component left-shifted by shifts bits (via operator<<=).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> operator<< ( const vector_td<T,D> &v1, size_t shifts ) {
+    vector_td<T,D> shifted(v1);
+    shifted <<= shifts;
+    return shifted;
+  }
+
+  // Returns a copy of v1 with every component right-shifted by shifts bits (via operator>>=).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> operator>> ( const vector_td<T,D> &v1, size_t shifts ) {
+    vector_td<T,D> shifted(v1);
+    shifted >>= shifts;
+    return shifted;
+  }
+
+  // Component-wise in-place remainder: v1[d] %= v2[d] for every d.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  void operator%= ( vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    for( unsigned int d=0; d<D; ++d ) v1[d] %= v2[d];
+  }
+
+  // Returns the component-wise remainder of v1 by v2 (via operator%=).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> operator% ( const vector_td<T,D> &v1, const vector_td<T,D> &v2 ) {
+    vector_td<T,D> rem(v1);
+    rem %= v2;
+    return rem;
+  }
+}
diff --git a/toolboxes/core/vector_td_utilities.h b/toolboxes/core/vector_td_utilities.h
new file mode 100644
index 0000000..1380290
--- /dev/null
+++ b/toolboxes/core/vector_td_utilities.h
@@ -0,0 +1,482 @@
+/** \file vector_td_utilities.h
+    \brief Utility functions for the vector_td class.
+
+    The class vector_td defines a D-dimensional vector of type T.
+    It is used in the Gadgetron to represent small (one- to four-dimensional) vectors only.
+    For larger vectors consider using the NDArray class instead.
+    The vector_td class can be used on both the cpu and gpu.
+    The accompanying headers vector_td_operators.h and vector_td_utilities.h define most of the functionality.
+*/
+
+#pragma once
+
+#include "vector_td.h"
+#include "vector_td_operators.h"
+#include "real_utilities.h"
+#include "core_defines.h"
+
+#include <float.h>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+#ifdef max
+#undef max
+#endif
+
+#ifdef min
+#undef min
+#endif
+
+#ifndef __CUDA_ARCH__ // workaround for nvcc
+using std::ceil;  
+using std::abs;   
+using std::floor; 
+using std::sqrt;
+#endif
+
+namespace Gadgetron{
+
+  // Windows/Cuda has some issues when using min and max.
+  // For now we define our own implementation
+
+  template <class T> __inline__ __host__ __device__ const T& _vector_td_min (const T& a, const T& b) { // b if a>b, otherwise a
+    if (a>b) return b; else return a;
+  }
+  template <class T> __inline__ __host__ __device__ const T& _vector_td_max (const T& a, const T& b) { // b if a<b, otherwise a
+    if (a<b) return b; else return a;
+  }
+
+  //
+  // Get/set operations on vector_td<T,D>
+  //
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ T // copy of component dim (no bounds check)
+  get( const vector_td<T,D>& vec, unsigned int dim ) { return vec.vec[dim]; }
+
+  template<class T, unsigned int D> __inline__ __host__ __device__ void // stores val in component dim (no bounds check)
+  set( vector_td<T,D> &vec, unsigned int dim, T val ) { vec.vec[dim] = val; }
+
+  //
+  // In-place operations
+  //
+
+  // Overwrites every component of vec with val (T(0) when val is omitted).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  void clear( vector_td<T,D> &vec, const T &val = T(0) ) {
+    for (unsigned int d=0; d<D; ++d) {
+      vec[d] = val;
+    }
+  }
+  
+  //
+  // Component-wise math operations
+  //
+
+  // Component-wise absolute value, computed with std::abs.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> abs( const vector_td<T,D>& vec ) {
+    vector_td<T,D> out;
+    for (unsigned int d=0; d<D; ++d) {
+      out[d] = std::abs(vec[d]);
+    }
+    return out;
+  }
+
+  // Component-wise sign of vec, using the scalar ::sgn from real_utilities.h.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<int,D> sgn( const vector_td<T,D>& vec ) {
+    vector_td<int,D> res;
+    for (unsigned int i=0; i<D; i++) {
+      res[i] = ::sgn(vec[i]); // qualified: an unqualified sgn finds only this vector overload, which hides the global scalar one
+    }
+    return res;
+  }
+
+  // Component-wise ceiling, computed with the global ::ceil.
+  template<class REAL, unsigned int D> __inline__ __host__ __device__
+  vector_td<REAL,D> ceil( const vector_td<REAL,D> vec ) {
+    vector_td<REAL,D> out;
+    for (unsigned int d=0; d<D; ++d) {
+      out[d] = ::ceil(vec[d]);
+    }
+    return out;
+  }
+
+  // Component-wise floor, computed with the global ::floor.
+  template<class REAL, unsigned int D> __inline__ __host__ __device__
+  vector_td<REAL,D> floor( const vector_td<REAL,D> vec ) {
+    vector_td<REAL,D> out;
+    for (unsigned int d=0; d<D; ++d) {
+      out[d] = ::floor(vec[d]);
+    }
+    return out;
+  }
+
+
+  //
+  // Grid <-> index transformations
+  //
+
+  // Splits the linear index idx into one coordinate per dimension of a grid of size dims.
+  // Dimension 0 is peeled off first (co[0] = idx % dims[0]); unsigned int variant.
+  template<unsigned int D> __inline__ __host__ __device__
+  typename uintd<D>::Type idx_to_co( unsigned int idx, const vector_td<unsigned,D> dims ) {
+    typename uintd<D>::Type co;
+    unsigned int remaining = idx;
+    for (unsigned int d=0; d<D; ++d) {
+      co[d] = remaining % dims[d];
+      remaining = (remaining - co[d]) / dims[d];
+    }
+    return co;
+  }
+
+  // Splits the linear index idx into one coordinate per dimension of a grid of size dims.
+  // Dimension 0 is peeled off first (co[0] = idx % dims[0]); size_t variant.
+  template<unsigned int D> __inline__ __host__ __device__
+  typename uint64d<D>::Type idx_to_co( size_t idx, const vector_td<size_t,D> dims ) {
+    typename uint64d<D>::Type co;
+    size_t remaining = idx;
+    for (unsigned int d=0; d<D; ++d) {
+      co[d] = remaining % dims[d];
+      remaining = (remaining - co[d]) / dims[d];
+    }
+    return co;
+  }
+
+  // Splits the linear index idx into one coordinate per dimension of a grid of size dims.
+  // Dimension 0 is peeled off first (co[0] = idx % dims[0]); int variant.
+  template<unsigned int D> __inline__ __host__ __device__
+  typename intd<D>::Type idx_to_co( int idx, const vector_td<int,D> dims ) {
+    typename intd<D>::Type co;
+    int remaining = idx;
+    for (unsigned int d=0; d<D; ++d) {
+      co[d] = remaining % dims[d];
+      remaining = (remaining - co[d]) / dims[d];
+    }
+    return co;
+  }
+
+  // Splits the linear index idx into one coordinate per dimension of a grid of size dims.
+  // Dimension 0 is peeled off first (co[0] = idx % dims[0]); long long variant.
+  template<unsigned int D> __inline__ __host__ __device__
+  typename int64d<D>::Type idx_to_co( long long idx, const vector_td<long long,D> dims ) {
+    typename int64d<D>::Type co;
+    long long remaining = idx;
+    for (unsigned int d=0; d<D; ++d) {
+      co[d] = remaining % dims[d];
+      remaining = (remaining - co[d]) / dims[d];
+    }
+    return co;
+  }
+
+  // Linear index of coordinate co in a grid of size dims: sum of co[d] times the product of dims[0..d-1] (unsigned int).
+  template<unsigned int D> __inline__ __host__ __device__
+  unsigned int co_to_idx( const vector_td<unsigned int,D> co, const vector_td<unsigned int,D> dims ) {
+    unsigned int linear = 0;
+    unsigned int stride = 1;
+    for (unsigned int d=0; d<D; ++d) {
+      linear += (stride*co[d]);
+      stride *= dims[d];
+    }
+    return linear;
+  }
+
+  // Linear index of coordinate co in a grid of size dims: sum of co[d] times the product of dims[0..d-1] (size_t).
+  template<unsigned int D> __inline__ __host__ __device__
+  size_t co_to_idx( const vector_td< size_t,D> co, const vector_td<size_t,D> dims ) {
+    size_t linear = 0;
+    size_t stride = 1;
+    for (unsigned int d=0; d<D; ++d) {
+      linear += (stride*co[d]);
+      stride *= dims[d];
+    }
+    return linear;
+  }
+
+  // Linear index of coordinate co in a grid of size dims: sum of co[d] times the product of dims[0..d-1] (int).
+  template<unsigned int D> __inline__ __host__ __device__
+  int co_to_idx( const vector_td<int,D> co, const vector_td<int,D> dims ) {
+    int linear = 0;
+    int stride = 1;
+    for (unsigned int d=0; d<D; ++d) {
+      linear += (stride*co[d]);
+      stride *= dims[d];
+    }
+    return linear;
+  }
+
+  // Linear index of coordinate co in a grid of size dims: sum of co[d] times the product of dims[0..d-1] (long long).
+  template<unsigned int D> __inline__ __host__ __device__
+  long long co_to_idx( const vector_td<long long,D> co, const vector_td<long long,D> dims ) {
+    long long linear = 0;
+    long long stride = 1;
+    for (unsigned int d=0; d<D; ++d) {
+      linear += (stride*co[d]);
+      stride *= dims[d];
+    }
+    return linear;
+  }
+  
+  // Linear offset of `co` inside `dims` when dimensions are traversed as listed in `order` (order[0] fastest).
+  template<unsigned int D> __inline__ __host__ __device__ 
+  unsigned int co_to_idx( const vector_td<unsigned int,D> co, 
+                          const vector_td<unsigned int,D> dims, 
+                          const vector_td<unsigned int,D> order )
+  {
+    unsigned int offset = 0, stride = 1;
+    for (unsigned int k=0; k<D; ++k){
+      offset += stride*co.d[order[k]];
+      stride *= dims.d[order[k]];
+    }
+    return offset;
+  } 
+
+  // size_t flavour: offset of `co` inside `dims`, dimensions traversed as listed in `order` (order[0] fastest).
+  template<unsigned int D> __inline__ __host__ __device__ 
+  size_t co_to_idx( const vector_td<size_t,D> co, 
+                    const vector_td<size_t,D> dims, 
+                    const vector_td<unsigned int,D> order )
+  {
+    size_t offset = 0, stride = 1;
+    for (unsigned int k=0; k<D; ++k){
+      offset += stride*co.d[order[k]];
+      stride *= dims.d[order[k]];
+    }
+    return offset;
+  } 
+
+  // int flavour: offset of `co` inside `dims`, dimensions traversed as listed in `order` (order[0] fastest).
+  template<unsigned int D> __inline__ __host__ __device__ // D is unsigned like every sibling overload, so it can be deduced
+  int co_to_idx( const vector_td<int,D> co, 
+                 const vector_td<int,D> dims, 
+                 const vector_td<unsigned int,D> order )
+  {
+    int idx = 0;
+    int block_size = 1;
+    for (unsigned int i=0; i<D; i++){
+      idx += (block_size*co.d[order[i]]);
+      block_size *= dims.d[order[i]];
+    }
+    return idx;
+
+  // long long flavour: offset of `co` inside `dims`, dimensions traversed as listed in `order` (order[0] fastest).
+  template<unsigned int D> __inline__ __host__ __device__ 
+  long long co_to_idx( const vector_td<long long,D> co, 
+                       const vector_td<long long,D> dims, 
+                       const vector_td<unsigned int,D> order )
+  {
+    long long offset = 0, stride = 1;
+    for (unsigned int k=0; k<D; ++k){
+      offset += stride*co.d[order[k]];
+      stride *= dims.d[order[k]];
+    }
+    return offset;
+  } 
+
+  // Builds the vector (0, 1, ..., D-1), each entry converted to T.
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  vector_td<T,D> counting_vec()
+  {
+    vector_td<T,D> ramp;
+    for(unsigned int k=0; k<D; ++k)
+      ramp[k] = T(k);
+    return ramp;
+  }
+
+  //
+  // Conversion between vector_td and std::vector
+  //
+
+  // Copies the D components of vec into a std::vector of length D.
+  template<class T, unsigned int D> inline
+  std::vector<T> to_std_vector( vector_td<T,D> vec )
+  {
+    std::vector<T> result(D);
+    for(unsigned int k=0; k<D; ++k ) result[k] = vec[k];
+    return result;
+  }
+
+  // Builds a vector_td from the first D entries of _vector.
+  // Components at or beyond _vector.size() are set to T(1).
+  template<class T, unsigned int D> inline
+  vector_td<T,D> from_std_vector( std::vector<T> _vector )
+  {
+    vector_td<T,D> result;
+    for( unsigned int k=0; k<D; ++k ){
+      // pad with ones when the std::vector is shorter than D
+      result[k] = ( k<_vector.size() ) ? _vector[k] : T(1);
+    }
+    return result;
+  }
+
+  //
+  // Reductions on vector_td<T,D>
+  //
+
+  // Product of all D components of vec.
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  T prod( const vector_td<T,D>& vec )
+  {
+    T acc = vec[0];
+    for (unsigned int k=1; k<D; ++k)
+      acc *= vec[k];
+    return acc;
+  }
+
+  // Sum of all D components of vec.
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  T sum( const vector_td<T,D>& vec )
+  {
+    T acc = vec[0];
+    for (unsigned int k=1; k<D; ++k)
+      acc += vec[k];
+    return acc;
+  }
+
+  // Dot product: sum over all components of vec1[k]*vec2[k].
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  T dot( const vector_td<T,D>& vec1, const vector_td<T,D>& vec2 )
+  {
+    T acc = vec1[0]*vec2[0];
+    for (unsigned int k=1; k<D; ++k)
+      acc += vec1[k]*vec2[k];
+    return acc;
+  }
+
+  // Largest component of vec, folded with _vector_td_max.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T max( const vector_td<T,D>& vec )
+  {
+    T best = vec[0];
+    for (unsigned int k=1; k<D; ++k)
+      best = _vector_td_max(best,vec[k]);
+    return best;
+  }
+  
+  // Smallest component of vec, folded with _vector_td_min.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T min( const vector_td<T,D>& vec )
+  {
+    T best = vec[0];
+    for (unsigned int k=1; k<D; ++k)
+      best = _vector_td_min(best,vec[k]);
+    return best;
+  }
+
+  // Component-wise minimum of vec1 and vec2.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amin( const vector_td<T,D>& vec1, const vector_td<T,D>& vec2)
+  {
+    vector_td<T,D> lowest;
+    for (unsigned int k=0; k<D; ++k)
+      lowest[k] = _vector_td_min(vec1[k],vec2[k]);
+    return lowest;
+  }
+
+  // Component-wise maximum of vec1 and vec2.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amax( const vector_td<T,D>& vec1, const vector_td<T,D>& vec2)
+  {
+    vector_td<T,D> highest;
+    for (unsigned int k=0; k<D; ++k)
+      highest[k] = _vector_td_max(vec1[k],vec2[k]);
+    return highest;
+  }
+
+  // Component-wise minimum of vec1 and the scalar val (each component is capped at val).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amin( const vector_td<T,D>& vec1, T val)
+  {
+    vector_td<T,D> capped;
+    for (unsigned int k=0; k<D; ++k)
+      capped[k] = _vector_td_min(vec1[k],val);
+    return capped;
+  }
+
+  // Component-wise maximum of vec1 and the scalar val (each component is raised to at least val).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  vector_td<T,D> amax( const vector_td<T,D>& vec1, T val )
+  {
+    vector_td<T,D> floored;
+    for (unsigned int k=0; k<D; ++k)
+      floored[k] = _vector_td_max(vec1[k],val);
+    return floored;
+  }
+
+  // Largest non-NaN component of vec; returns 0 when every component is NaN.
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T max_not_nan( const vector_td<T,D>& vec )
+  {
+    unsigned int i=0;
+    while (i<D && isnan(vec[i])) i++; // stop at D so an all-NaN vector is never read out of bounds
+    if (i >= D) return 0;
+    T res = vec[i];
+    for (++i; i<D; i++)
+      if (!isnan(vec[i])) res = _vector_td_max(res,vec[i]);
+    return res;
+  }
+
+  // Smallest non-NaN component of vec; returns 0 when every component is NaN (same fallback as max_not_nan).
+  template<class T, unsigned int D> __inline__ __host__ __device__
+  T min_not_nan( const vector_td<T,D>& vec )
+  {
+    unsigned int i=0;
+    while (i<D && isnan(vec[i])) i++; // stop at D so an all-NaN vector is never read out of bounds
+    if (i >= D) return 0;
+    T res = vec[i];
+    for (++i; i<D; i++) if (!isnan(vec[i])) res = _vector_td_min(res,vec[i]);
+    return res;
+  }
+
+  // Index of the smallest component; the lowest index wins ties.
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  unsigned int argmin( const vector_td<T,D>& vec )
+  {
+    unsigned int best = 0;
+    for (unsigned int k=1; k<D; ++k)
+      if (vec[k] < vec[best]) best = k;
+    return best;
+  }
+
+  // Index of the smallest non-NaN component; 0 when every component is NaN.
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  unsigned int argmin_not_nan( const vector_td<T,D>& vec )
+  {
+    unsigned int res= 0;
+    for (unsigned int i=1; i<D; i++) // a NaN incumbent compares false against everything, so replace it explicitly
+      if (!isnan(vec[i]) && (isnan(vec[res]) || vec[i] < vec[res])) res = i;
+    return res;
+  }
+
+  // Index of the largest component; the lowest index wins ties.
+  template<class T, unsigned int D> __inline__ __host__ __device__ 
+  unsigned int argmax( const vector_td<T,D>& vec )
+  {
+    unsigned int best = 0;
+    for (unsigned int k=1; k<D; ++k)
+      if (vec[k] > vec[best]) best = k;
+    return best;
+  }
+
+  //
+  // Reductions on reald<REAL,D>
+  //
+
+  // Sum of the squared components of vec.
+  template<class REAL, unsigned int D> __inline__ __host__ __device__ 
+  REAL norm_squared( const vector_td<REAL,D> vec )
+  {
+    REAL acc = REAL(0);
+    for (unsigned int k=0; k<D; ++k)
+      acc += vec[k]*vec[k];
+    return acc;
+  }
+
+  template<class REAL, unsigned int D> __inline__ __host__ __device__ 
+  REAL norm( const vector_td<REAL,D> vec ) // length of vec: square root of norm_squared(vec)
+  {
+    return ::sqrt(norm_squared<REAL,D>(vec)); // the global-namespace sqrt is applied to a REAL value
+  }
+
+}
diff --git a/toolboxes/gadgettools/CMakeLists.txt b/toolboxes/gadgettools/CMakeLists.txt
new file mode 100644
index 0000000..b232e82
--- /dev/null
+++ b/toolboxes/gadgettools/CMakeLists.txt
@@ -0,0 +1,58 @@
+
+IF (WIN32)
+    ADD_DEFINITIONS(-DTIXML_USE_STL)
+ENDIF (WIN32)
+# XSD and Xerces-C are mandatory: REQUIRED makes configuration fail when either is missing.
+find_package(XSD REQUIRED)
+find_package(XercesC REQUIRED)
+
+include_directories(${ACE_INCLUDE_DIR} 
+                    ${Boost_INCLUDE_DIR} 
+                    ${XSD_INCLUDE_DIR} 
+                    ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+                    ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                    ${CMAKE_SOURCE_DIR}/apps/gadgetron
+                    ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+                    ${ISMRMRD_XSD_INCLUDE_DIR} 
+                    ${ISMRMRD_INCLUDE_DIR} )
+
+# Process the XSD files: generate C++ sources from schema/gadgetron.xsd into the build tree.
+SET(XSDS schema/gadgetron.xsd)
+SET(XSD_ARGS cxx-tree --generate-serialization)
+WRAP_XSD(XSDS_SOURCES XSD_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/schema ${XSDS} OPTIONS ${XSD_ARGS})
+INCLUDE_DIRECTORIES(${XSD_INCLUDES} ${XERCESC_INCLUDE_DIR})
+# Print the list of generated sources at configure time.
+message("XSDS_SOURCES is " ${XSDS_SOURCES})
+
+if (MKL_FOUND)
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+endif (MKL_FOUND)
+# Shared library: the XSD-generated sources plus the connector/acceptor/controller code; headers are listed alongside.
+add_library(gadgettools SHARED ${XSDS_SOURCES} 
+                               GadgetImageMessageReader.h 
+                               GadgetImageMessageWriter.h 
+                               gadgettools_export.h 
+                               GadgetronSlotContainer.h 
+                               GadgetronConnector.h 
+                               GadgetronConnector.cpp 
+                               GadgetServerAcceptor.h 
+                               GadgetServerAcceptor.cpp 
+                               GadgetStreamController.h 
+                               GadgetStreamController.cpp
+                               GadgetCloudController.h 
+                               GadgetronCloudConnector.h )
+# NOTE(review): `optimized` / `debug` each apply only to the single item that follows - confirm ACE_LIBRARIES holds one entry.
+target_link_libraries(gadgettools optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} ${XERCESC_LIBRARIES})
+
+install(TARGETS gadgettools DESTINATION lib)
+
+# Install every public header of the library, including the two cloud headers listed in add_library().
+install (FILES  GadgetImageMessageReader.h GadgetImageMessageWriter.h
+            GadgetronConnector.h
+            gadgettools_export.h
+            GadgetronSlotContainer.h
+            GadgetServerAcceptor.h GadgetStreamController.h
+            GadgetCloudController.h GadgetronCloudConnector.h
+            DESTINATION include)
diff --git a/toolboxes/gadgettools/GadgetCloudController.h b/toolboxes/gadgettools/GadgetCloudController.h
new file mode 100644
index 0000000..4b9b523
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetCloudController.h
@@ -0,0 +1,654 @@
+
+#pragma once
+
+#include "ace/Log_Msg.h"
+#include "ace/Synch.h"
+#include "ace/Reactor.h"
+#include "ace/WFMO_Reactor.h"
+#include "ace/TP_Reactor.h"
+#include "ace/SOCK_Stream.h"
+#include "ace/Stream.h"
+#include "ace/Message_Queue.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Reactor_Notification_Strategy.h"
+
+#include <complex>
+#include <vector>
+#include "boost/tuple/tuple.hpp"
+#include "boost/tuple/tuple_comparison.hpp"
+#include "boost/tuple/tuple_io.hpp"
+
+#include "gadgettools_export.h"
+#include "Gadgetron.h"
+#include "Gadget.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronCloudConnector.h"
+#include "GadgetImageMessageReader.h"
+#include "GadgetImageMessageWriter.h"
+
+namespace Gadgetron
+{
+
+template<typename JobType> 
+class GadgetCloudJobProcessHandler
+{
+public:
+    // Hook interface: GadgetCloudController::waitForJobToComplete() calls processJob() for each job it receives.
+    GadgetCloudJobProcessHandler() {}
+    virtual ~GadgetCloudJobProcessHandler() {}
+    // Default implementation does nothing and reports success; override to post-process a returned job.
+    virtual bool processJob(int jobID, JobType& ajob) { return true; }
+};
+
+template<typename JobType> 
+class GadgetCloudController : public ACE_Task<ACE_MT_SYNCH>
+{
+public:
+    // one cloud node: <0> host name or IP, <1> port, <2> xml configuration file name; <3> is not read in this header
+    typedef boost::tuple<std::string, std::string, std::string, unsigned int> CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    GadgetCloudController();
+    virtual ~GadgetCloudController();
+
+    // this GadgetCloudController runs in the passive mode; open() only configures the message queue
+    virtual int open(void* = 0);
+
+    virtual int close(unsigned long flags);
+
+    // create one connector per node of the cloud and register readers[i] / writers[i] on connector i
+    int createConnector(const CloudType& cloud, 
+        size_t msgID_reader, std::vector<GadgetMessageReader*>& readers, 
+        size_t msgID_writer, std::vector<GadgetMessageWriter*>& writers);
+
+    // connect to the cloud hosts; createConnector must be called first
+    // cloud[i].get<0>(): the host name or IP address of node i
+    // cloud[i].get<1>(): the port number of node i
+    // cloud[i].get<2>(): the xml configuration file name sent to node i
+    int connectToCloud(const CloudType& cloud);
+
+    // send jobs to the nodes; the jobs are collected later by waitForJobToComplete
+    // for every job, node_ids gives the node this job is sent to (-1 : not sent, handled locally)
+    // this function can be called repeatedly; waitForJobToComplete waits for all jobs ever sent
+    int runJobsOnCloud(std::vector<JobType*>& job_list, std::vector<JobType*>& completed_job_list, const std::vector<int>& node_ids);
+    // convenience overload taking the jobs by value; the controller keeps pointers to the vector elements
+    int runJobsOnCloud(std::vector<JobType>& job_list, std::vector<JobType>& completed_job_list, const std::vector<int>& node_ids);
+
+    // should be called after calling runJobsOnCloud
+    int waitForJobToComplete();
+
+    // send close message to all nodes
+    int closeCloudNode();
+
+    virtual int handle_close (ACE_HANDLE handle, ACE_Reactor_Mask close_mask);
+
+    // set jobs on a node to be completed
+    // if jobID==-1, all jobs of this node are set to be completed
+    int setJobsTobeCompleted(unsigned int nodeID, int jobID=-1);
+
+    // append the given jobs, with their node ids and status values, to the lists below
+    int appendJobList(std::vector<JobType*>& job_list, 
+        std::vector<JobType*>& completed_job_list, 
+        std::vector<int>& node_id_used, std::vector<int>& job_status);
+
+    // list to store jobs sent to nodes
+    std::vector<JobType*> job_list_;
+    // list to store completed jobs from the nodes
+    std::vector<JobType*> completed_job_list_;
+    // for every job, indicate which node a job is sent to
+    std::vector<int> node_id_used_;
+    // job status, 0/-1 : completed/not completed
+    std::vector<int> job_status_;
+
+    // a function handler to process a job after it is received
+    // this is a hook to give the user a chance to do some processing after receiving every job; may be NULL
+    GadgetCloudJobProcessHandler<JobType>* job_handler_;
+
+private:
+
+    // connector to every node
+    // one connector for a node
+    // node id starts from 0, and increases by 1
+    std::vector<GadgetronCloudConnector<JobType>* > cloud_connectors_;
+
+    size_t cloud_msg_id_reader_;
+    size_t cloud_msg_id_writer_;
+
+    // number of nodes in the cloud, as given to createConnector / connectToCloud
+    unsigned int number_of_nodes_;
+
+    // node status, 0/-1 : available/unavailable
+    std::vector<int> node_status_;
+
+    // held by setJobsTobeCompleted, appendJobList and the completion scan in waitForJobToComplete
+    ACE_Thread_Mutex cloud_controller_mutex_;
+};
+
+// Starts with no job handler and zero nodes; both message ids default to GADGET_MESSAGE_CLOUD_JOB.
+template <typename JobType> 
+GadgetCloudController<JobType>::GadgetCloudController() // initializers follow the member declaration order
+    : job_handler_(NULL), cloud_msg_id_reader_(GADGET_MESSAGE_CLOUD_JOB), cloud_msg_id_writer_(GADGET_MESSAGE_CLOUD_JOB), number_of_nodes_(0)
+{}
+
+// Deactivates this task's message queue, then closes and deletes every connector still held.
+template <typename JobType> 
+GadgetCloudController<JobType>::~GadgetCloudController()
+{
+    GADGET_DEBUG1("Into ~GadgetCloudController() ... \n");
+    this->msg_queue()->deactivate();
+
+    for ( size_t n=0; n<cloud_connectors_.size(); ++n )
+    {
+        GadgetronCloudConnector<JobType>*& connector = cloud_connectors_[n];
+        if ( connector == NULL ) continue; // nothing to clean up in this slot
+        connector->close();
+        delete connector;
+        connector = NULL;
+        GADGET_DEBUG1("~GadgetCloudController() : clean connectors done \n");
+    }
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::open(void* p)
+{
+    GADGET_DEBUG1("GadgetCloudController::open\n");
+
+    // set the high water mark of the message queue to 24*1024^3 bytes (24GB); the argument p is not used
+    this->msg_queue()->high_water_mark(24.0*1024*1024*1024);
+
+    return 0;
+}
+
+// When flags == 1: posts an MB_HANGUP block to this task's own queue and returns the result of wait().
+// For any other flags value nothing is queued and 0 is returned.
+template <typename JobType> 
+int GadgetCloudController<JobType>::close(unsigned long flags)
+{
+    GADGET_DEBUG1("GadgetCloudController::close\n");
+    if (flags != 1) return 0;
+
+    ACE_Message_Block *hangupBlock = new ACE_Message_Block();
+    hangupBlock->msg_type( ACE_Message_Block::MB_HANGUP );
+    if (this->putq(hangupBlock) == -1)
+    {
+        hangupBlock->release();
+        ACE_ERROR_RETURN( (LM_ERROR,
+                ACE_TEXT("%p\n"),
+                ACE_TEXT("GadgetCloudController::close, putq")),
+                -1);
+    }
+
+    return this->wait();
+}
+
+// Allocates one GadgetronCloudConnector per entry of `cloud`, gives it its node id and registers
+// readers[i] / writers[i] on it under the given message ids. Returns -1 when readers or writers do
+// not hold exactly one entry per node, or when a connector cannot be allocated; 0 otherwise.
+template <typename JobType> 
+int GadgetCloudController<JobType>::createConnector(const CloudType& cloud, 
+    size_t msgID_reader, std::vector<GadgetMessageReader*>& readers, 
+    size_t msgID_writer, std::vector<GadgetMessageWriter*>& writers)
+{
+    number_of_nodes_ = cloud.size();
+    if ( readers.size() != number_of_nodes_ ) return -1;
+    if ( writers.size() != number_of_nodes_ ) return -1;
+
+    // newly added slots start with a NULL connector and status -1 (unavailable)
+    cloud_connectors_.resize(number_of_nodes_, NULL);
+    node_status_.resize(number_of_nodes_, -1);
+
+    cloud_msg_id_reader_ = msgID_reader;
+    cloud_msg_id_writer_ = msgID_writer;
+
+    for( unsigned int node=0; node<number_of_nodes_; ++node )
+    {
+        GadgetronCloudConnector<JobType>* connector;
+        ACE_NEW_RETURN (connector, GadgetronCloudConnector<JobType>, -1);
+        cloud_connectors_[node] = connector;
+
+        connector->nodeID_ = node;
+        connector->register_reader(cloud_msg_id_reader_, readers[node] );
+        connector->register_writer(cloud_msg_id_writer_, writers[node] );
+        connector->set_cloud_controller(this);
+    }
+
+    return 0;
+}
+
+// Connects to every node listed in `cloud`; createConnector() must already have been called with a
+// cloud of the same size. For node i, cloud[i].get<0>() / get<1>() are passed to the connector's
+// open() and cloud[i].get<2>() to send_gadgetron_configuration_file(). A node is marked available
+// (node_status_ 0, connector status true) only when both calls return 0.
+// Returns 0 if at least one node has status 0 afterwards, -1 otherwise or when a connector is NULL.
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+connectToCloud(const CloudType& cloud)
+{
+    number_of_nodes_ = cloud.size();
+    if ( cloud_connectors_.size() != number_of_nodes_ )
+    {
+        return -1;
+    }
+
+    // resize() only initializes slots that are new; existing entries keep their value
+    node_status_.resize(number_of_nodes_, -1);
+
+    bool anyNodeReady = false;
+    for( unsigned int node=0; node<number_of_nodes_; ++node )
+    {
+        GadgetronCloudConnector<JobType>* connector = cloud_connectors_[node];
+        if ( connector == NULL )
+        {
+            return -1;
+        }
+
+        std::string host = cloud[node].get<0>();
+        std::string port = cloud[node].get<1>();
+
+        // step 1: open the connection
+        if ( connector->open(cloud[node].get<0>(), cloud[node].get<1>())!=0 )
+        {
+            connector->set_status(false);
+
+            ACE_Time_Value delay(GADGETRON_TIMEOUT_PERIOD);
+            ACE_OS::sleep(delay);
+
+            GADGET_DEBUG2("Open connection to (%s):%s failed ... \n", host.c_str(), port.c_str());
+        }
+        // step 2: send the xml file
+        else if (connector->send_gadgetron_configuration_file(cloud[node].get<2>()) != 0)
+        {
+            ACE_Time_Value delay(GADGETRON_TIMEOUT_PERIOD);
+            ACE_OS::sleep(delay);
+
+            GADGET_DEBUG2("Unable to send XML configuration to the Gadgetron cloud host (%s):%s ... \n", host.c_str(), port.c_str());
+        }
+        else
+        {
+            // indicate this node can be used
+            node_status_[node] = 0;
+            connector->set_status(true);
+        }
+
+        // report the outcome for this node and remember whether any node is usable
+        if ( node_status_[node] == 0 )
+        {
+            GADGET_DEBUG2("--> Node (%s):%s is ready ... \n", host.c_str(), port.c_str());
+            anyNodeReady = true;
+        }
+        else
+        {
+            GADGET_DEBUG2("--> Node (%s):%s is NOT ready ... \n", host.c_str(), port.c_str());
+        }
+    }
+
+    if ( !anyNodeReady )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to find even one good node ... \n")));
+        return -1;
+    }
+
+    return 0;
+}
+
+// Registers the given jobs with the controller and queues each one on the connector of its node.
+// job_list[i] goes to node node_ids[i]; waitForJobToComplete() later writes the result to
+// *completed_job_list[i]. A negative node id (-1 is the documented value), an empty cloud or an
+// unavailable node marks the job as local: it is recorded as completed and nothing is sent. Ids past
+// the last node wrap around. Returns 0 on success, -1 on bad arguments or if a job cannot be queued.
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+runJobsOnCloud(std::vector<JobType*>& job_list, std::vector<JobType*>& completed_job_list, const std::vector<int>& node_ids)
+{
+    ACE_DEBUG((LM_INFO, ACE_TEXT("(%t) GadgetCloudController : into runJobsOnCloud(...) ... \n")));
+
+    if ( job_list.empty() )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list is empty ... \n")));
+        return -1;
+    }
+
+    if ( completed_job_list.empty() )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : completed job list is empty ... \n")));
+        return -1;
+    }
+
+    if ( job_list.size() != completed_job_list.size() )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list size does not match ... \n")));
+        return -1;
+    }
+
+    if ( job_list.size() != node_ids.size() )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list size does not match the node id size ... \n")));
+        return -1;
+    }
+
+    std::vector<int> node_ids_used(node_ids);
+
+    unsigned int numOfJobs = job_list.size();
+    std::vector<int> job_status(numOfJobs, -1);
+
+    unsigned int ii;
+    for( ii=0; ii<numOfJobs; ii++ )
+    {
+        int nodeID = node_ids_used[ii];
+        // negative ids and an empty cloud cannot name a node: the job is handled locally
+        if ( nodeID < 0 || number_of_nodes_ == 0 )
+        {
+            node_ids_used[ii] = -1;
+            job_status[ii] = 0;
+            continue;
+        }
+
+        // wrap ids past the last node (valid ids are 0 .. number_of_nodes_-1) and store the wrapped
+        // id, so the send loop below and setJobsTobeCompleted() see the node that is really used
+        if ( static_cast<unsigned int>(nodeID) >= number_of_nodes_ )
+        {
+            nodeID = static_cast<int>( static_cast<unsigned int>(nodeID) % number_of_nodes_ );
+            node_ids_used[ii] = nodeID;
+        }
+
+        if ( node_status_[nodeID] < 0 )
+        {
+            node_ids_used[ii] = -1; // local node to perform this job
+            job_status[ii] = 0;
+        }
+
+        GADGET_DEBUG2("--> node for job %d is %d ... \n", ii, node_ids_used[ii]);
+    }
+
+    // append incoming jobs into the list; job ids continue after the jobs of earlier calls
+    unsigned int startJobID = job_list_.size();
+
+    if ( this->appendJobList(job_list, completed_job_list, node_ids_used, job_status) == -1 )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to append job list ... \n")));
+        return -1;
+    }
+
+    for( ii=0; ii<numOfJobs; ii++ )
+    {
+        int nodeID = node_ids_used[ii];
+        if ( nodeID == -1 )
+        {
+            GADGET_DEBUG2("--> node for job %d is NOT ready ... \n", ii+startJobID);
+            continue;
+        }
+
+        // check availability before allocating, so nothing is left unreleased for a node that is skipped
+        if ( node_status_[nodeID] != 0 ) continue;
+
+        // send job to a node as the chain <message id> -> <job id> -> <job>
+        GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+                new GadgetContainerMessage<GadgetMessageIdentifier>();
+
+        m1->getObjectPtr()->id = cloud_msg_id_writer_;
+
+        GadgetContainerMessage<int>* m2 =
+                new GadgetContainerMessage<int>();
+
+        *(m2->getObjectPtr()) = ii+startJobID;
+
+        GadgetContainerMessage<JobType>* m3 =
+                new GadgetContainerMessage<JobType>();
+
+        *(m3->getObjectPtr()) = *(job_list[ii]);
+        m1->cont(m2);
+        m2->cont(m3);
+
+        if (cloud_connectors_[nodeID]->putq(m1) == -1)
+        {
+            m1->release(); // the queue did not take the chain, so free it here (m2 and m3 hang off m1)
+            ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send job package %d on queue for node %d \n"), ii+startJobID, nodeID));
+            return -1;
+        }
+        GADGET_DEBUG2("Send job %d to node %d ... \n", ii+startJobID, nodeID);
+    }
+
+    GADGET_DEBUG1("GadgetCloudController - all jobs sent ... \n");
+
+    return 0;
+}
+
+// Convenience overload: builds pointer lists that refer to the elements of job_list and
+// completed_job_list (the controller keeps these pointers) and forwards to the pointer-based overload.
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+runJobsOnCloud(std::vector<JobType>& job_list, std::vector<JobType>& completed_job_list, const std::vector<int>& node_ids)
+{
+    const size_t numJobs = job_list.size();
+    if ( numJobs != completed_job_list.size() )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list size does not match ... \n")));
+        return -1;
+    }
+
+    if ( numJobs != node_ids.size() )
+    {
+        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list size does not match the node id size ... \n")));
+        return -1;
+    }
+
+    std::vector<JobType*> jobPtr, completedJobPtr;
+    jobPtr.reserve(numJobs);
+    completedJobPtr.reserve(numJobs);
+    for ( size_t j=0; j<numJobs; ++j )
+    {
+        jobPtr.push_back(&job_list[j]);
+        completedJobPtr.push_back(&completed_job_list[j]);
+    }
+
+    return runJobsOnCloud(jobPtr, completedJobPtr, node_ids);
+}
+
+// Queues a GADGET_MESSAGE_CLOSE message on the connector of every node.
+// Returns 0 on success, -1 as soon as one connector queue refuses the message.
+template <typename JobType> 
+int GadgetCloudController<JobType>::
+closeCloudNode()
+{
+    GADGET_DEBUG1("GadgetCloudController : into closeCloudNode(...) ... \n");
+
+    for( unsigned int nodeID=0; nodeID<number_of_nodes_; nodeID++ )
+    {
+        // skip slots that hold no connector (the destructor applies the same NULL check)
+        if ( nodeID >= cloud_connectors_.size() || cloud_connectors_[nodeID] == NULL )
+        {
+            continue;
+        }
+
+        // send the close message for this node
+        GadgetContainerMessage<GadgetMessageIdentifier>* m = new GadgetContainerMessage<GadgetMessageIdentifier>();
+        m->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+
+        if (cloud_connectors_[nodeID]->putq(m) == -1)
+        {
+            // the queue did not take the message, so release it here (close() does the same for its hangup block)
+            m->release();
+            ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send CLOSE package on queue for node %d \n"), static_cast<int>(nodeID)));
+            return -1;
+        }
+    }
+
+    GADGET_DEBUG1("GadgetCloudController - close message sent to all nodes ... \n");
+
+    return 0;
+}
+
+// Blocks on this task's message queue and collects returned jobs. A message is <int jobID> followed by
+// <JobType>; the job is copied to *completed_job_list_[jobID], given to job_handler_ (if set) and its
+// status cleared. Stops when all job_status_ entries are 0, on a malformed message or when getq() fails,
+// then wait()s on every connector whose status() is true. Returns 0, or -1 if the mutex cannot be taken.
+template <typename JobType> 
+int GadgetCloudController<JobType>::waitForJobToComplete()
+{
+    // block the caller thread
+    GADGET_DEBUG1("GadgetCloudController waitForJobToComplete ... \n");
+
+    ACE_Message_Block *mb = 0;
+
+    //collect an incoming package if we have one
+    while (this->getq (mb) != -1)
+    {
+        GadgetContainerMessage<int>* m_jobID =
+            AsContainerMessage<int>(mb);
+
+        if ( !m_jobID )
+        {
+            ACE_DEBUG ((LM_INFO, ACE_TEXT ("Invalid message id in the GadgetCloudController queue\n")));
+            mb->release(); // a dequeued block is ours to free, even when it is rejected
+            break;
+        }
+
+        int jobID = *(m_jobID->getObjectPtr());
+
+        if ( jobID != -1 )
+        {
+            GadgetContainerMessage<JobType>* job =
+                AsContainerMessage<JobType>(mb->cont());
+
+            // also reject job ids that do not name a slot of completed_job_list_
+            if ( !job || jobID < 0 || static_cast<size_t>(jobID) >= completed_job_list_.size() )
+            {
+                ACE_DEBUG ((LM_INFO, ACE_TEXT ("Invalid message obj in the GadgetCloudController queue\n")));
+                mb->release();
+                break;
+            }
+
+            *(completed_job_list_[jobID]) = *(job->getObjectPtr());
+            ACE_DEBUG ((LM_INFO, ACE_TEXT ("--> receive completed job : %d ... \n"), jobID));
+
+            if ( job_handler_ != NULL && !job_handler_->processJob( jobID, *(completed_job_list_[jobID]) ) )
+            {
+                ACE_DEBUG ((LM_INFO, ACE_TEXT ("job_handler_->processJob after receiving failed\n")));
+            }
+        }
+        else
+        {
+            ACE_DEBUG ((LM_INFO, ACE_TEXT ("--> receive jobID == -1 ... \n")));
+        }
+
+        mb->release();
+
+        // if all jobs are received, notice the caller thread
+        bool allJobProcessed = true;
+        {
+            ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+            // job_status_ is also written by setJobsTobeCompleted()/appendJobList(), so mark the job under the mutex
+            if ( jobID != -1 && static_cast<size_t>(jobID) < job_status_.size() ) job_status_[jobID] = 0;
+            for ( unsigned int ii=0; ii<job_status_.size(); ii++ )
+            {
+                if ( job_status_[ii] != 0 )
+                {
+                    allJobProcessed = false;
+                    break;
+                }
+            }
+        }
+
+        if ( allJobProcessed )
+        {
+            ACE_DEBUG ((LM_INFO, ACE_TEXT ("All jobs are completed and returned on GadgetCloudController queue\n")));
+            break;
+        }
+    }
+
+    // need to wait for all reader task to complete
+    for( unsigned int ii=0; ii<number_of_nodes_; ii++ )
+    {
+        if ( cloud_connectors_[ii]->status() ) cloud_connectors_[ii]->wait();
+    }
+
+    ACE_DEBUG((LM_INFO, ACE_TEXT("(%t) GadgetCloudController waitForJobToComplete done ... \n")));
+    return 0;
+}
+
+template <typename JobType> 
+int GadgetCloudController<JobType>::handle_close(ACE_HANDLE handle, ACE_Reactor_Mask close_mask)
+{
+    GADGET_DEBUG1("GadgetCloudController handling close...\n"); // neither argument is used
+    return this->wait(); // the result of wait() is passed straight back to the caller
+}
+
+// Marks jobs assigned to node `nodeID` as completed (status 0) while holding the controller mutex.
+// With a jobID inside [0, job_status_.size()) that single entry is cleared, provided at least one
+// job is assigned to the node; with any other jobID every job of the node is cleared.
+template<typename JobType> 
+int GadgetCloudController<JobType>::setJobsTobeCompleted(unsigned int nodeID, int jobID)
+{
+    ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+    try
+    {
+        // true when the caller named one specific, existing job
+        const bool singleJob = ( jobID>=0 && jobID<this->job_status_.size() );
+
+        const unsigned int numJobs = this->node_id_used_.size();
+        for ( unsigned int j=0; j<numJobs; ++j )
+        {
+            if ( this->node_id_used_[j] != nodeID ) continue;
+
+            // either the named job, or the job found on this node
+            const unsigned int target = singleJob ? jobID : j;
+            this->job_status_[target] = 0;
+        }
+    }
+    catch(...)
+    {
+        // reported and turned into an error code
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudController, setJobsTobeCompleted() failed ... \n")) );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Appends the four parallel per-job sequences to the controller's own lists while holding the mutex.
+// All four inputs must have the same length; otherwise nothing is appended and -1 is returned.
+template<typename JobType> 
+int GadgetCloudController<JobType>::appendJobList(std::vector<JobType*>& job_list, 
+        std::vector<JobType*>& completed_job_list, 
+        std::vector<int>& node_id_used, std::vector<int>& job_status)
+{
+    ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, cloud_controller_mutex_, -1);
+    try
+    {
+        const unsigned int numJobs = job_list.size();
+        if ( numJobs != completed_job_list.size() )
+        {
+            ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController appendJobList: job list size does not match ... \n")));
+            return -1;
+        }
+
+        if ( numJobs != node_id_used.size() )
+        {
+            ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController appendJobList: node_id_used size does not match ... \n")));
+            return -1;
+        }
+
+        if ( numJobs != job_status.size() )
+        {
+            ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController appendJobList: job_status size does not match ... \n")));
+            return -1;
+        }
+
+        // entry j of each list describes the same job, so the four lists grow together
+        for ( unsigned int j=0; j!=numJobs; ++j )
+        {
+            job_list_.push_back(job_list[j]);
+            completed_job_list_.push_back(completed_job_list[j]);
+            node_id_used_.push_back(node_id_used[j]);
+            job_status_.push_back(job_status[j]);
+        }
+    }
+    catch(...)
+    {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetCloudController, appendJobList() failed ... \n")) );
+        return -1;
+    }
+    return 0;
+}
+
+}
diff --git a/toolboxes/gadgettools/GadgetImageMessageReader.h b/toolboxes/gadgettools/GadgetImageMessageReader.h
new file mode 100644
index 0000000..700eaaf
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetImageMessageReader.h
@@ -0,0 +1,71 @@
+#ifndef GADGETSOCKETRECEIVER_H
+#define GADGETSOCKETRECEIVER_H
+
+#include "ace/SOCK_Stream.h"
+#include "ace/Task.h"
+
+#include <complex>
+#include <iostream>
+
+#include "GadgetMRIHeaders.h"
+#include "ismrmrd.h"
+#include "hoNDArray.h"
+#include "GadgetMessageInterface.h"
+
+namespace Gadgetron
+{
+
+/**
+Default implementation of GadgetMessageReader for Image messages
+*/
+
+template <typename T> class GadgetImageMessageReader : public GadgetMessageReader
+{
+public:
+    // Reads one image message from `stream`: an ISMRMRD::ImageHeader followed by the image data (elements of T).
+    // The array is sized matrix_size[0..2], plus a fourth dimension of `channels` when channels > 1.
+    // Returns the header block with the data block chained as its continuation, or 0 on any failure.
+    virtual ACE_Message_Block* read(ACE_SOCK_Stream* stream) 
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* imgh = 
+            new GadgetContainerMessage<ISMRMRD::ImageHeader>();
+        ssize_t recv_count = 0;
+        if ((recv_count = stream->recv_n(imgh->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0) {
+            ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetImageMessageReader, failed to read IMAGE Header\n")) );
+            imgh->release();
+            return 0;
+        }
+
+        std::vector<size_t> dims(3);
+        dims[0] = imgh->getObjectPtr()->matrix_size[0];
+        dims[1] = imgh->getObjectPtr()->matrix_size[1];
+        dims[2] = imgh->getObjectPtr()->matrix_size[2];
+        if (imgh->getObjectPtr()->channels > 1) {
+            dims.push_back(imgh->getObjectPtr()->channels);
+        } 
+
+        GadgetContainerMessage< hoNDArray< T > >* data =
+            new GadgetContainerMessage< hoNDArray< T > >();
+        try{ data->getObjectPtr()->create(&dims);}
+        catch (std::runtime_error &err){
+            GADGET_DEBUG_EXCEPTION(err,"GadgetImageMessageReader, failed to allocate memory\n");
+            data->release(); // not yet chained to imgh, so it has to be released separately
+            imgh->release();
+            return 0;
+        }
+
+        imgh->cont(data); // from here on the data block is released through imgh
+
+        if ((recv_count = stream->recv_n(data->getObjectPtr()->get_data_ptr(), sizeof(T)*data->getObjectPtr()->get_number_of_elements())) <= 0) {
+            ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetImageMessageReader, failed to read data from socket\n")) );
+            imgh->release();
+            return 0;
+        }
+
+        return imgh;
+    }
+};
+
+}
+
+#endif //GADGETSOCKETRECEIVER_H
diff --git a/toolboxes/gadgettools/GadgetImageMessageWriter.h b/toolboxes/gadgettools/GadgetImageMessageWriter.h
new file mode 100644
index 0000000..0ab154f
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetImageMessageWriter.h
@@ -0,0 +1,84 @@
+#ifndef GADGETSOCKETSENDER_H
+#define GADGETSOCKETSENDER_H
+
+#include "ace/SOCK_Stream.h"
+#include "ace/Task.h"
+
+#include <complex>
+
+#include "GadgetMRIHeaders.h"
+#include "hoNDArray.h"
+#include "GadgetContainerMessage.h"
+#include "GadgetMessageInterface.h"
+
+namespace Gadgetron
+{
+
+/**
+Default implementation of GadgetMessageWriter for Image messages
+*/
+
+template <typename T> class GadgetImageMessageWriter : public GadgetMessageWriter
+{
+public:
+    /**
+    Write one image message to the socket.
+
+    mb must be a GadgetContainerMessage<ISMRMRD::ImageHeader> whose
+    continuation is a GadgetContainerMessage< hoNDArray<T> >. Three pieces
+    are sent in order: the message identifier (chosen from sizeof(T)), the
+    image header, and the raw pixel data.
+
+    Returns 0 on success and -1 when the message blocks have the wrong type,
+    sizeof(T) is not 2, 4 or 8, or any send fails.
+    */
+    virtual int write(ACE_SOCK_Stream* sock, ACE_Message_Block* mb) 
+    {
+        GadgetContainerMessage<ISMRMRD::ImageHeader>* imagemb = 
+            dynamic_cast< GadgetContainerMessage<ISMRMRD::ImageHeader>* >(mb);
+
+        // Only follow the continuation pointer once the header cast has succeeded
+        GadgetContainerMessage< hoNDArray< T > >* datamb = imagemb ?
+            dynamic_cast< GadgetContainerMessage< hoNDArray< T > >* >(imagemb->cont()) : 0;
+
+        if (!imagemb || !datamb) {
+            ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), GadgetImageMessageWriter invalid image message objects")) );
+            return -1;
+        }
+
+
+        ssize_t send_cnt = 0;
+        GadgetMessageIdentifier id;
+
+        // The message ID is derived from the element size only
+        switch (sizeof(T)) {
+        case 2: //Unsigned short
+            id.id = GADGET_MESSAGE_IMAGE_REAL_USHORT;
+            break;
+        case 4: //Float
+            id.id = GADGET_MESSAGE_IMAGE_REAL_FLOAT;
+            break;
+        case 8: //Complex float
+            id.id = GADGET_MESSAGE_IMAGE_CPLX_FLOAT;
+            break;
+        default:
+            ACE_DEBUG( (LM_ERROR, ACE_TEXT("(%P,%l), GadgetImageMessageWriter Wrong data size detected:")) );
+            return -1;
+        }
+
+        if ((send_cnt = sock->send_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+            ACE_DEBUG ((LM_ERROR,
+                ACE_TEXT ("(%P|%t) Unable to send image message identifier\n")));
+
+            return -1;
+        }
+
+        if ((send_cnt = sock->send_n (imagemb->getObjectPtr(), sizeof(ISMRMRD::ImageHeader))) <= 0) {
+            ACE_DEBUG ((LM_ERROR,
+                ACE_TEXT ("(%P|%t) Unable to send image header\n")));
+
+            return -1;
+        }
+
+        if ((send_cnt = sock->send_n (datamb->getObjectPtr()->get_data_ptr(), sizeof(T)*datamb->getObjectPtr()->get_number_of_elements())) <= 0) {
+            ACE_DEBUG ((LM_ERROR,
+                ACE_TEXT ("(%P|%t) Unable to send image data\n")));
+
+            return -1;
+        }
+
+        return 0;
+    }
+
+};
+
+}
+
+#endif //GADGETSOCKETSENDER_H
diff --git a/toolboxes/gadgettools/GadgetServerAcceptor.cpp b/toolboxes/gadgettools/GadgetServerAcceptor.cpp
new file mode 100644
index 0000000..48270b9
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetServerAcceptor.cpp
@@ -0,0 +1,58 @@
+#include "GadgetServerAcceptor.h"
+#include "GadgetStreamController.h"
+
+using namespace Gadgetron;
+
+// Destruction goes through the same cleanup path as an explicit close.
+GadgetServerAcceptor::~GadgetServerAcceptor ()
+{
+  handle_close (ACE_INVALID_HANDLE, 0);
+}
+
+// Open the listening socket on listen_addr and register this handler with
+// the reactor for accept events. Returns -1 when the socket cannot be opened,
+// otherwise the result of register_handler.
+int GadgetServerAcceptor::open (const ACE_INET_Addr &listen_addr)
+{
+  const int opened = this->acceptor_.open (listen_addr, 1);
+  if (opened == -1) {
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       ACE_TEXT ("%p\n"),
+                       ACE_TEXT ("acceptor.open")),
+                      -1);
+  }
+
+  return reactor ()->register_handler (this,
+                                       ACE_Event_Handler::ACCEPT_MASK);
+}
+
+
+
+
+// Accept one incoming connection and hand it to a new GadgetStreamController.
+// Returns -1 when the controller cannot be allocated or the accept fails,
+// 0 otherwise (including when the controller fails to open and is closed).
+int GadgetServerAcceptor::handle_input (ACE_HANDLE)
+{
+  GadgetStreamController *controller;
+  ACE_NEW_RETURN (controller, GadgetStreamController, -1);
+
+  // The guard deletes the controller if accept fails below
+  auto_ptr<GadgetStreamController> guard (controller);
+
+  const int accepted = this->acceptor_.accept (controller->peer ());
+  if (accepted == -1) {
+    ACE_ERROR_RETURN ((LM_ERROR,
+                       ACE_TEXT ("(%P|%t) %p\n"),
+                       ACE_TEXT ("Failed to accept ")
+                       ACE_TEXT ("controller connection")),
+                      -1);
+  }
+
+  // Connection established: the guard gives up ownership
+  guard.release ();
+
+  controller->reactor (this->reactor ());
+
+  const int opened = controller->open ();
+  if (opened == -1) {
+    controller->handle_close (ACE_INVALID_HANDLE, 0);
+  }
+
+  return 0;
+}
+
+// Unregister from the reactor and close the listening socket if it is still
+// open. Always returns 0.
+int GadgetServerAcceptor::handle_close (ACE_HANDLE, ACE_Reactor_Mask)
+{
+  ACE_DEBUG( (LM_DEBUG, 
+	      ACE_TEXT("GadgetServerAcceptor::handle_close")) );
+  
+  GADGET_DEBUG1("Close Data Acceptor\n");
+
+  // Nothing to tear down when the socket was never opened or is already closed
+  if (acceptor_.get_handle () == ACE_INVALID_HANDLE) {
+    return 0;
+  }
+
+  const ACE_Reactor_Mask removal_mask =
+    ACE_Event_Handler::ACCEPT_MASK | ACE_Event_Handler::DONT_CALL;
+  reactor ()->remove_handler (this, removal_mask);
+  acceptor_.close ();
+
+  return 0;
+}
diff --git a/toolboxes/gadgettools/GadgetServerAcceptor.h b/toolboxes/gadgettools/GadgetServerAcceptor.h
new file mode 100644
index 0000000..d79e2b3
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetServerAcceptor.h
@@ -0,0 +1,27 @@
+#ifndef _GADGETSERVERACCEPTOR_H
+#define _GADGETSERVERACCEPTOR_H
+
+#include "ace/SOCK_Acceptor.h"
+#include "ace/Reactor.h"
+#include "gadgettools_export.h"
+
+namespace Gadgetron{
+// Reactor event handler that owns the listening socket and creates a
+// stream controller for every accepted connection.
+class EXPORTGADGETTOOLS GadgetServerAcceptor : public ACE_Event_Handler
+{
+public:
+  // Runs handle_close so the socket is released on destruction.
+  virtual ~GadgetServerAcceptor ();
+
+  // Open the listening socket on listen_addr and register with the reactor.
+  // Returns -1 on failure.
+  int open (const ACE_INET_Addr &listen_addr);
+
+  // Handle of the listening socket.
+  virtual ACE_HANDLE get_handle (void) const
+    { return this->acceptor_.get_handle (); }
+
+  // Accept a pending connection.
+  virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+
+  // Unregister from the reactor and close the listening socket.
+  virtual int handle_close (ACE_HANDLE handle,
+                            ACE_Reactor_Mask close_mask);
+protected:
+  // Listening socket
+  ACE_SOCK_Acceptor acceptor_;
+};
+}
+#endif //_GADGETSERVERACCEPTOR_H
diff --git a/toolboxes/gadgettools/GadgetStreamController.cpp b/toolboxes/gadgettools/GadgetStreamController.cpp
new file mode 100644
index 0000000..d9586a9
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetStreamController.cpp
@@ -0,0 +1,459 @@
+#include "ace/OS_NS_stdlib.h"
+#include "ace/OS_NS_string.h"
+#include "ace/OS_NS_stdio.h"
+#include "ace/DLL.h"
+#include "ace/DLL_Manager.h"
+#include "ace/OS_NS_netdb.h"
+
+#include "GadgetStreamController.h"
+#include "GadgetContainerMessage.h"
+#include "Gadget.h"
+#include "EndGadget.h"
+
+#include "gadgetron.hxx" //Auto generated class representation of gadgetron XML configuration
+#include "url_encode.h"
+
+#include <complex>
+#include <fstream>
+
+using namespace Gadgetron;
+// Prepare a freshly accepted connection: hook the message queue up to the
+// reactor, register the built-in configuration readers, open the stream with
+// an EndGadget tail, start the writer task and register for read events.
+// Returns -1 when the tail module cannot be allocated, otherwise the result
+// of register_handler.
+int GadgetStreamController::open (void)
+{
+	//Enqueuing a packet on the controller's message queue triggers a write through the notifier.
+	this->notifier_.reactor (this->reactor ());
+	this->msg_queue ()->notification_strategy (&this->notifier_);
+	this->msg_queue ()->high_water_mark((size_t)(48.0*1024*1024*1024));
+
+	//Log the remote endpoint when it can be determined
+	ACE_TCHAR remote_name[MAXHOSTNAMELEN];
+	ACE_INET_Addr remote_addr;
+	if (peer().get_remote_addr (remote_addr) == 0 &&
+			remote_addr.addr_to_string (remote_name, MAXHOSTNAMELEN) == 0) {
+		ACE_DEBUG ((LM_DEBUG,
+				ACE_TEXT ("(%P|%t) Connection from %s\n"),
+				remote_name));
+	}
+
+	//These basic readers are required to receive the configuration for the stream
+	readers_.insert(GADGET_MESSAGE_CONFIG_FILE,
+			new GadgetMessageConfigFileReader());
+	readers_.insert(GADGET_MESSAGE_CONFIG_SCRIPT,
+			new GadgetMessageScriptReader());
+	readers_.insert(GADGET_MESSAGE_PARAMETER_SCRIPT,
+			new GadgetMessageScriptReader());
+
+	//The stream starts out with only an EndGadget as its tail module
+	GadgetModule *head = 0;
+	GadgetModule *tail = 0;
+
+	Gadget* end_gadget = new EndGadget();
+	if (end_gadget) {
+		end_gadget->set_controller(this);
+	}
+
+	ACE_NEW_RETURN(tail,
+			ACE_Module<ACE_MT_SYNCH>( ACE_TEXT("EndGadget"),
+					end_gadget ),
+					-1);
+
+	stream_.open(0,head,tail);
+
+	this->writer_task_.open();
+
+	return this->reactor ()->register_handler(this,
+			ACE_Event_Handler::READ_MASK);
+}
+
+
+// Read one message from the peer and dispatch it.
+//
+// Sequence: read the message identifier, handle GADGET_MESSAGE_CLOSE, look up
+// the reader registered for the identifier and let it read the payload.
+// Configuration messages (file name or inline XML) configure the stream and
+// are consumed here; everything else is put on the stream.
+//
+// Returns -1 when the identifier cannot be read, 0 after a close message,
+// GADGET_FAIL on any other error and GADGET_OK otherwise.
+int GadgetStreamController::handle_input (ACE_HANDLE)
+{
+	//Reading sequence:
+	GadgetMessageIdentifier id;
+	ssize_t recv_cnt = 0;
+	if ((recv_cnt = peer().recv_n (&id, sizeof(GadgetMessageIdentifier))) <= 0) {
+		ACE_DEBUG ((LM_DEBUG,
+				ACE_TEXT ("(%P|%t) GadgetStreamController, unable to read message identifier\n")));
+		return -1;
+	}
+
+	if (id.id == GADGET_MESSAGE_CLOSE) {
+		GADGET_DEBUG1("Received close signal from client. Closing stream...\n");
+		stream_.close(1); //Shutdown gadgets and wait for them
+		GADGET_DEBUG1("Stream closed\n");
+		GADGET_DEBUG1("Closing writer task\n");
+		this->writer_task_.close(1);
+		GADGET_DEBUG1("Writer task closed\n");
+		return 0;
+	}
+
+	GadgetMessageReader* r = readers_.find(id.id);
+
+	if (!r) {
+		GADGET_DEBUG2("Unrecognized Message ID received: %d\n", id.id);
+		return GADGET_FAIL;
+	}
+
+	ACE_Message_Block* mb = r->read(&peer());
+
+	if (!mb) {
+		GADGET_DEBUG1("GadgetMessageReader returned null pointer\n");
+		return GADGET_FAIL;
+	}
+
+	//We need to handle some special cases to make sure that we can get a stream set up.
+	if (id.id == GADGET_MESSAGE_CONFIG_FILE) {
+		GadgetContainerMessage<GadgetMessageConfigurationFile>* cfgm =
+				AsContainerMessage<GadgetMessageConfigurationFile>(mb);
+
+		if (!cfgm) {
+			GADGET_DEBUG1("Failed to cast message block to configuration file\n");
+			mb->release();
+			return GADGET_FAIL;
+		}
+
+		if (this->configure_from_file(std::string(cfgm->getObjectPtr()->configuration_file)) != GADGET_OK) {
+			GADGET_DEBUG1("GadgetStream configuration failed\n");
+			mb->release();
+			return GADGET_FAIL;
+		}
+
+		//The configuration message is consumed here and never reaches the stream
+		mb->release();
+		return GADGET_OK;
+	}
+
+	if (id.id == GADGET_MESSAGE_CONFIG_SCRIPT) {
+		std::string xml_config(mb->rd_ptr(), mb->length());
+		if (this->configure(xml_config) != GADGET_OK) {
+			GADGET_DEBUG1("GadgetStream configuration failed\n");
+			mb->release();
+			return GADGET_FAIL;
+		}
+
+		mb->release();
+		return GADGET_OK;
+	}
+
+	//Any other message is handed to the gadget stream
+	if (stream_.put(mb) == -1) {
+		GADGET_DEBUG2("Failed to put stuff on stream, too long wait, %d\n",  ACE_OS::last_error () ==  EWOULDBLOCK);
+		mb->release();
+		return GADGET_FAIL;
+	}
+
+	return GADGET_OK;
+}
+
+
+// Queue an outgoing message block on the writer task and return putq's result.
+int GadgetStreamController::output_ready(ACE_Message_Block* mb) 
+{ 
+	return this->writer_task_.putq(mb);
+}
+
+
+
+// Tear down the controller: close the stream, unregister from the reactor,
+// flush the output queue, drop readers and DLL handles, then delete this
+// object. A close request for WRITE_MASK alone is ignored. Always returns 0.
+int GadgetStreamController::handle_close (ACE_HANDLE, ACE_Reactor_Mask mask)
+{
+	GADGET_DEBUG1("handle_close called\n");
+
+	// Closing only the write side does not shut the controller down
+	if (mask == ACE_Event_Handler::WRITE_MASK)
+		return 0;
+
+	GADGET_DEBUG1("Shutting down stream and closing up shop...\n");
+
+	this->stream_.close();
+
+	// DONT_CALL is set so that remove_handler does not call back into handle_close
+	mask = ACE_Event_Handler::ALL_EVENTS_MASK |
+			ACE_Event_Handler::DONT_CALL;
+
+	this->reactor ()->remove_handler (this, mask);
+
+	//Empty output queue in case there is something on it.
+	int messages_dropped = this->msg_queue ()->flush();
+
+	if (messages_dropped) {
+		GADGET_DEBUG2("Flushed %d messages from output queue\n", messages_dropped);
+		this->reactor ()->handle_events(); //Flush any remaining events before we delete this Stream Controller
+	}
+
+	// Remove all readers and writers
+	//writers_.clear();
+	readers_.clear();
+
+	//Clear DLL handles (to make DLLs unload if needed)
+	// NOTE(review): both preprocessor branches below make the identical call
+	for (size_t i = 0; i < dll_handles_.size(); i++) {
+#if defined WIN32
+		dll_handles_[i]->close(0); //On windows we will not unload the DLLs even when there are no more refs
+#else 
+		dll_handles_[i]->close(0); //On Unix/Mac it seems to be OK to do this
+#endif
+	}
+	dll_handles_.clear();
+
+	GADGET_DEBUG1("Stream is closed\n");
+
+	// The controller owns itself; no member may be touched after this line
+	delete this;
+	return 0;
+}
+
+// Look up a module on the stream by name and return its writer task as a
+// Gadget. Returns 0 when no module has that name, or when the writer task is
+// not a Gadget.
+Gadget* GadgetStreamController::find_gadget(std::string gadget_name)
+{
+	GadgetModule* module = stream_.find(gadget_name.c_str());
+
+	if (!module) {
+		GADGET_DEBUG2("Gadget with name %s not found! Returning null pointer\n", gadget_name.c_str());
+		return 0;
+	}
+
+	return dynamic_cast<Gadget*>(module->writer());
+}
+
+// Load the XML configuration $GADGETRON_HOME/config/<config_xml_filename>
+// and configure the stream from its contents.
+//
+// Returns GADGET_FAIL when GADGETRON_HOME is not set or the file cannot be
+// opened or read; otherwise returns the result of configure().
+int GadgetStreamController::configure_from_file(std::string config_xml_filename)
+{
+
+	const char * gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+	if (!gadgetron_home) {
+		GADGET_DEBUG1("GADGETRON_HOME environment variable is not set\n");
+		return GADGET_FAIL;
+	}
+
+	//Build the path in a std::string so that long paths cannot overflow a fixed buffer
+	std::string config_file_name(gadgetron_home);
+	config_file_name += "/config/";
+	config_file_name += config_xml_filename;
+
+	GADGET_DEBUG2("Running configuration: %s\n", config_file_name.c_str());
+
+	//Open positioned at the end so that tellg() reports the file size
+	std::ifstream file (config_file_name.c_str(), std::ios::in|std::ios::binary|std::ios::ate);
+	if (!file.is_open()) {
+		GADGET_DEBUG2("Unable to open configuation file: %s\n", config_file_name.c_str());
+		return GADGET_FAIL;
+	}
+
+	const std::streamoff file_size = file.tellg();
+	if (file_size < 0) {
+		GADGET_DEBUG2("Unable to determine size of configuration file: %s\n", config_file_name.c_str());
+		return GADGET_FAIL;
+	}
+
+	//Read straight into the string; it releases its storage on every return path
+	std::string xml_file_contents(static_cast<size_t>(file_size), '\0');
+	file.seekg (0, std::ios::beg);
+	if (!xml_file_contents.empty()) {
+		file.read (&xml_file_contents[0], static_cast<std::streamsize>(xml_file_contents.size()));
+	}
+
+	if (!file) {
+		GADGET_DEBUG2("Unable to read configuration file: %s\n", config_file_name.c_str());
+		return GADGET_FAIL;
+	}
+	file.close();
+
+	return configure(xml_file_contents);
+}
+
+// Configure the stream from an XML configuration string.
+//
+// The XML is validated against $GADGETRON_HOME/schema/gadgetron.xsd and then
+// processed in three steps: load and register the declared readers, load and
+// register the declared writers, and finally create the declared gadgets and
+// push them onto the stream in reverse order.
+//
+// Returns GADGET_OK on success and GADGET_FAIL when GADGETRON_HOME is not
+// set, the XML does not parse, a component cannot be loaded, or a gadget
+// cannot be pushed onto the stream.
+int GadgetStreamController::configure(std::string config_xml_string)
+{
+
+	const char * gadgetron_home = ACE_OS::getenv("GADGETRON_HOME");
+	if (!gadgetron_home) {
+		GADGET_DEBUG1("GADGETRON_HOME environment variable is not set\n");
+		return GADGET_FAIL;
+	}
+
+	//Build the (URL encoded) schema location in a std::string; no fixed size buffer involved
+	std::string schema_file_name(gadgetron_home);
+	schema_file_name += "/schema/gadgetron.xsd";
+	schema_file_name = url_encode(schema_file_name);
+
+	xml_schema::properties props;
+	props.schema_location (
+	  "http://gadgetron.sf.net/gadgetron",
+	  schema_file_name);
+
+	std::istringstream str_stream(config_xml_string, std::stringstream::in);
+	std::auto_ptr<gadgetron::gadgetronStreamConfiguration> cfg;
+
+	try {
+		cfg = std::auto_ptr<gadgetron::gadgetronStreamConfiguration>(gadgetron::gadgetronStreamConfiguration_(str_stream,0,props));
+		//cfg = std::auto_ptr<gadgetron::gadgetronStreamConfiguration>(gadgetron::gadgetronStreamConfiguration_(std::string(config_file_name)));
+	}  catch (const xml_schema::exception& e) {
+		GADGET_DEBUG2("Failed to parse Gadget Stream Configuration: %s\n", e.what());
+		return GADGET_FAIL;
+	}
+
+	//Counts are cast to int because they are printed with %d
+	GADGET_DEBUG2("Found %d readers\n", static_cast<int>(cfg->reader().size()));
+	GADGET_DEBUG2("Found %d writers\n", static_cast<int>(cfg->writer().size()));
+	GADGET_DEBUG2("Found %d gadgets\n", static_cast<int>(cfg->gadget().size()));
+
+	//Configuration of readers
+	for (gadgetron::gadgetronStreamConfiguration::reader_sequence::iterator i (cfg->reader().begin ()); i != cfg->reader().end(); ++i) {
+		long slot = 0;
+		std::string dllname("");
+		std::string classname("");
+
+		slot = i->slot();
+		dllname = i->dll();
+		classname = i->classname();
+
+		GADGET_DEBUG1("--Found reader declaration\n");
+		GADGET_DEBUG2("  Reader dll: %s\n", dllname.c_str());
+		GADGET_DEBUG2("  Reader class: %s\n", classname.c_str());
+		GADGET_DEBUG2("  Reader slot: %d\n", static_cast<int>(slot));
+
+		GadgetMessageReader* r =
+				load_dll_component<GadgetMessageReader>(dllname.c_str(),
+						classname.c_str());
+
+		if (!r) {
+			GADGET_DEBUG1("Failed to load GadgetMessageReader from DLL\n");
+			return GADGET_FAIL;
+		}
+
+		readers_.insert((unsigned short)slot, r);
+
+	}
+	//Configuration of readers end
+
+
+	//Configuration of writers
+	for (gadgetron::gadgetronStreamConfiguration::writer_sequence::iterator i (cfg->writer().begin ()); i != cfg->writer().end(); ++i) {
+		long slot = 0;
+		std::string dllname("");
+		std::string classname("");
+
+		slot = i->slot();
+		dllname = i->dll();
+		classname = i->classname();
+
+		GADGET_DEBUG1("--Found writer declaration\n");
+		GADGET_DEBUG2("  Writer dll: %s\n", dllname.c_str());
+		GADGET_DEBUG2("  Writer class: %s\n", classname.c_str());
+		GADGET_DEBUG2("  Writer slot: %d\n", static_cast<int>(slot));
+
+		GadgetMessageWriter* w =
+				load_dll_component<GadgetMessageWriter>(dllname.c_str(),
+						classname.c_str());
+
+		if (!w) {
+			GADGET_DEBUG1("Failed to load GadgetMessageWriter from DLL\n");
+			return GADGET_FAIL;
+		}
+
+		writer_task_.register_writer(slot, w);
+	}
+	//Configuration of writers end
+
+	//Let's configure the stream
+	//Gadgets are pushed in reverse order of declaration
+	GADGET_DEBUG2("Processing %d gadgets in reverse order\n", static_cast<int>(cfg->gadget().size()));
+	for (gadgetron::gadgetronStreamConfiguration::gadget_sequence::reverse_iterator i (cfg->gadget().rbegin ()); i != cfg->gadget().rend(); ++i) {
+		std::string gadgetname("");
+		std::string dllname("");
+		std::string classname("");
+
+		gadgetname = i->name();
+		dllname = i->dll();
+		classname = i->classname();
+
+		GADGET_DEBUG1("--Found gadget declaration\n");
+		GADGET_DEBUG2("  Gadget Name: %s\n", gadgetname.c_str());
+		GADGET_DEBUG2("  Gadget dll: %s\n", dllname.c_str());
+		GADGET_DEBUG2("  Gadget class: %s\n", classname.c_str());
+
+		GadgetModule* m = create_gadget_module(dllname.c_str(),
+				classname.c_str(),
+				gadgetname.c_str());
+
+		if (!m) {
+			GADGET_DEBUG2("Failed to create GadgetModule from %s:%s\n",
+					classname.c_str(),
+					dllname.c_str());
+			return GADGET_FAIL;
+		}
+
+		Gadget* g = dynamic_cast<Gadget*>(m->writer());//Get the gadget out of the module
+
+		//Guard the dereference below in case the writer task is not a Gadget
+		if (!g) {
+			GADGET_DEBUG2("Module %s does not contain a Gadget\n", gadgetname.c_str());
+			delete m;
+			return GADGET_FAIL;
+		}
+
+		GADGET_DEBUG2("  Gadget parameters: %d\n", static_cast<int>(i->property().size()));
+		for (gadgetron::gadget::property_sequence::iterator p (i->property().begin()); p != i->property().end(); ++p) {
+			std::string pname(p->name());
+			std::string pval(p->value());
+			GADGET_DEBUG2("Setting parameter %s = %s\n", pname.c_str(),pval.c_str());
+			g->set_parameter(pname.c_str(),pval.c_str(),false);
+		}
+
+		if (stream_.push(m) < 0) {
+			GADGET_DEBUG2("Failed to push Gadget %s onto stream\n", gadgetname.c_str());
+			delete m;
+			return GADGET_FAIL;
+		}
+
+	}
+
+	GADGET_DEBUG1("Gadget Stream configured\n");
+	stream_configured_ = true;
+
+	return GADGET_OK;
+}
+
+// Create the named gadget class from the given DLL, attach this controller
+// to it and wrap it in a module called gadget_module_name. Returns 0 when the
+// gadget cannot be loaded or the module cannot be allocated.
+GadgetModule * GadgetStreamController::create_gadget_module(const char* DLL, 
+		const char* gadget,
+		const char* gadget_module_name)
+{
+	Gadget* instance = load_dll_component<Gadget>(DLL,gadget);
+	if (instance == 0) {
+		GADGET_DEBUG1("Failed to load gadget using factory\n");
+		return 0;
+	}
+
+	instance->set_controller(this);
+
+	GadgetModule *wrapped = 0;
+	ACE_NEW_RETURN (wrapped,
+			GadgetModule (gadget_module_name, instance),
+			0);
+	return wrapped;
+}
+
+
+// Load a component of type T from a shared library.
+//
+// The library name is ACE_DLL_PREFIX + DLL (with a trailing "d" for Windows
+// debug builds) and the component is created by calling the factory function
+// "make_<component_name>" exported by that library. The DLL handle is kept in
+// dll_handles_ so that it can be closed in handle_close.
+//
+// Returns 0 when the library cannot be opened, the factory symbol cannot be
+// found, or the factory returns a null pointer.
+template <class T>  
+T* GadgetStreamController::load_dll_component(const char* DLL, const char* component_name)
+{
+	ACE_DLL_Manager* dllmgr = ACE_DLL_Manager::instance();
+
+	ACE_DLL_Handle* dll = 0;
+	ACE_SHLIB_HANDLE dll_handle = 0;
+
+	//Names are built in std::string so that long names cannot overflow a fixed buffer
+	std::string dllname(ACE_DLL_PREFIX);
+	dllname += DLL;
+#if defined(WIN32) && defined(_DEBUG)
+	dllname += "d";
+#endif
+
+	std::string factoryname("make_");
+	factoryname += component_name;
+
+	dll = dllmgr->open_dll (dllname.c_str(), ACE_DEFAULT_SHLIB_MODE, dll_handle );
+
+	if (!dll) {
+		GADGET_DEBUG1("Failed to load DLL, Possible reasons: \n");
+		GADGET_DEBUG1("   * Name of DLL is wrong in XML file \n");
+		GADGET_DEBUG1("   * Path of DLL is not in your DLL search path (LD_LIBRARY_PATH on Unix)\n");
+		GADGET_DEBUG1("   * Path of other DLLs that this DLL depends on is not in the search path\n");
+		return 0;
+	}
+
+	dll_handles_.push_back(dll);
+
+	//Function pointer
+	typedef T* (*ComponentCreator) (void);
+
+	//Go through an integer type to convert the object pointer into a function pointer
+	void *void_ptr = dll->symbol (factoryname.c_str());
+	ptrdiff_t tmp = reinterpret_cast<ptrdiff_t> (void_ptr);
+	ComponentCreator cc = reinterpret_cast<ComponentCreator> (tmp);
+
+	if (cc == 0) {
+		GADGET_DEBUG2("Failed to load factory (%s) from DLL (%s)\n", factoryname.c_str(), dllname.c_str());
+		return 0;
+	}
+
+	T* c = cc();
+
+	if (!c) {
+		GADGET_DEBUG1("Failed to create component using factory\n");
+		return 0;
+	}
+
+	return c;
+}
diff --git a/toolboxes/gadgettools/GadgetStreamController.h b/toolboxes/gadgettools/GadgetStreamController.h
new file mode 100644
index 0000000..912f54d
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetStreamController.h
@@ -0,0 +1,559 @@
+#ifndef GADGETSTREAMCONTROLLER_H
+#define GADGETSTREAMCONTROLLER_H
+
+#include "ace/Log_Msg.h"
+#include "ace/Reactor.h"
+#include "ace/SOCK_Stream.h"
+#include "ace/Stream.h"
+#include "ace/Message_Queue.h"
+#include "ace/Svc_Handler.h"
+#include "ace/Reactor_Notification_Strategy.h"
+
+#include <complex>
+#include <vector>
+#include "boost/tuple/tuple.hpp"
+#include "boost/tuple/tuple_comparison.hpp"
+#include "boost/tuple/tuple_io.hpp"
+
+#include "gadgettools_export.h"
+#include "Gadgetron.h"
+#include "Gadget.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronConnector.h"
+#include "GadgetImageMessageReader.h"
+#include "GadgetImageMessageWriter.h"
+
+typedef ACE_Module<ACE_MT_SYNCH> GadgetModule;
+
+namespace Gadgetron{
+
+// Service handler for one client connection. It reads messages from the
+// socket, configures a stream of gadgets from an XML description and hands
+// outgoing messages to a writer task. The object deletes itself in
+// handle_close.
+class EXPORTGADGETTOOLS GadgetStreamController 
+    : public ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_MT_SYNCH>
+{
+public:
+    // Initializers are listed in member declaration order, which is the order
+    // in which they actually run.
+    GadgetStreamController()
+        : stream_configured_(false)
+        , writer_task_(&this->peer())
+        , notifier_ (0, this, ACE_Event_Handler::WRITE_MASK)
+    { }
+
+    virtual ~GadgetStreamController()
+    { 
+        //ACE_DEBUG( (LM_INFO, ACE_TEXT("~GadgetStreamController() called\n")) );
+    }
+
+    //ACE_SOCK_Stream &peer (void) { return this->sock_; }
+
+    // Set up queue notification, the basic readers and the stream, then
+    // register with the reactor for read events.
+    int open (void);
+
+    /*
+    virtual ACE_HANDLE get_handle (void) const { 
+    return this->sock_.get_handle (); 
+    }
+    */
+
+    // Read and dispatch one message from the peer.
+    virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+    //virtual int handle_output (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+
+    // Shut the stream down, release all resources and delete this object.
+    virtual int handle_close (ACE_HANDLE handle,
+        ACE_Reactor_Mask close_mask);
+
+    // Queue an outgoing message on the writer task.
+    virtual int output_ready(ACE_Message_Block* mb);
+
+    // Find a gadget on the stream by module name; returns 0 when not found.
+    virtual Gadget* find_gadget(std::string gadget_name);
+
+private:
+    ACE_Stream<ACE_MT_SYNCH> stream_;
+    bool stream_configured_;   // set to true once configure() has succeeded
+    WriterTask writer_task_;
+
+    ACE_Reactor_Notification_Strategy notifier_;
+
+    GadgetMessageReaderContainer readers_;
+
+    // Handles of the DLLs opened by load_dll_component
+    std::vector<ACE_DLL_Handle*> dll_handles_;
+
+    // Configure the stream from an XML string / from a file below
+    // $GADGETRON_HOME/config.
+    virtual int configure(std::string config_xml_string);
+    virtual int configure_from_file(std::string config_xml_filename);
+
+    // Load a gadget from a DLL and wrap it in a module.
+    virtual GadgetModule * create_gadget_module(const char* DLL, const char* gadget, const char* gadget_module_name);
+
+    // Create a component of type T through the "make_<component_name>"
+    // factory function of the given DLL.
+    template <class T>  T* load_dll_component(const char* DLL, const char* component_name);
+
+};
+
+//template<typename JobType> 
+//class GadgetCloudController : public ACE_Task<ACE_MT_SYNCH>
+//{
+//public:
+//
+//    typedef boost::tuple<std::string, std::string, std::string> CloudNodeType;
+//    typedef std::vector<CloudNodeType> CloudType;
+//
+//    GadgetCloudController();
+//    virtual ~GadgetCloudController();
+//
+//    // this GadgetCloudController runs in the passive mode
+//    virtual int open(void* = 0);
+//
+//    virtual int close(unsigned long flags);
+//
+//    // create connector and register the reader and writer for every connector
+//    int createConnector(const CloudType& cloud, 
+//        size_t msgID_reader, std::vector<GadgetMessageReader*>& readers, 
+//        size_t msgID_writer, std::vector<GadgetMessageWriter*>& writers);
+//
+//    // connect to the cloud host, need to call createConnector first
+//    // hostnames: the host name or IP addresses for every node
+//    // port_nos: port number for every node
+//    // xmlfiles: the xml configuration file name sent to every node
+//    int connectToCloud(const CloudType& cloud);
+//
+//    // send jobs to the node and wait for jobs to be returned
+//    // for every job, the node id identify which nodes to send this job
+//    // after sending all jobs, this call will block until all jobs are returned
+//    int runJobsOnCloud(const std::vector<int>& node_ids);
+//
+//    // should be called after calling runJobsOnCloud
+//    int waitForJobToComplete();
+//
+//    // wait for all jobs to come back
+//    // all returned jobs will be put into the completed_job_list_
+//    // this function will not return until all jobs are returned
+//    virtual int svc(void);
+//
+//    // list to store jobs sent to nodes
+//    std::vector<JobType*> job_list_;
+//    // list to store completed jobs from the nodes
+//    std::vector<JobType*> completed_job_list_;
+//
+//private:
+//
+//    // connector to every node
+//    // one connector for a node
+//    // node id starts from 0, and increase by 1
+//    std::vector<GadgetronCloudConnector<JobType>* > cloud_connectors_;
+//
+//    size_t cloud_msg_id_reader_;
+//    size_t cloud_msg_id_writer_;
+//
+//    // number of available nodes in the cloud
+//    unsigned int number_of_nodes_;
+//
+//    // node status, 0/-1 : available/unavailable
+//    std::vector<int> node_status_;
+//
+//    // job status, 0/-1 : completed/not completed
+//    std::vector<int> job_status_;
+//
+//    // a condition variable to wake up the caller thread
+//    ACE_Thread_Mutex mutex;
+//    ACE_Condition_Thread_Mutex* cond_;
+//
+//    ACE_Reactor gt_cloud_rector_;
+//};
+//
+//template <typename JobType> 
+//GadgetCloudController<JobType>::GadgetCloudController() : cloud_msg_id_reader_(GADGET_MESSAGE_CLOUD_JOB), cloud_msg_id_writer_(GADGET_MESSAGE_CLOUD_JOB)
+//{
+//    cond_ = new ACE_Condition_Thread_Mutex(mutex, "GadgetCloudController");
+//}
+//
+//template <typename JobType> 
+//GadgetCloudController<JobType>::~GadgetCloudController()
+//{
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::open(void* p)
+//{
+//    ACE_TRACE(( ACE_TEXT("GadgetCloudController::open") ));
+//
+//    this->reactor(&gt_cloud_rector_);
+//
+//    //if (!this->reactor())
+//    //{
+//    //    ACE_DEBUG((LM_INFO, ACE_TEXT("Setting reactor")));
+//    //    this->reactor(ACE_Reactor::instance());
+//    //}
+//
+//    return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::close(unsigned long flags)
+//{
+//    int rval = 0;
+//    if (flags == 1)
+//    {
+//        ACE_Message_Block *hangup = new ACE_Message_Block();
+//        hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+//        if (this->putq(hangup) == -1) {
+//            hangup->release();
+//            ACE_ERROR_RETURN( (LM_ERROR,
+//                    ACE_TEXT("%p\n"),
+//                    ACE_TEXT("GadgetCloudController::close, putq")),
+//                    -1);
+//        }
+//        rval = this->wait();
+//    }
+//    return rval;
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::createConnector(const CloudType& cloud, 
+//    size_t msgID_reader, std::vector<GadgetMessageReader*>& readers, 
+//    size_t msgID_writer, std::vector<GadgetMessageWriter*>& writers)
+//{
+//    number_of_nodes_ = cloud.size();
+//
+//    if ( readers.size() != number_of_nodes_ ) return -1;
+//    if ( writers.size() != number_of_nodes_ ) return -1;
+//
+//    cloud_connectors_.resize(number_of_nodes_, NULL);
+//    node_status_.resize(number_of_nodes_, -1);
+//
+//    cloud_msg_id_reader_ = msgID_reader;
+//    cloud_msg_id_writer_ = msgID_writer;
+//
+//    unsigned int ii;
+//    for( ii=0; ii<number_of_nodes_; ii++ )
+//    {
+//        GadgetronCloudConnector<JobType>* con;
+//        ACE_NEW_RETURN (con, GadgetronCloudConnector<JobType>, -1);
+//        cloud_connectors_[ii] = con;
+//
+//        cloud_connectors_[ii]->register_reader(cloud_msg_id_reader_, readers[ii] );
+//        cloud_connectors_[ii]->register_writer(cloud_msg_id_writer_, writers[ii] );
+//
+//        cloud_connectors_[ii]->set_cloud_controller(this);
+//    }
+//
+//    return 0;
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::
+//connectToCloud(const CloudType& cloud)
+//{
+//    number_of_nodes_ = cloud.size();
+//    if ( cloud_connectors_.size() != number_of_nodes_ ) return -1;
+//
+//    unsigned int ii;
+//    for( ii=0; ii<number_of_nodes_; ii++ )
+//    {
+//        if ( cloud_connectors_[ii] == NULL ) return -1;
+//
+//        // if ( cloud_connectors_[ii].open(hostnames[ii], port_nos[ii])!=0 )
+//        if ( cloud_connectors_[ii]->open(cloud[ii].get<0>(), cloud[ii].get<1>())!=0 )
+//        {
+//            ACE_DEBUG(( LM_ERROR, ACE_TEXT("(%p) Open connection to %s:%s failed ... \n"), cloud[ii].get<0>().c_str(), cloud[ii].get<1>().c_str()));
+//        }
+//        else
+//        {
+//            node_status_[ii] = 0;
+//
+//            // send the xml file
+//            if (cloud_connectors_[ii]->send_gadgetron_configuration_file(cloud[ii].get<2>()) != 0)
+//            {
+//                ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send XML configuration to the Gadgetron cloud host %s:%s \n"), cloud[ii].get<0>().c_str(), cloud[ii].get<1>().c_str()));
+//                return -1;
+//            }
+//        }
+//    }
+//
+//    bool hasGoodNode = false;
+//    for( ii=0; ii<number_of_nodes_; ii++ )
+//    {
+//        if ( node_status_[ii] == 0 )
+//        {
+//            hasGoodNode = true;
+//            break;
+//        }
+//    }
+//
+//    if ( !hasGoodNode )
+//    {
+//        ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to find even one good node ... \n")));
+//        return -1;
+//    }
+//
+//    return 0;
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::
+//runJobsOnCloud(const std::vector<int>& node_ids)
+//{
+//    ACE_DEBUG((LM_INFO, ACE_TEXT("(%t) GadgetCloudController : into runJobsOnCloud(...) ... \n")));
+//
+//    if ( job_list_.empty() )
+//    {
+//        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list is empty ... \n")));
+//        return -1;
+//    }
+//
+//    if ( completed_job_list_.empty() )
+//    {
+//        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : completed job list is empty ... \n")));
+//        return -1;
+//    }
+//
+//    if ( job_list_.size() != completed_job_list_.size() )
+//    {
+//        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list size does not match ... \n")));
+//        return -1;
+//    }
+//
+//    if ( job_list_.size() != node_ids.size() )
+//    {
+//        ACE_DEBUG((LM_ERROR, ACE_TEXT("GadgetCloudController : job list size does not match the node id size ... \n")));
+//        return -1;
+//    }
+//
+//    std::vector<int> node_ids_used(node_ids);
+//
+//    unsigned int numOfJobs = job_list_.size();
+//    job_status_.resize(numOfJobs, -1);
+//
+//    unsigned int ii;
+//    for( ii=0; ii<numOfJobs; ii++ )
+//    {
+//        int nodeID = node_ids_used[ii];
+//        if ( nodeID == -1 )
+//        {
+//            job_status_[ii] = 0;
+//            continue;
+//        }
+//
+//        if ( nodeID > number_of_nodes_ )
+//        {
+//            nodeID %= number_of_nodes_;
+//        }
+//
+//        while ( node_status_[nodeID] < 0 )
+//        {
+//            nodeID--;
+//            if ( nodeID == 0 ) nodeID = number_of_nodes_;
+//        }
+//
+//        if ( nodeID != node_ids_used[ii] ) node_ids_used[ii] = nodeID;
+//
+//        // send job to a node
+//        GadgetContainerMessage<GadgetMessageIdentifier>* m1 =
+//                new GadgetContainerMessage<GadgetMessageIdentifier>();
+//
+//        m1->getObjectPtr()->id = cloud_msg_id_writer_;
+//
+//        GadgetContainerMessage<int>* m2 =
+//                new GadgetContainerMessage<int>();
+//
+//        *(m2->getObjectPtr()) = ii;
+//
+//        GadgetContainerMessage<JobType>* m3 =
+//                new GadgetContainerMessage<JobType>();
+//
+//        *(m3->getObjectPtr()) = *(job_list_[ii]);
+//        m1->cont(m2);
+//        m2->cont(m3);
+//
+//        if ( node_status_[nodeID] == 0 )
+//        {
+//            if (cloud_connectors_[nodeID]->putq(m1) == -1)
+//            {
+//                ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send job package %d on queue for node %d \n"), ii, nodeID));
+//                return -1;
+//            }
+//            else
+//            {
+//                ACE_DEBUG((LM_INFO, ACE_TEXT("Send job %d to node %d ... \n"), ii, nodeID));
+//            }
+//        }
+//    }
+//
+//    std::vector<bool> closeMsgSent(number_of_nodes_, false);
+//    for( ii=0; ii<numOfJobs; ii++ )
+//    {
+//        unsigned int nodeID = node_ids_used[ii];
+//
+//        if ( !closeMsgSent[nodeID] )
+//        {
+//            closeMsgSent[nodeID] = true;
+//
+//            // send the close message for this node
+//            GadgetContainerMessage<GadgetMessageIdentifier>* m = new GadgetContainerMessage<GadgetMessageIdentifier>();
+//            m->getObjectPtr()->id = GADGET_MESSAGE_CLOSE;
+//
+//            if (cloud_connectors_[nodeID]->putq(m) == -1)
+//            {
+//                ACE_DEBUG((LM_ERROR, ACE_TEXT("Unable to send CLOSE package on queue for node %d \n"), nodeID));
+//                return -1;
+//            }
+//        }
+//    }
+//
+//    ACE_DEBUG((LM_INFO, ACE_TEXT("GadgetCloudController thread - all jobs sent ... \n")));
+//
+//    // block the caller thread
+//    // cond_->wait();
+//
+//    // ACE_DEBUG((LM_INFO, ACE_TEXT("GadgetCloudController thread wakes up ... \n")));
+//
+//    return 0;
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::waitForJobToComplete()
+//{
+//    // block the caller thread
+//    ACE_DEBUG((LM_INFO, ACE_TEXT("(%t) GadgetCloudController thread sleeps ... \n")));
+//    // int ret = cond_->wait();
+//
+//    ACE_Message_Block *mb = 0;
+//    ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+//
+//    //collect an incoming package if we have one
+//    while (this->getq (mb) != -1)
+//    {
+//        GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+//            AsContainerMessage<GadgetMessageIdentifier>(mb);
+//
+//        if (!mid)
+//        {
+//            ACE_DEBUG ((LM_ERROR, ACE_TEXT ("Invalid message on GadgetCloudController queue\n")));
+//            mb->release();
+//            cond_->signal();
+//            return -1;
+//        }
+//
+//        //Is this a shutdown message?
+//        if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE)
+//        {
+//            cond_->signal();
+//            return 0;
+//        }
+//
+//        if (mid->getObjectPtr()->id == cloud_msg_id_reader_)
+//        {
+//            GadgetContainerMessage<int>* m_jobID =
+//                AsContainerMessage<int>(mid->cont());
+//
+//            int jobID = *(m_jobID->getObjectPtr());
+//
+//            GadgetContainerMessage<JobType>* job =
+//                AsContainerMessage<JobType>(mid->cont()->cont());
+//
+//            *(completed_job_list_[jobID]) = *(job->getObjectPtr());
+//            job_status_[jobID] = 0;
+//        }
+//
+//        mb->release();
+//
+//        // if all jobs are received, notice the caller thread
+//        bool allJobProcessed = true;
+//        for ( unsigned int ii=0; ii<job_status_.size(); ii++ )
+//        {
+//            if ( job_status_[ii] != 0 )
+//            {
+//                allJobProcessed = false;
+//                break;
+//            }
+//        }
+//
+//        if ( allJobProcessed )
+//        {
+//            ACE_DEBUG ((LM_INFO, ACE_TEXT ("All jobs are completed and returned on GadgetCloudController queue\n")));
+//            break;
+//        }
+//    }
+//
+//    ACE_DEBUG((LM_INFO, ACE_TEXT("(%t) GadgetCloudController thread wakes up ... \n")));
+//    return 0;
+//}
+//
+//template <typename JobType> 
+//int GadgetCloudController<JobType>::svc(void)
+//{
+//    ACE_DEBUG((LM_INFO, ACE_TEXT("(%t) Into GadgetCloudController svc() ... \n")));
+//
+//    this->reactor()->owner(ACE_Thread::self ());//, &old_owner);
+//
+//    this->reactor()->reset_event_loop();
+//
+//    ACE_Time_Value initialDelay (3);
+//    ACE_Time_Value interval (0,100);
+//
+//    //Handle the events
+//    this->reactor()->run_reactor_event_loop();
+//
+//    //this->reactor()->owner(&old_owner);
+//
+//    ACE_DEBUG ((LM_INFO, ACE_TEXT ("(%P|%t) GadgetronConnector svc done...\n")));
+//
+//    //ACE_Message_Block *mb = 0;
+//    //ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+//
+//    ////collect an incoming package if we have one
+//    //while (this->getq (mb) != -1)
+//    //{
+//    //    GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+//    //            AsContainerMessage<GadgetMessageIdentifier>(mb);
+//
+//    //    if (!mid)
+//    //    {
+//    //        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("Invalid message on GadgetCloudController queue\n")));
+//    //        mb->release();
+//    //        cond_->signal();
+//    //        return -1;
+//    //    }
+//
+//    //    //Is this a shutdown message?
+//    //    if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE)
+//    //    {
+//    //        cond_->signal();
+//    //        return 0;
+//    //    }
+//
+//    //    if (mid->getObjectPtr()->id == cloud_msg_id_reader_)
+//    //    {
+//    //        GadgetContainerMessage<int>* m_jobID =
+//    //            AsContainerMessage<int>(mid->cont());
+//
+//    //        int jobID = *(m_jobID->getObjectPtr());
+//
+//    //        GadgetContainerMessage<JobType>* job =
+//    //            AsContainerMessage<JobType>(mid->cont()->cont());
+//
+//    //        *(completed_job_list_[jobID]) = *(job->getObjectPtr());
+//    //        job_status_[jobID] = 0;
+//    //    }
+//
+//    //    mb->release();
+//
+//    //    // if all jobs are received, notice the caller thread
+//    //    bool allJobProcessed = true;
+//    //    for ( unsigned int ii=0; ii<job_status_.size(); ii++ )
+//    //    {
+//    //        if ( job_status_[ii] != 0 )
+//    //        {
+//    //            allJobProcessed = false;
+//    //            break;
+//    //        }
+//    //    }
+//
+//    //    if ( allJobProcessed )
+//    //    {
+//    //        ACE_DEBUG ((LM_INFO, ACE_TEXT ("All jobs are completed and returned on GadgetCloudController queue\n")));
+//    //        break;
+//    //    }
+//    //}
+//
+//    //// notice the caller thread
+//    //ACE_DEBUG((LM_INFO, ACE_TEXT("Wake up GadgetCloudController thread ... \n")));
+//
+//    //cond_->signal();
+//
+//    return 0;
+//}
+
+}
+#endif //GADGETSTREAMCONTROLLER_H
diff --git a/toolboxes/gadgettools/GadgetronCloudConnector.h b/toolboxes/gadgettools/GadgetronCloudConnector.h
new file mode 100644
index 0000000..bd50e68
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronCloudConnector.h
@@ -0,0 +1,580 @@
+
+#pragma once
+
+#include <ace/Svc_Handler.h>
+#include <ace/Reactor.h>
+#include <ace/SOCK_Stream.h>
+#include <ace/SOCK_Connector.h>
+#include <ace/Reactor_Notification_Strategy.h>
+#include <string>
+#include "GadgetronSlotContainer.h"
+#include "GadgetMessageInterface.h"
+#include "GadgetronConnector.h"
+#include "gadgettools_export.h"
+#include "GadgetMRIHeaders.h"
+
+#define GADGETRON_TIMEOUT_PERIOD 1.0
+
+namespace Gadgetron
+{
+
+template<typename JobType> class GadgetCloudController;
+template<typename JobType> class GadgetronCloudConnector;
+
+// Writer task of a GadgetronCloudConnector. It drains its message queue,
+// writes every queued message to the node socket through the registered
+// writers, and tells the owning connector when a write fails so that the
+// affected job can be marked as completed.
+template<typename JobType> 
+class CloudWriterTask : public WriterTask
+{
+
+public:
+    typedef WriterTask inherited;
+
+    CloudWriterTask(ACE_SOCK_Stream* socket)
+    : inherited(socket), cloud_connector_(NULL)
+    {
+    }
+
+    virtual ~CloudWriterTask()
+    {
+    }
+
+    // Sets the connector that is notified when writing a job fails.
+    void set_cloud_connector(GadgetronCloudConnector<JobType>* connector)
+    {
+        cloud_connector_ = connector;
+    }
+
+    // Thread body: handles queued messages one by one until getq() fails,
+    // a close message has been sent (returns 0) or a message could not be
+    // handled (returns -1 after sleeping for GADGETRON_TIMEOUT_PERIOD).
+    virtual int svc(void)
+    {
+        ACE_Message_Block* mb = 0;
+        while (this->getq (mb) != -1)
+        {
+            int retval = this->svcImpl(mb);
+
+            if ( retval == 2 )
+            {
+                GADGET_DEBUG1("CloudWriterTask quit\n");
+                return 0;
+            }
+
+            if ( retval == -1 )
+            {
+                GADGET_DEBUG1("CloudWriterTask svcImpl failed ... \n");
+                ACE_OS::sleep(ACE_Time_Value(GADGETRON_TIMEOUT_PERIOD));
+                return -1;
+            }
+        }
+
+        return 0;
+    }
+
+    // Handles a single queued message and always releases it.
+    // Returns 0 when the message was written, 2 when it was the close
+    // message (forwarded to the peer) and -1 on any failure.
+    virtual int svcImpl(ACE_Message_Block* mb)
+    {
+        //Send a package if we have one
+        GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+                AsContainerMessage<GadgetMessageIdentifier>(mb);
+
+        if (!mid)
+        {
+            ACE_DEBUG ((LM_ERROR, ACE_TEXT ("Invalid message on output queue\n")));
+            mb->release();
+            return -1;
+        }
+
+        //Is this a shutdown message?
+        if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE)
+        {
+            socket_->send_n(mid->getObjectPtr(),sizeof(GadgetMessageIdentifier));
+            // mid points into mb, so release only after the identifier was sent;
+            // nothing else refers to the message once svc() stops.
+            mb->release();
+            GADGET_DEBUG1("CloudWriterTask done\n");
+            return 2;
+        }
+
+        GadgetMessageWriter* w = writers_.find(mid->getObjectPtr()->id);
+
+        if (!w)
+        {
+            ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unrecognized Message ID received: %d\n"),mid->getObjectPtr()->id));
+            mb->release();
+            return -1;
+        }
+
+        if (w->write(socket_,mb->cont()) < 0)
+        {
+            ACE_DEBUG ( (LM_DEBUG, ACE_TEXT ("(%P|%t) Failed to write message to Gadgetron\n")) );
+
+            // notice the controller (only possible once a connector has been set)
+            if ( cloud_connector_ )
+            {
+                GadgetContainerMessage<int>* m1 = 
+                    dynamic_cast< GadgetContainerMessage<int>* >(mb->cont());
+
+                if ( m1 )
+                {
+                    // the job index travels right behind the identifier
+                    int jobID = *(m1->getObjectPtr());
+                    cloud_connector_->setJobTobeCompletedAndNoticeController(jobID);
+                }
+                else
+                {
+                    // no job index available: use the connector's default (-1)
+                    cloud_connector_->setJobTobeCompletedAndNoticeController();
+                }
+            }
+
+            mb->release ();
+            return -1;
+        }
+
+        mb->release();
+
+        GADGET_DEBUG1("--> CloudWriterTask, write msg through socket done ... \n");
+
+        return 0;
+    }
+
+protected:
+    // Connector notified on write failures; NULL until set_cloud_connector() is called.
+    GadgetronCloudConnector<JobType>* cloud_connector_;
+};
+
+// Reader task of a GadgetronCloudConnector. Its thread reads messages from
+// the connector's socket, decodes them with the registered readers and hands
+// them to the connector's process() method.
+template<typename JobType> 
+class CloudReaderTask : public ACE_Task<ACE_MT_SYNCH>
+{
+
+public:
+    typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+    CloudReaderTask(ACE_SOCK_Stream* socket) : inherited(), socket_(socket), cloud_connector_(NULL)
+    {
+    }
+
+    virtual ~CloudReaderTask()
+    {
+        readers_.clear();
+    }
+
+    virtual int init(void)
+    {
+        ACE_TRACE(( ACE_TEXT("CloudReaderTask::init") ));
+        return 0;
+    }
+
+    // Starts the single reader thread.
+    virtual int open(void* = 0)
+    {
+        GADGET_DEBUG1("CloudReaderTask::open\n");
+        return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+    }
+
+    // Sets the connector whose socket is read and which receives the messages.
+    void set_cloud_connector(GadgetronCloudConnector<JobType>* connector)
+    {
+        cloud_connector_ = connector;
+    }
+
+    // Registers the reader used for messages carrying the id 'slot'.
+    int register_reader(size_t slot, GadgetMessageReader* reader)
+    {
+        return readers_.insert(slot,reader);
+    }
+
+    // With flags == 1 this waits for the reader thread to finish; any other
+    // value is a no-op. No hang-up message is queued: the thread blocks on
+    // the socket, not on the message queue.
+    virtual int close(unsigned long flags)
+    {
+        GADGET_DEBUG1("CloudReaderTask::close\n");
+        int rval = 0;
+        if (flags == 1) {
+            rval = this->wait();
+        }
+        return rval;
+    }
+
+    // Thread body: reads identifier + payload in a loop until a close message
+    // arrives (returns 0) or an error occurs (returns -1). On errors the
+    // connector is flagged as down and the controller is notified.
+    virtual int svc(void)
+    {
+        if ( !cloud_connector_ )
+        {
+            // nothing to read from and nobody to notify
+            ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, CloudReaderTask, cloud connector is not set\n")) );
+            return -1;
+        }
+
+        GadgetMessageIdentifier mid;
+
+        for (;;)
+        {
+            if (cloud_connector_->peer().recv_n(&mid, sizeof(GadgetMessageIdentifier)) <= 0)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, CloudReaderTask, failed to read message identifier\n")) );
+                return this->failAndNotify(true);
+            }
+
+            //Is this a shutdown message?
+            if (mid.id == GADGET_MESSAGE_CLOSE)
+            {
+                ACE_DEBUG( (LM_INFO, ACE_TEXT("%P, %l, CloudReaderTask, Close Message received\n")) );
+                return 0;
+            }
+
+            GadgetMessageReader* r = readers_.find(mid.id);
+            if (r == 0)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, CloudReaderTask, Unknown message id %d received\n"), mid.id) );
+                return this->failAndNotify(false);
+            }
+
+            ACE_Message_Block* mb = r->read(&cloud_connector_->peer());
+
+            if (!mb)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, CloudReaderTask, Failed to read message\n")) );
+                return this->failAndNotify(true);
+            }
+
+            if (cloud_connector_->process(mid.id, mb) < 0)
+            {
+                ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, ReaderTask, Failed to process message\n")) );
+                return this->failAndNotify(false);
+            }
+        }
+    }
+
+protected:
+
+    // Common failure path of svc(): optionally sleeps for
+    // GADGETRON_TIMEOUT_PERIOD, marks the connector as down and notifies the
+    // controller. Always returns -1.
+    int failAndNotify(bool sleep_first)
+    {
+        if ( sleep_first )
+        {
+            ACE_OS::sleep(ACE_Time_Value(GADGETRON_TIMEOUT_PERIOD));
+        }
+
+        cloud_connector_->set_status(false);
+        cloud_connector_->setJobTobeCompletedAndNoticeController();
+        return -1;
+    }
+
+    ACE_SOCK_Stream* socket_;
+    GadgetronSlotContainer<GadgetMessageReader> readers_;
+    // NULL until set_cloud_connector() is called
+    GadgetronCloudConnector<JobType>* cloud_connector_;
+};
+
+// Connection to one node of the cloud. It owns the socket plus a writer task
+// (sends jobs) and a reader task (receives results) and forwards received
+// messages to the GadgetCloudController.
+template<typename JobType> 
+class GadgetronCloudConnector
+{
+public:
+
+    GadgetronCloudConnector();
+    virtual ~GadgetronCloudConnector();
+
+    // Connects the socket to hostname:port (no threads are started).
+    int openImpl (std::string hostname, std::string port);
+    // Connects and starts the writer/reader tasks; the outcome is reported through status().
+    int open (std::string hostname, std::string port);
+
+    // Puts a received message on the controller's queue.
+    virtual int process(size_t messageid, ACE_Message_Block* mb);
+
+    void set_cloud_controller(GadgetCloudController<JobType>* controller);
+
+    // if jobID==-1, all jobs for this node is set to be completed
+    int setJobTobeCompletedAndNoticeController(int jobID=-1);
+
+    // Queues a message for the writer task.
+    virtual int putq  (  ACE_Message_Block * mb ,  ACE_Time_Value *  timeout = 0);
+
+    virtual int register_reader(size_t slot, GadgetMessageReader* reader);
+    virtual int register_writer(size_t slot, GadgetMessageWriter* writer);
+
+    // Closes the socket, flushes the writer queue and waits for both tasks.
+    int close()
+    {
+        GADGET_DEBUG1("Into GadgetronCloudConnector:close() ... \n");
+        GADGET_DEBUG1("Closing socket \n");
+        peer().close();
+        GADGET_DEBUG1("Socket closed \n");
+        cloud_writer_task_.flush();
+        cloud_reader_task_.close(0);
+        cloud_writer_task_.close(0);
+        return this->wait();
+    }
+
+    // Waits for the reader task and then for the writer task.
+    // Returns the reader's result if it is non-zero, otherwise the writer's,
+    // so a failure of either task is visible to the caller.
+    virtual int wait()
+    {
+        GADGET_DEBUG1("Into GadgetronCloudConnector:wait() ... \n");
+
+        GADGET_DEBUG1("Waiting for cloud reader task:\n");
+        const int reader_retval = cloud_reader_task_.wait();
+        GADGET_DEBUG1("Reader task done\n");
+
+        ACE_TRACE(( ACE_TEXT("Waiting for cloud writer task:") ));
+        const int writer_retval = cloud_writer_task_.wait();
+        ACE_TRACE(( ACE_TEXT("Writer task done:") ));
+
+        return ( reader_retval != 0 ) ? reader_retval : writer_retval;
+    }
+
+    CloudWriterTask<JobType>& writer_task()
+    {
+        return cloud_writer_task_;
+    }
+
+    CloudReaderTask<JobType>& reader_task()
+    {
+        return cloud_reader_task_;
+    }
+
+    // Reads the connection flag under the connector mutex.
+    bool status()
+    {
+        bool ret_val;
+        mtx_.acquire();
+        ret_val = status_;
+        mtx_.release();
+        return ret_val;
+    }
+
+    // Writes the connection flag under the connector mutex.
+    void set_status(bool s)
+    {
+        mtx_.acquire();
+        status_ = s;
+        mtx_.release();
+    }
+
+    int send_gadgetron_configuration_file(std::string config_xml_name);
+    int send_gadgetron_configuration_script(std::string config_xml_name);
+    int send_gadgetron_parameters(std::string xml_string);
+
+    ACE_SOCK_Stream& peer()
+    {
+        return peer_;
+    }
+
+    // Passed to the controller's setJobsTobeCompleted() to identify this node.
+    unsigned int nodeID_;
+
+protected:
+
+    // guards status_ and setJobTobeCompletedAndNoticeController()
+    ACE_Thread_Mutex mtx_;
+    bool status_;
+
+    std::string hostname_;
+    std::string port_;
+
+    GadgetCloudController<JobType>* cloud_controller_;
+    CloudWriterTask<JobType> cloud_writer_task_;
+    CloudReaderTask<JobType> cloud_reader_task_;
+
+    ACE_SOCK_Stream peer_;
+};
+
+// Creates an unconnected connector. The initializers are listed in member
+// declaration order, which is the order in which they actually run. The tasks
+// only store the address of the socket member, which is constructed after them.
+template<typename JobType> 
+GadgetronCloudConnector<JobType>::GadgetronCloudConnector() : nodeID_(0), 
+                                                            mtx_("CLOUDCONNECTOR_MTX"), 
+                                                            status_(false), 
+                                                            cloud_controller_(NULL), 
+                                                            cloud_writer_task_(&this->peer()), 
+                                                            cloud_reader_task_(&this->peer())
+{
+    GADGET_DEBUG1("Into GadgetronCloudConnector:GadgetronCloudConnector() ... \n");
+}
+
+// Deactivates the message queues of both tasks and then waits for the task
+// threads before the members are destroyed.
+template<typename JobType> 
+GadgetronCloudConnector<JobType>::~GadgetronCloudConnector()
+{
+    GADGET_DEBUG1("Into GadgetronCloudConnector:~GadgetronCloudConnector() ... \n");
+
+    this->cloud_writer_task_.msg_queue()->deactivate();
+    this->cloud_reader_task_.msg_queue()->deactivate();
+
+    this->wait();
+}
+
+// Remembers the target host/port and connects the socket to it.
+// Returns 0 on success and -1 (after logging) when the connect fails.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::openImpl(std::string hostname, std::string port)
+{
+    this->hostname_ = hostname;
+    this->port_ = port;
+
+    ACE_SOCK_Connector sock_connector;
+    ACE_INET_Addr remote(this->port_.c_str(), this->hostname_.c_str());
+
+    if (sock_connector.connect(this->peer(), remote) == -1)
+    {
+        ACE_ERROR_RETURN(( LM_ERROR, ACE_TEXT("%p\n"), ACE_TEXT("connect")), -1);
+    }
+
+    // Log the address of the peer when it can be resolved to a string.
+    ACE_INET_Addr remote_addr;
+    ACE_TCHAR remote_name[MAXHOSTNAMELENGTH];
+    if (this->peer().get_remote_addr (remote_addr) == 0)
+    {
+        if (remote_addr.addr_to_string (remote_name, MAXHOSTNAMELENGTH) == 0)
+        {
+            ACE_DEBUG ((LM_DEBUG, ACE_TEXT ("(%P|%t) Connection from %s\n"), remote_name));
+        }
+    }
+
+    return 0;
+}
+
+// Connects to hostname:port and starts the writer and reader tasks.
+// This function always returns 0; whether the connector is usable is
+// reported through status(): true only if the connect succeeded and both
+// task threads could be started.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::open(std::string hostname, std::string port)
+{
+    this->cloud_writer_task_.set_cloud_connector(this);
+    this->cloud_reader_task_.set_cloud_connector(this);
+
+    if ( this->openImpl(hostname, port) != 0 )
+    {
+        this->set_status(false);
+        return 0;
+    }
+
+    // Set the flag before the threads start: the reader task clears it on
+    // failure and that must not be overwritten afterwards.
+    this->set_status(true);
+
+    if ( this->cloud_writer_task_.open() == -1 )
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) GadgetronCloudConnector, failed to start the cloud writer task\n")));
+        this->set_status(false);
+        return 0;
+    }
+
+    if ( this->cloud_reader_task_.open() == -1 )
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) GadgetronCloudConnector, failed to start the cloud reader task\n")));
+        this->set_status(false);
+    }
+
+    return 0;
+}
+
+// Inserts a received message into the queue of the cloud controller.
+// On success (returns 0) the message is owned by the controller queue.
+// On failure (returns -1: no controller set or the queue rejected the
+// message) the message is released here, so callers must not touch it again.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::process(size_t messageid, ACE_Message_Block* mb)
+{
+    if ( cloud_controller_ == NULL )
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) GadgetronCloudConnector, pointer of could controller is null ...\n")));
+        if ( mb ) mb->release();
+        return -1;
+    }
+
+    if ( cloud_controller_->putq(mb) == -1 )
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) GadgetronCloudConnector, failed to put message on the cloud controller queue ...\n")));
+        if ( mb ) mb->release();
+        return -1;
+    }
+
+    return 0;
+}
+
+// Stores the controller that receives messages and job notifications.
+template<typename JobType> 
+void GadgetronCloudConnector<JobType>::set_cloud_controller(GadgetCloudController<JobType>* controller)
+{
+    this->cloud_controller_ = controller;
+}
+
+// Queues a message on the writer task and returns that task's putq() result;
+// the message is written to the socket by the writer thread.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::putq(ACE_Message_Block* mb ,  ACE_Time_Value* timeout)
+{
+    const int queued = this->cloud_writer_task_.putq(mb, timeout);
+    return queued;
+}
+
+// Registers 'reader' for message id 'slot' on the reader task.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::register_reader(size_t slot, GadgetMessageReader* reader)
+{
+    const int result = this->cloud_reader_task_.register_reader(slot, reader);
+    return result;
+}
+
+// Registers 'writer' for message id 'slot' on the writer task.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::register_writer(size_t slot, GadgetMessageWriter* writer)
+{
+    const int result = this->cloud_writer_task_.register_writer(slot, writer);
+    return result;
+}
+
+// Marks a job of this node (all of its jobs when jobID == -1) as completed
+// at the controller and then queues a dummy message on the controller queue.
+// Runs under the connector mutex. Returns 0 on success and -1 when the mutex
+// cannot be acquired, no controller is set, or one of the two steps fails.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::setJobTobeCompletedAndNoticeController(int jobID)
+{
+    ACE_GUARD_RETURN(ACE_Thread_Mutex, guard, mtx_, -1);
+
+    ACE_DEBUG( (LM_INFO, ACE_TEXT("%P, %l, GadgetronCloudConnector, into setJobTobeCompletedAndNoticeController(...) ... \n")) );
+
+    // The controller is dereferenced below, so it has to be checked first.
+    if ( cloud_controller_ == NULL )
+    {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronCloudConnector, cloud controller is not set ... \n")) );
+        return -1;
+    }
+
+    if ( cloud_controller_->setJobsTobeCompleted(nodeID_, jobID) < 0 )
+    {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronCloudConnector, cloud_controller_->setJobsTobeCompleted(%d, %d) failed ... \n"), nodeID_, jobID) );
+        return -1;
+    }
+
+    // put a invalid jobID==-1 to the controller message queue to trick the check
+    GadgetContainerMessage<int>* jobIDMsg = new GadgetContainerMessage<int>();
+    *(jobIDMsg->getObjectPtr()) = -1;
+
+    if (process(GADGET_MESSAGE_CLOUD_JOB, jobIDMsg) < 0)
+    {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronCloudConnector, Failed to put jobIDMsg==-1 into the controller message queue\n")) );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Sends a GADGET_MESSAGE_CONFIG_FILE identifier followed by the name of the
+// configuration file. Returns 0 on success and -1 when the name does not fit
+// into the message (including its terminating zero) or a send fails.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::send_gadgetron_configuration_file(std::string config_xml_name)
+{
+    // Number of characters copied into the message, terminator included.
+    const size_t max_name_length = 1024;
+
+    // strncpy() would leave the field without a terminating zero for longer
+    // names, so refuse them instead of sending a truncated name.
+    if (config_xml_name.size() >= max_name_length)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Configuration file name is too long\n")));
+        return -1;
+    }
+
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_CONFIG_FILE;
+
+    GadgetMessageConfigurationFile ini;
+    ACE_OS_String::strncpy(ini.configuration_file, config_xml_name.c_str(), max_name_length);
+
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier))
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageIdentifier\n")));
+        return -1;
+    }
+
+    if (this->peer().send_n(&ini, sizeof(GadgetMessageConfigurationFile)) != sizeof(GadgetMessageConfigurationFile))
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageConfigurationFile\n")));
+        return -1;
+    }
+
+    return 0;
+}
+
+// Sends a configuration script in three parts: the
+// GADGET_MESSAGE_CONFIG_SCRIPT identifier, a GadgetMessageScript header with
+// the length (terminating zero included) and the script text itself.
+// Returns 0 on success, -1 as soon as one part cannot be sent completely.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::send_gadgetron_configuration_script(std::string config_xml)
+{
+    GadgetMessageIdentifier msg_id;
+    msg_id.id = GADGET_MESSAGE_CONFIG_SCRIPT;
+
+    GadgetMessageScript header;
+    header.script_length = config_xml.size()+1;
+
+    const bool id_sent =
+        (this->peer().send_n(&msg_id, sizeof(GadgetMessageIdentifier)) == sizeof(GadgetMessageIdentifier));
+    if (!id_sent)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageIdentifier\n")));
+        return -1;
+    }
+
+    const bool header_sent =
+        (this->peer().send_n(&header, sizeof(GadgetMessageScript)) == sizeof(GadgetMessageScript));
+    if (!header_sent)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageScript\n")));
+        return -1;
+    }
+
+    const bool script_sent =
+        (this->peer().send_n(config_xml.c_str(), header.script_length) == header.script_length);
+    if (!script_sent)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send parameter xml\n")));
+        return -1;
+    }
+
+    return 0;
+}
+
+// Sends the parameter xml in three parts: the
+// GADGET_MESSAGE_PARAMETER_SCRIPT identifier, a GadgetMessageScript header
+// with the length (terminating zero included) and the xml text itself.
+// Returns 0 on success, -1 as soon as one part cannot be sent completely.
+template<typename JobType> 
+int GadgetronCloudConnector<JobType>::send_gadgetron_parameters(std::string xml_string)
+{
+    GadgetMessageIdentifier msg_id;
+    msg_id.id = GADGET_MESSAGE_PARAMETER_SCRIPT;
+
+    GadgetMessageScript header;
+    header.script_length = xml_string.size()+1;
+
+    const bool id_sent =
+        (this->peer().send_n(&msg_id, sizeof(GadgetMessageIdentifier)) == sizeof(GadgetMessageIdentifier));
+    if (!id_sent)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageIdentifier\n")));
+        return -1;
+    }
+
+    const bool header_sent =
+        (this->peer().send_n(&header, sizeof(GadgetMessageScript)) == sizeof(GadgetMessageScript));
+    if (!header_sent)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageScript\n")));
+        return -1;
+    }
+
+    const bool xml_sent =
+        (this->peer().send_n(xml_string.c_str(), header.script_length) == header.script_length);
+    if (!xml_sent)
+    {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send parameter xml\n")));
+        return -1;
+    }
+
+    return 0;
+}
+
+}
diff --git a/toolboxes/gadgettools/GadgetronConnector.cpp b/toolboxes/gadgettools/GadgetronConnector.cpp
new file mode 100644
index 0000000..1c7ecd8
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronConnector.cpp
@@ -0,0 +1,306 @@
+/*
+ * GadgetronConnector.cpp
+ *
+ *  Created on: Nov 1, 2011
+ *      Author: hansenms
+ */
+
+#include <ace/SOCK_Connector.h>
+#include "GadgetronConnector.h"
+
+#include <iostream>
+
+using namespace Gadgetron;
+// Creates the connector; the writer task is handed the address of this
+// connector's socket.
+GadgetronConnector::GadgetronConnector()
+    : writer_task_(&this->peer())
+{
+}
+
+// Clears the container of registered message readers.
+GadgetronConnector::~GadgetronConnector()
+{
+    this->readers_.clear();
+}
+
+// Remembers the target host/port and connects the socket to it.
+// Returns 0 on success and -1 (after logging) when the connect fails.
+// No handler is registered and no thread is started here.
+int GadgetronConnector::openImpl(std::string hostname, std::string port)
+{
+    this->hostname_ = hostname;
+    this->port_ = port;
+
+    ACE_SOCK_Connector sock_connector;
+    ACE_INET_Addr remote(this->port_.c_str(), this->hostname_.c_str());
+
+    if (sock_connector.connect(this->peer(), remote) == -1) {
+        ACE_ERROR_RETURN(( LM_ERROR, ACE_TEXT("%p\n"), ACE_TEXT("connect")), -1);
+    }
+
+    // Log the address of the peer when it can be resolved to a string.
+    ACE_INET_Addr remote_addr;
+    ACE_TCHAR remote_name[MAXHOSTNAMELENGTH];
+    if (this->peer().get_remote_addr (remote_addr) == 0) {
+        if (remote_addr.addr_to_string (remote_name, MAXHOSTNAMELENGTH) == 0) {
+            ACE_DEBUG ((LM_DEBUG, ACE_TEXT ("(%P|%t) Connection from %s\n"), remote_name));
+        }
+    }
+
+    return 0;
+}
+
+// Connects to hostname:port, starts the writer task, registers this object
+// for read events and starts the thread that runs the reactor loop.
+// Returns -1 when the connection or the writer task cannot be started,
+// -2 when the read handler cannot be registered, otherwise the result of
+// activate().
+int GadgetronConnector::open(std::string hostname, std::string port)
+{
+    //Make sure we have a reactor, otherwise assign one from the singleton instance
+    if (!this->reactor()) {
+        ACE_DEBUG((LM_INFO, ACE_TEXT("Setting reactor")));
+        this->reactor(ACE_Reactor::instance());
+    }
+
+    //Without a connection there is nothing to register or to run
+    //(openImpl has already logged the reason)
+    if (this->openImpl(hostname, port) != 0) {
+        return -1;
+    }
+
+    if (this->writer_task_.open() == -1) {
+        ACE_ERROR_RETURN(( LM_ERROR, ACE_TEXT("%p\n"), ACE_TEXT("Starting writer task")), -1);
+    }
+
+    if (this->reactor ()->register_handler(this, ACE_Event_Handler::READ_MASK) != 0) {
+        ACE_ERROR_RETURN(( LM_ERROR, ACE_TEXT("%p\n"), ACE_TEXT("Registering read handler")), -2);
+    }
+
+    return this->activate( THR_NEW_LWP | THR_JOINABLE, 1); //Run single threaded. TODO: Add multithreaded support
+}
+
+// Reads one message from the socket: the identifier first, then the payload
+// via the reader registered for that identifier, and passes it to process().
+// Returns the result of close() for a close message, 0 when the message was
+// processed and -1 on any failure.
+int GadgetronConnector::handle_input(ACE_HANDLE fd)
+{
+    GadgetMessageIdentifier header;
+
+    const ssize_t received = peer().recv_n(&header, sizeof(GadgetMessageIdentifier));
+    if (received <= 0) {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronConnector, failed to read message identifier\n")) );
+        return -1;
+    }
+
+    //Is this a shutdown message?
+    if (header.id == GADGET_MESSAGE_CLOSE) {
+        ACE_DEBUG( (LM_INFO, ACE_TEXT("%P, %l, GadgetronConnector, Close Message received\n")) );
+        return close();
+    }
+
+    GadgetMessageReader* reader = readers_.find(header.id);
+    if (!reader) {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronConnector, Unknown message id %d received\n"), header.id) );
+        return -1;
+    }
+
+    ACE_Message_Block* payload = reader->read(&peer());
+    if (!payload) {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronConnector, Failed to read message\n")) );
+        return -1;
+    }
+
+    if (process(header.id, payload) < 0) {
+        ACE_DEBUG( (LM_ERROR, ACE_TEXT("%P, %l, GadgetronConnector, Failed to process message\n")) );
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+int GadgetronConnector::handle_output(ACE_HANDLE fd)
+{
+    ACE_Message_Block *mb = 0;
+    ACE_Time_Value nowait (ACE_OS::gettimeofday ());
+
+    static int counter = 0;
+    ACE_DEBUG( (LM_INFO, ACE_TEXT("%P, %l, GadgetronConnector, Handle output called, %d\n"), counter++) );
+
+    //Send a package if we have one
+    while (-1 != this->getq (mb, &nowait)) {
+        GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+                AsContainerMessage<GadgetMessageIdentifier>(mb);
+
+
+        if (!mid) {
+            ACE_DEBUG ((LM_ERROR, ACE_TEXT ("Invalid message on output queue\n")));
+            mb->release();
+            return -1;
+        }
+
+        //Is this a shutdown message?
+        if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE) {
+            peer().send_n(mid->getObjectPtr(),sizeof(GadgetMessageIdentifier));
+            return 0;
+        }
+
+
+        GadgetMessageWriter* w = writers_.find(mid->getObjectPtr()->id);
+
+        if (!w) {
+            ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unrecognized Message ID received: %d\n"),mid->getObjectPtr()->id));
+            return -1;
+        }
+
+        if (w->write(&peer(),mb->cont()) < 0) {
+            if (errno == EWOULDBLOCK)
+             {
+            ACE_DEBUG ( (LM_DEBUG, ACE_TEXT ("(%P|%t) Failed to write message to Gadgetron (WOULDBLOCK)\n")) );
+            mb->release ();
+            return 0;
+
+            }
+
+            ACE_DEBUG ( (LM_DEBUG, ACE_TEXT ("(%P|%t) Failed to write message to Gadgetron\n")) );
+            mb->release ();
+            return -1;
+        }
+
+        mb->release();
+    }
+
+    if (this->msg_queue ()->is_empty ()) {
+        ACE_DEBUG( (LM_INFO, ACE_TEXT("%P, %l, GadgetronConnector, Q empty, %d\n"), counter++) );
+        //No point in coming back to handle_ouput until something is put on the queue,
+        //in which case, the msg queue's notification strategy will tell us
+
+        //Stop the WRITE trigger from the socket
+        this->reactor ()->cancel_wakeup(this, ACE_Event_Handler::WRITE_MASK);
+
+        //Get a trigger when stuff is on the queue instead
+        this->msg_queue ()->notification_strategy (&this->notifier_);
+    } else {
+        ACE_DEBUG( (LM_INFO, ACE_TEXT("%P, %l, GadgetronConnector, Q has stuff, %d\n"), counter++) );
+        //There is still more on the queue, let's come back when idle
+
+        //Make sure that we get a wake up when it is possible to write
+        //this->reactor ()->schedule_wakeup(this, ACE_Event_Handler::WRITE_MASK);
+
+        //Don't wake up from the queue, it may not be possible to write.
+        this->msg_queue ()->notification_strategy (0);
+
+        //this->reactor ()->cancel_wakeup(this->notifier_.event_handler(), ACE_Event_Handler::WRITE_MASK);
+    }
+
+    return 0;
+}
+*/
+
+// Logs the close and ends the reactor event loop. Always returns 0 and does
+// not wait for the task threads.
+int GadgetronConnector::handle_close(ACE_HANDLE handle, ACE_Reactor_Mask close_mask)
+{
+    ACE_DEBUG ((LM_INFO, ACE_TEXT ("(%P|%t) Handling close...\n")));
+
+    ACE_Reactor* const active_reactor = this->reactor();
+    active_reactor->end_reactor_event_loop();
+
+    return 0;
+}
+
+
+// Thread body: makes the calling thread the owner of the reactor, resets the
+// event loop and runs it until it ends (handle_close() ends it).
+// Always returns 0.
+int GadgetronConnector::svc(void)
+{
+    //Take ownership of Reactor
+    this->reactor()->owner(ACE_Thread::self ());
+
+    this->reactor()->reset_event_loop();
+
+    //Handle the events
+    this->reactor()->run_reactor_event_loop();
+
+    ACE_DEBUG ((LM_INFO, ACE_TEXT ("(%P|%t) GadgetronConnector svc done...\n")));
+
+    return 0;
+}
+
+// Registers 'reader' for message id 'slot'; returns the container's insert() result.
+int GadgetronConnector::register_reader(size_t slot, GadgetMessageReader *reader)
+{
+    const int result = this->readers_.insert(slot, reader);
+    return result;
+}
+
+/*
+int GadgetronConnector::register_writer(size_t slot, GadgetMessageWriter *writer)
+{
+    return writers_.insert(slot,writer);
+}
+*/
+
+
+// Sends a GADGET_MESSAGE_CONFIG_FILE identifier followed by the name of the
+// configuration file. Returns 0 on success and -1 when the name does not fit
+// into the message (including its terminating zero) or a send fails.
+int GadgetronConnector::send_gadgetron_configuration_file(std::string config_xml_name)
+{
+    // Number of characters copied into the message, terminator included.
+    const size_t max_name_length = 1024;
+
+    // strncpy() would leave the field without a terminating zero for longer
+    // names, so refuse them instead of sending a truncated name.
+    if (config_xml_name.size() >= max_name_length) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Configuration file name is too long\n")));
+        return -1;
+    }
+
+    GadgetMessageIdentifier id;
+    id.id = GADGET_MESSAGE_CONFIG_FILE;
+
+    GadgetMessageConfigurationFile ini;
+    ACE_OS_String::strncpy(ini.configuration_file, config_xml_name.c_str(), max_name_length);
+
+    if (this->peer().send_n(&id, sizeof(GadgetMessageIdentifier)) != sizeof(GadgetMessageIdentifier)) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageIdentifier\n")));
+        return -1;
+    }
+
+    if (this->peer().send_n(&ini, sizeof(GadgetMessageConfigurationFile)) != sizeof(GadgetMessageConfigurationFile)) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageConfigurationFile\n")));
+        return -1;
+    }
+
+    return 0;
+}
+
+// Sends a configuration script in three parts: the
+// GADGET_MESSAGE_CONFIG_SCRIPT identifier, a GadgetMessageScript header with
+// the length (terminating zero included) and the script text itself.
+// Returns 0 on success, -1 as soon as one part cannot be sent completely.
+int GadgetronConnector::send_gadgetron_configuration_script(std::string config_xml)
+{
+    GadgetMessageIdentifier msg_id;
+    msg_id.id = GADGET_MESSAGE_CONFIG_SCRIPT;
+
+    GadgetMessageScript header;
+    header.script_length = config_xml.size()+1;
+
+    const bool id_sent =
+        (this->peer().send_n(&msg_id, sizeof(GadgetMessageIdentifier)) == sizeof(GadgetMessageIdentifier));
+    if (!id_sent) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageIdentifier\n")));
+        return -1;
+    }
+
+    const bool header_sent =
+        (this->peer().send_n(&header, sizeof(GadgetMessageScript)) == sizeof(GadgetMessageScript));
+    if (!header_sent) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageScript\n")));
+        return -1;
+    }
+
+    const bool script_sent =
+        (this->peer().send_n(config_xml.c_str(), header.script_length) == header.script_length);
+    if (!script_sent) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send parameter xml\n")));
+        return -1;
+    }
+
+    return 0;
+}
+
+// Sends the parameter xml in three parts: the
+// GADGET_MESSAGE_PARAMETER_SCRIPT identifier, a GadgetMessageScript header
+// with the length (terminating zero included) and the xml text itself.
+// Returns 0 on success, -1 as soon as one part cannot be sent completely.
+int GadgetronConnector::send_gadgetron_parameters(std::string xml_string)
+{
+    GadgetMessageIdentifier msg_id;
+    msg_id.id = GADGET_MESSAGE_PARAMETER_SCRIPT;
+
+    GadgetMessageScript header;
+    header.script_length = xml_string.size()+1;
+
+    const bool id_sent =
+        (this->peer().send_n(&msg_id, sizeof(GadgetMessageIdentifier)) == sizeof(GadgetMessageIdentifier));
+    if (!id_sent) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageIdentifier\n")));
+        return -1;
+    }
+
+    const bool header_sent =
+        (this->peer().send_n(&header, sizeof(GadgetMessageScript)) == sizeof(GadgetMessageScript));
+    if (!header_sent) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send GadgetMessageScript\n")));
+        return -1;
+    }
+
+    const bool xml_sent =
+        (this->peer().send_n(xml_string.c_str(), header.script_length) == header.script_length);
+    if (!xml_sent) {
+        ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unable to send parameter xml\n")));
+        return -1;
+    }
+
+    return 0;
+}
+
diff --git a/toolboxes/gadgettools/GadgetronConnector.h b/toolboxes/gadgettools/GadgetronConnector.h
new file mode 100644
index 0000000..9b0b291
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronConnector.h
@@ -0,0 +1,172 @@
+/*
+ * GadgetronConnector.h
+ *
+ *  Created on: Nov 1, 2011
+ *      Author: Michael S. Hansen
+ */
+
+#ifndef GADGETRONCONNECTOR_H_
+#define GADGETRONCONNECTOR_H_
+
+#include <ace/Svc_Handler.h>
+#include <ace/Reactor.h>
+#include <ace/SOCK_Stream.h>
+#include <ace/Reactor_Notification_Strategy.h>
+#include <string>
+#include "GadgetronSlotContainer.h"
+#include "GadgetMessageInterface.h"
+#include "gadgettools_export.h"
+
+#define MAXHOSTNAMELENGTH 1024
+
+namespace Gadgetron{
+// Active object that serializes queued Gadgetron messages onto a socket from a
+// single worker thread; one GadgetMessageWriter is registered per message id.
+class WriterTask : public ACE_Task<ACE_MT_SYNCH>
+{
+public:
+	typedef ACE_Task<ACE_MT_SYNCH> inherited;
+
+	// The socket is not owned by the task (it is never deleted here).
+	WriterTask(ACE_SOCK_Stream* socket)
+	: inherited()
+	, socket_(socket)
+	{
+	}
+
+	virtual ~WriterTask()
+	{
+	  writers_.clear();
+	}
+
+	virtual int init(void)
+	{
+	  ACE_TRACE(( ACE_TEXT("WriterTask::init") ));
+	  return 0;
+	}
+
+	// Sets the queue high-water mark to 24 GiB and starts the single worker thread.
+	virtual int open(void* = 0)
+	{
+	  ACE_TRACE(( ACE_TEXT("WriterTask::open") ));
+	  this->msg_queue()->high_water_mark(24.0*1024*1024*1024);
+	  return this->activate( THR_NEW_LWP | THR_JOINABLE, 1 );
+	}
+
+	// Registers writer for message id slot; returns -1 if an item is already registered there.
+	int register_writer(size_t slot, GadgetMessageWriter* writer) {
+		return writers_.insert(slot,writer);
+	}
+
+	// With flags == 1: queues a hangup block and waits for the worker to exit.
+	virtual int close(unsigned long flags)
+	{
+		int rval = 0;
+		if (flags == 1) {
+			ACE_Message_Block *hangup = new ACE_Message_Block();
+			hangup->msg_type( ACE_Message_Block::MB_HANGUP );
+			if (this->putq(hangup) == -1) {
+				hangup->release();
+				ACE_ERROR_RETURN( (LM_ERROR,
+						ACE_TEXT("%p\n"),
+						ACE_TEXT("WriterTask::close, putq")),
+						-1);
+			}
+			rval = this->wait();
+		}
+		return rval;
+	}
+
+	// Worker loop. Returns 0 on a hangup or GADGET_MESSAGE_CLOSE message and -1
+	// on an invalid message, an unregistered message id or a failed write.
+	virtual int svc(void)
+	{
+		ACE_Message_Block *mb = 0;
+		//Send a package if we have one
+		while (this->getq (mb) != -1) {
+			//close(1) queues a bare MB_HANGUP block: stop without reporting an error
+			if (mb->msg_type() == ACE_Message_Block::MB_HANGUP) {
+				mb->release();
+				return 0;
+			}
+
+			GadgetContainerMessage<GadgetMessageIdentifier>* mid =
+					AsContainerMessage<GadgetMessageIdentifier>(mb);
+			if (!mid) {
+				ACE_DEBUG ((LM_ERROR, ACE_TEXT ("Invalid message on output queue\n")));
+				mb->release();
+				return -1;
+			}
+
+			//Is this a shutdown message?
+			if (mid->getObjectPtr()->id == GADGET_MESSAGE_CLOSE) {
+				socket_->send_n(mid->getObjectPtr(),sizeof(GadgetMessageIdentifier));
+				mb->release(); //the block used to be leaked on this path
+				return 0;
+			}
+			//The writer receives the blocks chained behind the id block
+			GadgetMessageWriter* w = writers_.find(mid->getObjectPtr()->id);
+			if (!w) {
+				ACE_DEBUG ((LM_ERROR, ACE_TEXT ("(%P|%t) Unrecognized Message ID received: %d\n"),mid->getObjectPtr()->id));
+				mb->release();
+				return -1;
+			}
+			if (w->write(socket_,mb->cont()) < 0) {
+				ACE_DEBUG ( (LM_DEBUG, ACE_TEXT ("(%P|%t) Failed to write message to Gadgetron\n")) );
+				mb->release ();
+				return -1;
+			}
+			mb->release();
+		}
+		return 0;
+	}
+
+protected:
+	ACE_SOCK_Stream* socket_;
+	GadgetronSlotContainer<GadgetMessageWriter> writers_;
+};
+
+class EXPORTGADGETTOOLS GadgetronConnector: public ACE_Svc_Handler<ACE_SOCK_STREAM, ACE_MT_SYNCH> {
+// Svc handler for a connection to a Gadgetron host; outgoing messages are handed to writer_task_.
+public:
+	GadgetronConnector();
+	virtual ~GadgetronConnector();
+	// NOTE(review): openImpl/open/handle_*/svc are defined outside this header; check the .cpp for their contracts.
+    int openImpl (std::string hostname, std::string port);
+	int open (std::string hostname, std::string port);
+	virtual int handle_input (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+	//virtual int handle_output (ACE_HANDLE fd = ACE_INVALID_HANDLE);
+	virtual int handle_close (ACE_HANDLE handle, ACE_Reactor_Mask close_mask);
+	virtual int svc(void);
+	// Forwards the block to the writer task's queue rather than this handler's own queue.
+	virtual int putq  (  ACE_Message_Block * mb ,  ACE_Time_Value *  timeout = 0) {
+		return writer_task_.putq(mb,timeout);
+	}
+	// Default implementation just releases mb and returns 0; NOTE(review): presumably overridden to consume received messages.
+	virtual int process(size_t messageid, ACE_Message_Block* mb) {
+		mb->release();
+		return 0;
+	}
+	// Reader registration is implemented out of line; writers are stored in writer_task_.
+	virtual int register_reader(size_t slot, GadgetMessageReader* reader);
+	virtual int register_writer(size_t slot, GadgetMessageWriter* writer) {
+		return writer_task_.register_writer(slot,writer);
+	}
+	// NOTE(review): the script overload's argument is the XML text itself (the .cpp names it config_xml), not a file name.
+	int send_gadgetron_configuration_file(std::string config_xml_name);
+	int send_gadgetron_configuration_script(std::string config_xml_name);
+	int send_gadgetron_parameters(std::string xml_string);
+
+protected:
+	//ACE_Reactor_Notification_Strategy notifier_;
+	std::string hostname_;
+	std::string port_;
+
+	GadgetronSlotContainer<GadgetMessageReader> readers_;
+	WriterTask writer_task_;
+	//GadgetronSlotContainer<GadgetMessageWriter> writers_;
+
+};
+
+}
+#endif /* GADGETRONCONNECTOR_H_ */
diff --git a/toolboxes/gadgettools/GadgetronSlotContainer.h b/toolboxes/gadgettools/GadgetronSlotContainer.h
new file mode 100644
index 0000000..a68c596
--- /dev/null
+++ b/toolboxes/gadgettools/GadgetronSlotContainer.h
@@ -0,0 +1,60 @@
+/*
+ * GadgetronSlotContainer.h
+ *
+ *  Created on: Nov 1, 2011
+ *      Author: hansenms
+ */
+
+#ifndef GADGETRONSLOTCONTAINER_H_
+#define GADGETRONSLOTCONTAINER_H_
+
+#include <algorithm>
+#include <vector>
+
+// Owning registry that maps slot ids to heap-allocated T objects.
+// NOTE(review): copying is not disabled; a copy would delete the same items twice.
+template <typename T> class GadgetronSlotContainer {
+public:
+	GadgetronSlotContainer() {}
+	virtual ~GadgetronSlotContainer()
+	{
+		clear();
+	}
+
+	// Returns the item registered under slot, or 0 when there is none.
+	T* find(unsigned int slot) {
+		for (unsigned int i = 0; i < slots_.size(); i++) {
+			if (slots_[i] == slot) return items_[i];
+		}
+		return 0;
+	}
+
+	// Stores item under slot and takes ownership of it; returns -1 and leaves item
+	// untouched when an item is already registered under slot. slot is unsigned int
+	// as in find(): the former unsigned short silently truncated ids above 65535.
+	int insert ( unsigned int slot, T* item) {
+		if (this->find(slot)) {
+			return -1;
+		}
+		slots_.push_back(slot);
+		items_.push_back(item);
+		return 0;
+	}
+
+	// Deletes every stored item and empties the container; always returns 0.
+	int clear()
+	{
+		for (unsigned int i = 0; i < items_.size(); i++) {
+			delete items_[i]; // deleting a null pointer is a no-op
+		}
+		slots_.clear();
+		items_.clear();
+		return 0;
+	}
+
+protected:
+	std::vector<unsigned int> slots_;
+	std::vector<T*> items_;
+};
+
+#endif /* GADGETRONSLOTCONTAINER_H_ */
diff --git a/toolboxes/gadgettools/demo.xml b/toolboxes/gadgettools/demo.xml
new file mode 100644
index 0000000..0043c9c
--- /dev/null
+++ b/toolboxes/gadgettools/demo.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+<gadgetron>
+  <blah>
+
+  </blah>
+
+  <encoding>
+    <kspace>
+      <matrix_size>
+	<comment>Acquired matrix size</comment>
+	<value>256.9078</value>
+	<value>128</value>
+      </matrix_size>
+    </kspace>
+
+    <image>
+      <matrix_size>
+	<comment>This is the reconstructed matrix size</comment>
+	<value>128</value>
+	<value>128</value>
+      </matrix_size>
+    </image>
+
+  </encoding>
+
+  <hardware>
+    <gradients>
+      <strength>
+	<comment>This is the maximum gradient strength</comment>
+	<type>double</type>
+	<units>mT/m</units>
+	<value>40.0</value>
+      </strength>
+      
+      <slew_rate>
+	<units>T/m/s</units>
+	<value>100.0</value>
+      </slew_rate>
+    </gradients>
+  </hardware>
+
+
+</gadgetron>
\ No newline at end of file
diff --git a/toolboxes/gadgettools/gadgettools_export.h b/toolboxes/gadgettools/gadgettools_export.h
new file mode 100644
index 0000000..7142bd7
--- /dev/null
+++ b/toolboxes/gadgettools/gadgettools_export.h
@@ -0,0 +1,20 @@
+/** \file gadgettools_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GADGETTOOLS_EXPORT_H_
+#define GADGETTOOLS_EXPORT_H_
+
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GADGETTOOLS__) || defined (gadgettools_EXPORTS)
+#define EXPORTGADGETTOOLS __declspec(dllexport)
+#else
+#define EXPORTGADGETTOOLS __declspec(dllimport)
+#endif
+#else
+#define EXPORTGADGETTOOLS
+#endif
+
+
+#endif /* GADGETTOOLS_EXPORT_H_ */
diff --git a/toolboxes/gadgettools/schema/gadgetron.xsd b/toolboxes/gadgettools/schema/gadgetron.xsd
new file mode 100644
index 0000000..cfe50ca
--- /dev/null
+++ b/toolboxes/gadgettools/schema/gadgetron.xsd
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<xs:schema xmlns="http://gadgetron.sf.net/gadgetron" xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" targetNamespace="http://gadgetron.sf.net/gadgetron">
+
+  <xs:element name="gadgetronConfiguration">
+    <xs:complexType>
+      <xs:sequence>
+                <xs:element name="port" type="xs:string"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+
+  <xs:element name="gadgetronStreamConfiguration">
+    <xs:complexType>
+      <xs:sequence>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="reader">
+                	<xs:complexType>
+					      <xs:sequence>
+					      	<xs:element name="slot" type="xs:unsignedShort"/>
+					      	<xs:element name="dll" type="xs:string"/>
+					      	<xs:element name="classname" type="xs:string"/>
+					      </xs:sequence>
+          			</xs:complexType>
+        		</xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="writer">
+                	<xs:complexType>
+					      <xs:sequence>
+					      	<xs:element maxOccurs="1" minOccurs="1" name="slot" type="xs:unsignedShort"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+					      </xs:sequence>
+          			</xs:complexType>
+        		</xs:element>
+                <xs:element maxOccurs="unbounded" minOccurs="0" name="gadget">
+                	<xs:complexType>
+					      <xs:sequence>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="name" type="xs:string"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="dll" type="xs:string"/>
+					      	<xs:element maxOccurs="1" minOccurs="1"  name="classname" type="xs:string"/>
+					      	<xs:element maxOccurs="unbounded" minOccurs="0" name="property">
+					      		<xs:complexType>
+					      			<xs:sequence>
+								      	<xs:element maxOccurs="1" minOccurs="1" name="name" type="xs:string"/>
+								      	<xs:element maxOccurs="1" minOccurs="1" name="value" type="xs:string"/>
+					      			</xs:sequence>		
+					      		</xs:complexType>
+              				</xs:element>
+           				  </xs:sequence>
+          </xs:complexType>
+        </xs:element>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+</xs:schema>
diff --git a/toolboxes/gadgettools/test_gadget_xml.cpp b/toolboxes/gadgettools/test_gadget_xml.cpp
new file mode 100644
index 0000000..d52f7f1
--- /dev/null
+++ b/toolboxes/gadgettools/test_gadget_xml.cpp
@@ -0,0 +1,32 @@
+#include "GadgetXml.h"
+
+#include <iostream>
+
+// Smoke test for GadgetXMLNode: reads values from demo.xml, adds nodes, prints the document.
+int main(int argc, char** argv)
+{
+  std::cout << "GadgetXML Test Program" << std::endl;
+
+  TiXmlDocument doc( "demo.xml" );
+  // The load result used to be ignored, so a missing or malformed demo.xml went unnoticed.
+  if (!doc.LoadFile()) {
+    std::cerr << "Unable to load demo.xml: " << doc.ErrorDesc() << std::endl;
+    return 1;
+  }
+
+  GadgetXMLNode n(&doc);
+  std::vector<long> vals = n.get<long>(std::string("gadgetron.encoding.kspace.matrix_size.value"));
+
+  std::cout << "Number of values: " << vals.size() << std::endl;
+  for (unsigned int i = 0; i < vals.size(); i++) {
+    std::cout << "   :" << vals[i] << std::endl;
+  }
+
+  //Let's add something to the document
+  n.add(std::string("gadgetron.encoding.mysection.value"), 6.789);
+  n.add(std::string("gadgetron.encoding.mysection.value"), 612);
+  n.add(std::string("gadgetron.encoding.mysection.value"), 512);
+  n.add(std::string("gadgetron.encoding.mysection.value"), vals);
+  n.get_document()->Print();
+  return 0;
+}
diff --git a/toolboxes/gtplus/CMakeLists.txt b/toolboxes/gtplus/CMakeLists.txt
new file mode 100644
index 0000000..b2dcea3
--- /dev/null
+++ b/toolboxes/gtplus/CMakeLists.txt
@@ -0,0 +1,224 @@
+if ( HAS_64_BIT )
+
+    if (WIN32)
+        ADD_DEFINITIONS(-D__BUILD_GADGETRON_PLUS__)
+    endif (WIN32)
+
+    if(WIN32)
+        link_directories(${Boost_LIBRARY_DIRS})
+    endif(WIN32)
+
+    if (MKL_FOUND)
+        MESSAGE("MKL Found for gtPlus ... ")
+        list(APPEND EXTRA_MKL_LIBRARIES mkl_core)
+        if ( USE_OPENMP )
+            list(APPEND EXTRA_MKL_LIBRARIES mkl_intel_thread)
+        endif ( USE_OPENMP )
+
+        INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+        LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+    endif (MKL_FOUND)
+
+    include_directories(
+        ${ACE_INCLUDE_DIR} 
+        ${Boost_INCLUDE_DIR}
+        ${ISMRMRD_INCLUDE_DIR}
+        ${ISMRMRD_XSD_INCLUDE_DIR}
+        ${XSD_INCLUDE_DIR}
+        ${FFTW3_INCLUDE_DIR}
+        ${ARMADILLO_INCLUDE_DIRS}
+        ${MKL_INCLUDE_DIR}
+        ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators
+        ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+        ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+        ${CMAKE_SOURCE_DIR}/gadgets/core
+        ${HDF5_INCLUDE_DIR}
+        ${HDF5_INCLUDE_DIR}/cpp
+        ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+        ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/util
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/workflow
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/algorithm
+        ${CMAKE_SOURCE_DIR}/toolboxes/gtplus/solver
+        ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools
+        ${CMAKE_SOURCE_DIR}/apps/gadgetron
+        ${CMAKE_SOURCE_DIR}/apps/matlab
+        ${CMAKE_SOURCE_DIR}/gadgets/mri_core 
+        ${CMAKE_SOURCE_DIR}/gadgets/gtPlus 
+    )
+
+    set( util_files util/gtPlusIOBase.h
+        util/gtPlusIOBase.cpp
+        util/gtPlusIOAnalyze.h
+        util/gtPlusIOAnalyze.cpp
+        util/gtPlusMemoryManager.h
+        util/gtPlusMemoryManager.cpp 
+        util/hoNDArrayMemoryManaged.h 
+        util/hoNDArrayMemoryManaged.hxx )
+
+    set( workflow_files workflow/gtPlusISMRMRDReconWorkFlow.h
+        workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
+        workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
+        workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
+        workflow/gtPlusISMRMRDReconUtil.h
+        workflow/gtPlusISMRMRDReconUtil.cpp
+        workflow/gtPlusISMRMRDReconUtil.hxx
+        workflow/gtPlusISMRMRDReconWorkOrder.h
+        workflow/gtPlusISMRMRDReconWorkOrder2DT.h
+        workflow/gtPlusISMRMRDReconWorkOrder3DT.h
+        workflow/gtPlusISMRMRDReconWorker.h
+        workflow/gtPlusISMRMRDReconWorker2DT.h
+        workflow/gtPlusISMRMRDReconWorker3DT.h
+        workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
+        workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h
+        workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
+        workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
+        workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
+        workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h
+        workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
+        workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
+        workflow/gtPlusCloudScheduler.h
+        workflow/gtPlusCloudScheduler.cpp )
+
+    set( algorithm_files algorithm/gtPlusAlgorithmBase.h 
+                        algorithm/gtPlusGRAPPA.h 
+                        algorithm/gtPlusSPIRIT.h
+                        algorithm/gtPlusOperator.h 
+                        algorithm/gtPlusSPIRITOperator.h 
+                        algorithm/gtPlusSPIRIT2DOperator.h 
+                        algorithm/gtPlusSPIRIT3DOperator.h 
+                        algorithm/gtPlusSPIRIT2DTOperator.h 
+                        algorithm/gtPlusSPIRITNoNullSpaceOperator.h 
+                        algorithm/gtPlusSPIRITNoNullSpace2DOperator.h 
+                        algorithm/gtPlusSPIRITNoNullSpace3DOperator.h 
+                        algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h 
+                        algorithm/gtPlusWaveletOperator.h 
+                        algorithm/gtPlusWavelet2DOperator.h 
+                        algorithm/gtPlusWavelet3DOperator.h 
+                        algorithm/gtPlusWaveletNoNullSpace2DOperator.h 
+                        algorithm/gtPlusWaveletNoNullSpace3DOperator.h 
+                        algorithm/gtPlusDataFidelityOperator.h )
+
+    set( solver_files solver/gtPlusSolver.h 
+                        solver/gtPlusLinearSolver.h 
+                        solver/gtPlusNonLinearSolver.h
+                        solver/gtPlusLSQRSolver.h 
+                        solver/gtPlusNCGSolver.h )
+
+    set( gtCloud_files config/gtCloud/myCloud_2DT.txt 
+                    config/gtCloud/myCloud_3DT.txt 
+                    config/gtCloud/myCloud_2DT_DualLayer.txt
+                    config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt )
+
+    set( config_files config/GadgetronProgram_gtPlus_2DT_Cartesian.xml
+                    config/GadgetronProgram_gtPlus_2DT_Cartesian_L1SPIRIT.xml
+                    config/GadgetronProgram_gtPlus_2DT_Cartesian_SPIRIT.xml
+                    config/GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml
+                    config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml
+                    config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
+                    config/GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml
+                    config/GadgetronProgram_gtPlus_2DT_FatWater.xml
+                    config/GadgetronProgram_gtPlus_2DT_LGE.xml
+                    config/GadgetronProgram_gtPlus_2DT_MOLLI.xml
+                    config/GadgetronProgram_gtPlus_2DT_Perfusion.xml
+                    config/GadgetronProgram_gtPlus_2DT_RealTimeCine.xml
+                    config/GadgetronProgram_gtPlus_2DT_RealTimeFlow.xml
+                    config/GadgetronProgram_gtPlus_2DT_T2W.xml
+                    config/GadgetronProgram_gtPlus_3DT_Cartesian.xml
+                    config/GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml
+                    config/GadgetronProgram_gtPlus_3DT_Cartesian_L1SPIRIT.xml
+                    config/GadgetronProgram_gtPlus_3DT_Cartesian_SPIRIT.xml )
+
+    set( schema_files )
+
+    # matlab
+    if (MATLAB_FOUND)
+        message("MATLAB FOUND: ${MATLAB_INCLUDE_DIR}, Matlab gt interface is being compiled.")
+        SET(CMAKE_DEBUG_POSTFIX)
+        include_directories( ${MATLAB_INCLUDE_DIR} )
+        set( matlab_files matlab/gtMatlabConverter.h
+                          matlab/gtMatlabConverterComplex.h )
+
+    else(MATLAB_FOUND)
+        message("MATLAB NOT FOUND ...")
+        set( matlab_files )
+    endif(MATLAB_FOUND)
+
+    add_library(gtplus ${LIBTYPE} GtPlusExport.h ${util_files} ${workflow_files} ${algorithm_files} ${solver_files} ${config_files} ${schema_files} ${matlab_files})
+
+    source_group(util FILES ${util_files})
+    source_group(workflow FILES ${workflow_files})
+
+    set( config_gtCloud_files ${config_files} ${gtCloud_files} )
+    source_group(config FILES ${config_gtCloud_files})
+
+    source_group(schema FILES ${schema_files})
+    source_group(algorithm FILES ${algorithm_files})
+    source_group(solver FILES ${solver_files})
+
+    if (MATLAB_FOUND)
+        source_group(matlab FILES ${matlab_files})
+    endif(MATLAB_FOUND)
+
+    target_link_libraries(gtplus cpucore cpucore_math ${MKL_LIBRARIES} ${EXTRA_MKL_LIBRARIES})
+
+    if (CUDA_FOUND)
+        target_link_libraries(gtplus gpuparallelmri)
+    endif (CUDA_FOUND)
+
+    install(TARGETS gtplus DESTINATION lib)
+
+    # install gtplus files
+    install (FILES  GtPlusExport.h
+                    util/gtPlusIOBase.h 
+                    util/gtPlusIOAnalyze.h 
+                    util/hoNDArrayMemoryManaged.h 
+                    util/hoNDArrayMemoryManaged.hxx 
+                    util/gtPlusMemoryManager.h 
+                    workflow/gtPlusISMRMRDReconWorkFlow.h
+                    workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
+                    workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
+                    workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
+                    workflow/gtPlusISMRMRDReconUtil.h
+                    workflow/gtPlusISMRMRDReconUtil.hxx
+                    workflow/gtPlusISMRMRDReconWorkOrder.h
+                    workflow/gtPlusISMRMRDReconWorker.h
+                    workflow/gtPlusISMRMRDReconWorker2DT.h
+                    workflow/gtPlusISMRMRDReconWorker3DT.h
+                    workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
+                    workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
+                    workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
+                    workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
+                    algorithm/gtPlusAlgorithmBase.h
+                    algorithm/gtPlusGRAPPA.h
+                    algorithm/gtPlusSPIRIT.h
+                    DESTINATION include)
+
+    # install gadgetron program files
+    install (FILES  ${config_files} 
+                    DESTINATION config)
+
+    install (FILES  ${gtCloud_files} 
+                    DESTINATION config/gtCloud)
+
+    install (FILES  ${schema_files} 
+                    DESTINATION schema)
+
+    if (MATLAB_FOUND)
+        add_subdirectory(matlab)
+    endif(MATLAB_FOUND)
+
+    if (GTEST_FOUND)
+        add_subdirectory(ut)
+    endif (GTEST_FOUND)
+
+endif ( HAS_64_BIT )
diff --git a/toolboxes/gtplus/GtPlusExport.h b/toolboxes/gtplus/GtPlusExport.h
new file mode 100644
index 0000000..423dcab
--- /dev/null
+++ b/toolboxes/gtplus/GtPlusExport.h
@@ -0,0 +1,20 @@
+/** \file       GtPlusExport.h
+    \brief      Implement windows export/import for GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#if defined (WIN32)
+    #ifdef BUILD_TOOLBOX_STATIC
+        #define EXPORTGTPLUS 
+    #else
+        #if defined (__BUILD_GADGETRON_PLUS__) || defined (gtplus_EXPORTS)
+            #define EXPORTGTPLUS __declspec(dllexport)
+        #else
+            #define EXPORTGTPLUS __declspec(dllimport)
+        #endif
+    #endif
+#else
+    #define EXPORTGTPLUS
+#endif
diff --git a/toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h b/toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h
new file mode 100644
index 0000000..b782a91
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusAlgorithmBase.h
@@ -0,0 +1,78 @@
+/** \file       gtPlusAlgorithmBase.h
+    \brief      Base class for GtPlus algorithm
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusMemoryManager.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+#ifdef USE_CUDA
+    #include "htgrappa.h"
+#endif // USE_CUDA
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusAlgorithmBase
+{
+public:
+    // Base class for gtPlus algorithms (e.g. gtPlusGRAPPA): bundles timers, a debug exporter, utility helpers and a memory manager.
+    gtPlusAlgorithmBase();
+    virtual ~gtPlusAlgorithmBase();
+    // writes a three-line description banner to os
+    virtual void printInfo(std::ostream& os);
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+    // timing switch, set to false by the constructor; NOTE(review): presumably read by derived classes before using the timers
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<T> gtPlus_util_complex_;
+
+    // memory manager; the constructor leaves this pointer empty
+    boost::shared_ptr<gtPlusMemoryManager> gtPlus_mem_manager_;
+};
+
+template <typename T> 
+gtPlusAlgorithmBase<T>::gtPlusAlgorithmBase() : performTiming_(false)
+{
+    // Same setting for all three timers; presumably stops them from reporting when destroyed.
+    Gadgetron::GadgetronTimer* const timers[] = { &gt_timer1_, &gt_timer2_, &gt_timer3_ };
+    for (int ii = 0; ii < 3; ++ii) timers[ii]->set_timing_in_destruction(false);
+}
+
+template <typename T> 
+gtPlusAlgorithmBase<T>::~gtPlusAlgorithmBase()
+{
+}
+
+template <typename T> 
+void gtPlusAlgorithmBase<T>::printInfo(std::ostream& os)
+{
+    // Fixed three-line banner describing this class.
+    // Every line is terminated (and flushed) with std::endl.
+    os << "-------------- GTPlus ISMRMRD Algorithm ------------------" << std::endl
+       << "Implementation of algorithms for ISMRMRD package" << std::endl
+       << "----------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusDataFidelityOperator.h b/toolboxes/gtplus/algorithm/gtPlusDataFidelityOperator.h
new file mode 100644
index 0000000..a8d9b83
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusDataFidelityOperator.h
@@ -0,0 +1,160 @@
+/** \file       gtPlusDataFidelityOperator.h
+    \brief      Implement data fidelity operator
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusDataFidelityOperator : public gtPlusOperator<T>
+{
+public:
+    // Data fidelity term: D multiplies by acquired_points_indicator_ and y is *acquired_points_ (see the member definitions below).
+    typedef gtPlusOperator<T> BaseClass;
+
+    gtPlusDataFidelityOperator();
+    virtual ~gtPlusDataFidelityOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    // D*x
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    // D'x
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // gradient of ||Dx-y||2
+    // 2*D'*(Dx-y)
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // objective value for ||Dx-y||2, computed as dotc(r, r) with r = Dx-y
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+    // re-export base-class members: members of a dependent base are not found by unqualified lookup
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+    // operator data inherited from gtPlusOperator<T>
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusDataFidelityOperator<T>::gtPlusDataFidelityOperator() : BaseClass()
+{
+    // nothing to set up beyond the base class
+}
+
+template <typename T> 
+gtPlusDataFidelityOperator<T>::~gtPlusDataFidelityOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) // y = D*x with D = acquired_points_indicator_; returns false on failure
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(acquired_points_indicator_, x, y)); // NOTE(review): presumably an element-wise product written into y - confirm
+    }
+    catch (...) // any exception is logged and reported as failure
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusDataFidelityOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) // y = D'x; the code performs the same multiply as forwardOperator
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(acquired_points_indicator_, x, y)); // NOTE(review): equals D'x only if the indicator is real-valued - confirm
+    }
+    catch (...) // any exception is logged and reported as failure
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusDataFidelityOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g) // gradient of the data fidelity term at x, written to g; returns false on failure
+{
+    try
+    {
+        // g = 2*(D*x - y) with y = *acquired_points_. NOTE(review): D' is not applied a second time; this matches 2D'*(Dx-y) only if D is a 0/1 mask and y is zero where D is zero - confirm
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(acquired_points_indicator_, x, g)); // g = D*x
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(g, *acquired_points_, g)); // g = g - y, in place; acquired_points_ is dereferenced without a null check
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(T(2.0), g)); // g = 2*g
+    }
+    catch (...) // any exception is logged and reported as failure
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusDataFidelityOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusDataFidelityOperator<T>::
+obj(const hoNDArray<T>& x, T& obj) // objective value of the data fidelity term at x, written to obj; returns false on failure
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(acquired_points_indicator_, x, kspace_)); // kspace_ (inherited scratch buffer) = D*x
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(kspace_, *acquired_points_, kspace_)); // kspace_ = D*x - y, in place; acquired_points_ is dereferenced without a null check
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::dotc(kspace_, kspace_, obj)); // NOTE(review): presumably the conjugated dot product, i.e. the squared L2 norm of the residual - confirm
+    }
+    catch (...) // any exception is logged and reported as failure
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusDataFidelityOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+void gtPlusDataFidelityOperator<T>::printInfo(std::ostream& os)
+{
+    // Fixed three-line banner describing this operator.
+    // Every line is terminated (and flushed) with std::endl.
+    os << "-------------- GTPlus ISMRMRD data fidelity operator -----------------------" << std::endl
+       << "Data fidelity operator for gtPlus ISMRMRD package" << std::endl
+       << "----------------------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusGRAPPA.h b/toolboxes/gtplus/algorithm/gtPlusGRAPPA.h
new file mode 100644
index 0000000..1bd4cdd
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusGRAPPA.h
@@ -0,0 +1,1035 @@
+
+/** \file   gtPlusGRAPPA.h
+    \brief  GRAPPA implementation for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+
+    References to the implementation can be found in:
+
+    Griswold MA, Jakob PM, Heidemann RM, Nittka M, Jellus V, Wang J, Kiefer B, Haase A. 
+    Generalized autocalibrating partially parallel acquisitions (GRAPPA). 
+    Magnetic Resonance in Medicine 2002;47(6):1202-1210.
+
+    Kellman P, Epstein FH, McVeigh ER. 
+    Adaptive sensitivity encoding incorporating temporal filtering (TSENSE). 
+    Magnetic Resonance in Medicine 2001;45(5):846-852.
+
+    Breuer FA, Kellman P, Griswold MA, Jakob PM. 
+    Dynamic autocalibrated parallel imaging using temporal GRAPPA (TGRAPPA). 
+    Magnetic Resonance in Medicine 2005;53(4):981-985.
+
+    Saybasili H, Kellman P, Griswold MA, Derbyshire JA, Guttman MA. 
+    HTGRAPPA: Real-time B1-weighted image domain TGRAPPA reconstruction. 
+    Magnetic Resonance in Medicine 2009;61(6):1425-1433. 
+*/
+
+#pragma once
+
+#include "gtPlusAlgorithmBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// GRAPPA kernel calibration and kernel conversion routines for the 2D and 3D cases.
+template <typename T> 
+class gtPlusGRAPPA : public gtPlusAlgorithmBase<T>
+{
+public:
+    typedef gtPlusAlgorithmBase<T> BaseClass;
+
+    gtPlusGRAPPA() : BaseClass(), calib_use_gpu_(true) {} // base class first, matching the actual initialization order
+    virtual ~gtPlusGRAPPA() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // get the kernel pattern (kE1: source line offsets, oE1: target line offsets), given the acceleration factor and kernel size
+    bool kerPattern(std::vector<int>& kE1, std::vector<int>& oE1, int accelFactor, size_t kNE1, bool fitItself);
+
+    // grappa calibration for 2D case
+    // acsSrc : [RO E1 srcCHA]
+    // acsDst : [RO E1 dstCHA]
+    // ker : [kRO kE1 srcCHA dstCHA oE1]
+    bool calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+            int kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, ho5DArray<T>& ker);
+
+    // image domain kernel for 2D kernel
+    // kIm: image domain kernel [RO E1 srcCHA dstCHA]
+    bool imageDomainKernel(const ho5DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, int ro, int e1, hoNDArray<T>& kIm);
+
+    // grappa calibration for 3D case
+    // acsSrc : [RO E1 E2 srcCHA]
+    // acsDst : [RO E1 E2 dstCHA]
+    // ker : [kRO kE1 kE2 srcCHA dstCHA oE1 oE2]
+    bool calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, double thres, double overDetermineRatio, 
+            int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, ho7DArray<T>& ker);
+
+    // image domain kernel for 3D kernel
+    // kIm: image domain kernel [RO E1 E2 srcCHA dstCHA]
+    bool imageDomainKernel3D(const ho7DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, int ro, int e1, int e2, hoNDArray<T>& kIm);
+
+    // convert the calibrated kernel to the convolution kernel in kspace
+    // if ROis3rdDim == true, the kernel dimension is [E1 E2 RO], otherwise [RO E1 E2]
+    bool kspaceDomainConvKernel3D(const ho7DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, ho5DArray<T>& convKerFlip, bool ROis3rdDim=true);
+
+    // image domain kernel for 3D kernel, only RO direction is converted to image domain
+    // E1 and E2 stays in the kspace domain
+    // kImRO: kspace-image hybrid kernel [convE1 convE2 RO srcCHA dstCHA]
+    bool imageDomainKernelRO3D(const ho7DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, int ro, hoNDArray<T>& kImRO);
+
+    // image domain kernel for 3D kernel, E1 and E2 directions are converted to image domain
+    // kImRO : kspace-image hybrid kernel where first two dimensions are E1 and E2 and in kspace
+    bool imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, int e1, int e2, hoNDArray<T>& kImE1E2RO);
+
+    // use gpu in the kernel calibration (checked by calib and calib3D in USE_CUDA builds)
+    bool calib_use_gpu_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+};
+
+/// Write a short description of the GRAPPA implementation and its references to os.
+template <typename T>
+void gtPlusGRAPPA<T>::printInfo(std::ostream& os)
+{
+    // page ranges use a plain ASCII hyphen so that the output does not depend on the source file encoding
+    os << "-------------- GTPlus ISMRMRD GRAPPA reconstruction ------------------" << std::endl;
+    os << "Implementation of GRAPPA algorithms for ISMRMRD package" << std::endl;
+    os << "Both 2D and 3D version are implemented" << std::endl;
+    os << "Algorithms are published at:" << std::endl;
+    os << "Generalized autocalibrating partially parallel acquisitions (GRAPPA), Magnetic Resonance in Medicine, Volume 47, Issue 6, pages 1202-1210, June 2002" << std::endl;
+    os << "HTGRAPPA: Real-time B1-weighted image domain TGRAPPA reconstruction, Magnetic Resonance in Medicine, Volume 61, Issue 6, pages 1425-1433, June 2009" << std::endl;
+    os << "----------------------------------------------------------------------" << std::endl;
+}
+
+/// Build the E1 sampling pattern of a GRAPPA kernel.
+/// All offsets are counted in E1 lines relative to the position the kernel is applied at.
+/// kE1 receives kNE1 source line offsets spaced by accelFactor; for an even kNE1 the extra
+/// line lies on the positive side (kNE1=4, accelFactor=2 gives -2 0 2 4).
+/// oE1 receives the target line offsets: 0..accelFactor-1 when fitItself is set, otherwise
+/// 1..accelFactor-1. For accelFactor == 1 both outputs hold the single offset 0.
+/// Returns false, leaving both outputs untouched, when accelFactor is smaller than 1.
+template <typename T>
+bool gtPlusGRAPPA<T>::
+kerPattern(std::vector<int>& kE1, std::vector<int>& oE1, int accelFactor, size_t kNE1, bool fitItself)
+{
+    if ( accelFactor < 1 )
+    {
+        GADGET_ERROR_MSG("gtPlusGRAPPA<T>::kerPattern(...) - accelFactor must be at least 1 ... ");
+        return false;
+    }
+
+    if ( accelFactor == 1 )
+    {
+        // assign() instead of resize(1, 0): resize keeps the old first element of a non-empty vector
+        kE1.assign(1, 0);
+        oE1.assign(1, 0);
+        return true;
+    }
+
+    // offset of the first kernel line in units of accelFactor; kept signed so that no
+    // negative value is mixed into an unsigned index
+    const int halfNE1 = static_cast<int>(kNE1/2);
+    const int firstLine = (kNE1%2 == 0) ? -(halfNE1-1) : -halfNE1;
+
+    kE1.assign(kNE1, 0);
+    for ( size_t n=0; n<kNE1; n++ )
+    {
+        kE1[n] = (firstLine + static_cast<int>(n))*accelFactor;
+    }
+
+    // fitting the acquired line itself adds offset 0 to the targets
+    const int firstTarget = fitItself ? 0 : 1;
+
+    oE1.clear();
+    oE1.reserve(accelFactor-firstTarget);
+    for ( int a=firstTarget; a<accelFactor; a++ )
+    {
+        oE1.push_back(a);
+    }
+
+    return true;
+}
+
+/// 2D GRAPPA calibration: assembles the linear system A x = B over the ACS data and solves it for the kernel.
+/// acsSrc : [RO E1 srcCHA], acsDst : [RO E1 dstCHA], ker : [kRO kE1 srcCHA dstCHA oE1]
+/// thres is passed on to the linear solver; an even kRO is widened to the next odd size.
+/// Returns false for inconsistent input, an ACS region too small for the kernel, or a failed solve.
+template <typename T>
+bool gtPlusGRAPPA<T>::
+calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+    int kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, ho5DArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)>=acsDst.get_size(2));
+        // kE1[0] and kE1[kNE1-1] are read below, so neither pattern may be empty
+        GADGET_CHECK_RETURN_FALSE(!kE1.empty());
+        GADGET_CHECK_RETURN_FALSE(!oE1.empty());
+        GADGET_CHECK_RETURN_FALSE(kRO>=0);
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t srcCHA = acsSrc.get_size(2);
+        size_t dstCHA = acsDst.get_size(2);
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusGRAPPA<T>::calib(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        size_t kNE1 = kE1.size();
+        size_t oNE1 = oE1.size();
+
+        // the ACS data must hold at least one full kernel footprint, otherwise the unsigned
+        // range computations below wrap around
+        GADGET_CHECK_RETURN_FALSE(RO>=(size_t)kRO);
+        GADGET_CHECK_RETURN_FALSE((long long)E1-1-kE1[kNE1-1]>=(long long)std::abs(kE1[0]));
+
+        // allocate kernel
+        GADGET_CHECK_RETURN_FALSE(ker.createArray(kRO, kNE1, srcCHA, dstCHA, oNE1));
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+        size_t eRO = RO - kROhalf -1;
+        size_t sE1 = std::abs(kE1[0]);
+        size_t eE1 = E1 -1 - kE1[kNE1-1];
+
+        // every source and target line addressed below has to lie inside [0, E1)
+        for ( size_t n=0; n<kNE1; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE((long long)sE1+kE1[n]>=0);
+            GADGET_CHECK_RETURN_FALSE((long long)eE1+kE1[n]<(long long)E1);
+        }
+        for ( size_t n=0; n<oNE1; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE((long long)sE1+oE1[n]>=0);
+            GADGET_CHECK_RETURN_FALSE((long long)eE1+oE1[n]<(long long)E1);
+        }
+
+        size_t lenRO = eRO-kROhalf+1;
+
+        size_t rowA = (eE1-sE1+1)*lenRO;
+        size_t colA = kRO*kNE1*srcCHA;
+        size_t colB = dstCHA*oNE1;
+
+        hoMatrix<T> A, B, x( colA, colB );
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 2D calibration - allocate matrix storage ... "));
+        hoNDArrayMemoryManaged<T> A_mem(rowA, colA, gtPlus_mem_manager_);
+        A.createMatrix( rowA, colA, A_mem.begin() );
+
+        hoNDArrayMemoryManaged<T> B_mem(rowA, colB, gtPlus_mem_manager_);
+        B.createMatrix( A.rows(), colB, B_mem.begin() );
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        int e1;
+        for ( e1=(int)sE1; e1<=(int)eE1; e1++ )
+        {
+            for ( int ro=kROhalf; ro<=(int)eRO; ro++ )
+            {
+                int rInd = (e1-sE1)*lenRO+ro-kROhalf;
+
+                size_t src, dst, ke1, oe1;
+                int kro;
+
+                // fill matrix A
+                size_t col = 0;
+                for ( src=0; src<srcCHA; src++ )
+                {
+                    for ( ke1=0; ke1<kNE1; ke1++ )
+                    {
+                        for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            A(rInd, col++) = acsSrc(ro+kro, e1+kE1[ke1], src);
+                        }
+                    }
+                }
+
+                // fill matrix B
+                col = 0;
+                for ( oe1=0; oe1<oNE1; oe1++ )
+                {
+                    for ( dst=0; dst<dstCHA; dst++ )
+                    {
+                        B(rInd, col++) = acsDst(ro, e1+oE1[oe1], dst);
+                    }
+                }
+            }
+        }
+
+        // in USE_CUDA builds single precision data is solved on the GPU when calib_use_gpu_ is set; a failed GPU solve falls back to the CPU
+        #ifdef USE_CUDA
+            // go to device
+            try
+            {
+                if ( typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_ )
+                {
+                    GADGET_MSG("grappa 2D - calling GPU kernel estimation ... ");
+                    hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+                    hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+                    int ret(0);
+                    boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+
+                    #pragma omp critical(inverse)
+                    {
+                        cuNDArray<float_complext> device_A(A_tmp);
+                        cuNDArray<float_complext> device_B(B_tmp);
+                        cuNDArray<float_complext> device_x;
+
+                        ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+                        if ( ret == 0 )
+                        {
+                            host_x = device_x.to_host();
+                        }
+                    }
+
+                    if ( ret != 0 )
+                    {
+                        GADGET_ERROR_MSG("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+                        GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+                    }
+                    else
+                    {
+                        memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+                    }
+                }
+                else
+                {
+                    GADGET_WARN_MSG("GPU inverse_clib_matrix for grappa is only available for single-precision, calling the CPU version ... ");
+                    GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+                }
+            }
+            catch(...)
+            {
+                GADGET_ERROR_MSG("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+                GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+            }
+
+        #else
+            GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        #endif // USE_CUDA
+
+        // x holds colA*colB elements, the same count as ker, so it is copied over unchanged
+        memcpy(ker.begin(), x.begin(), ker.get_number_of_bytes());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::calib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Convert a calibrated 2D kernel ker [kRO kE1 srcCHA dstCHA oE1] into the image domain kernel kIm [ro e1 srcCHA dstCHA].
+/// Returns false for empty patterns, for an E1 offset outside the convolution kernel, or when a step fails.
+template <typename T>
+bool gtPlusGRAPPA<T>::
+imageDomainKernel(const ho5DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& oE1, int ro, int e1, hoNDArray<T>& kIm)
+{
+    try
+    {
+        // kE1[0], kE1[kNE1-1] and oE1[0] are read below
+        GADGET_CHECK_RETURN_FALSE(!kE1.empty());
+        GADGET_CHECK_RETURN_FALSE(!oE1.empty());
+
+        int srcCHA = (int)(ker.get_size(2));
+        int dstCHA = (int)(ker.get_size(3));
+        int kNE1 = (int)(kE1.size());
+        int oNE1 = (int)(oE1.size());
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusGRAPPA<T>::imageDomainKernel(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        // allocate image domain kernel
+        kIm.create(ro, e1, srcCHA, dstCHA);
+
+        /// fill the convolution kernels
+        int convKRO = 2*kRO+3;
+
+        const int firstKE1 = std::abs(kE1[0]);
+        const int lastKE1 = std::abs(kE1[kNE1-1]);
+        const int maxKE1 = ( lastKE1 > firstKE1 ) ? lastKE1 : firstKE1;
+        int convKE1 = 2*maxKE1+1;
+
+        /// allocate the convolution kernel
+        ho4DArray<T> convKer(convKRO, convKE1, srcCHA, dstCHA);
+        Gadgetron::clear(&convKer);
+
+        /// fill the convolution kernel; the coefficients are flipped around the center (kRO+1, maxKE1)
+        for ( int oe1=0; oe1<oNE1; oe1++ )
+        {
+            for ( int ke1=0; ke1<kNE1; ke1++ )
+            {
+                // refuse an E1 position outside of the convolution kernel instead of writing past its extent
+                const int convE1 = oE1[oe1]-kE1[ke1]+maxKE1;
+                GADGET_CHECK_RETURN_FALSE(convE1>=0);
+                GADGET_CHECK_RETURN_FALSE(convE1<convKE1);
+
+                for ( int kro=-kROhalf; kro<=kROhalf; kro++ )
+                {
+                    for ( int dst=0; dst<dstCHA; dst++ )
+                        for ( int src=0; src<srcCHA; src++ )
+                            convKer(-kro+kRO+1, convE1, src, dst) = ker(kro+kROhalf, ke1, src, dst, oe1);
+                }
+            }
+        }
+
+        // offset 0 is not among the targets: keep the acquired lines through an identity at the kernel center
+        if ( (oE1[0]!=0) && (srcCHA==dstCHA) )
+        {
+            for ( int dst=0; dst<dstCHA; dst++ )
+                convKer(kRO+1, maxKE1, dst, dst) = 1.0;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)ro*(double)e1) ), convKer ));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad2D(convKer, ro, e1, kIm));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::imageDomainKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// 3D GRAPPA calibration: assembles the linear system A x = B over the ACS volume and solves it for the kernel.
+/// acsSrc : [RO E1 E2 srcCHA], acsDst : [RO E1 E2 dstCHA], ker : [kRO kE1 kE2 srcCHA dstCHA oE1 oE2]
+/// thres is passed on to the linear solver. For overDetermineRatio > 1 the RO range is cut down to at most about
+/// overDetermineRatio*colA equations around the peak RO position. Returns false for inconsistent input, a too small ACS volume or a failed solve.
+template <typename T>
+bool gtPlusGRAPPA<T>::
+calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, 
+        double thres, double overDetermineRatio, 
+        int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, 
+        const std::vector<int>& oE1, const std::vector<int>& oE2, 
+        ho7DArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        // dimension 2 is E2 here, not the channel count of the 2D case: acsDst is indexed with E2 positions derived from acsSrc
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)==acsDst.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(3)>=acsDst.get_size(3));
+
+        // the first and last entries of the kernel patterns are read below, so none of them may be empty
+        GADGET_CHECK_RETURN_FALSE(!kE1.empty());
+        GADGET_CHECK_RETURN_FALSE(!kE2.empty());
+        GADGET_CHECK_RETURN_FALSE(!oE1.empty());
+        GADGET_CHECK_RETURN_FALSE(!oE2.empty());
+        GADGET_CHECK_RETURN_FALSE(kRO>=0);
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t E2 = acsSrc.get_size(2);
+        size_t srcCHA = acsSrc.get_size(3);
+        size_t dstCHA = acsDst.get_size(3);
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusGRAPPA<T>::calib3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        size_t kNE1 = kE1.size();
+        size_t oNE1 = oE1.size();
+        size_t kNE2 = kE2.size();
+        size_t oNE2 = oE2.size();
+
+        // the ACS volume must hold at least one full kernel footprint, otherwise the unsigned
+        // range computations below wrap around
+        GADGET_CHECK_RETURN_FALSE(RO>=(size_t)kRO);
+        GADGET_CHECK_RETURN_FALSE((long long)E1-1-kE1[kNE1-1]>=(long long)std::abs(kE1[0]));
+        GADGET_CHECK_RETURN_FALSE((long long)E2-1-kE2[kNE2-1]>=(long long)std::abs(kE2[0]));
+
+        // allocate kernel
+        GADGET_CHECK_RETURN_FALSE(ker.createArray(kRO, kNE1, kNE2, srcCHA, dstCHA, oNE1, oNE2));
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+
+        size_t sRO = kROhalf;
+        size_t eRO = RO - kROhalf -1;
+        size_t sE1 = std::abs(kE1[0]);
+        size_t eE1 = E1 -1 - kE1[kNE1-1];
+        size_t sE2 = std::abs(kE2[0]);
+        size_t eE2 = E2 -1 - kE2[kNE2-1];
+
+        // every source and target line addressed below has to lie inside [0, E1) and [0, E2)
+        for ( size_t n=0; n<kNE1; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE((long long)sE1+kE1[n]>=0);
+            GADGET_CHECK_RETURN_FALSE((long long)eE1+kE1[n]<(long long)E1);
+        }
+        for ( size_t n=0; n<oNE1; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE((long long)sE1+oE1[n]>=0);
+            GADGET_CHECK_RETURN_FALSE((long long)eE1+oE1[n]<(long long)E1);
+        }
+        for ( size_t n=0; n<kNE2; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE((long long)sE2+kE2[n]>=0);
+            GADGET_CHECK_RETURN_FALSE((long long)eE2+kE2[n]<(long long)E2);
+        }
+        for ( size_t n=0; n<oNE2; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE((long long)sE2+oE2[n]>=0);
+            GADGET_CHECK_RETURN_FALSE((long long)eE2+oE2[n]<(long long)E2);
+        }
+
+        size_t lenRO = eRO-kROhalf+1;
+        size_t lenE1 = eE1-sE1+1;
+        size_t lenE2 = eE2-sE2+1;
+
+        size_t colA = kRO*kNE1*kNE2*srcCHA;
+        size_t colB = dstCHA*oNE1*oNE2;
+
+        if ( overDetermineRatio > 1.0 )
+        {
+            size_t maxRowA = std::ceil(overDetermineRatio*colA);
+            size_t maxROUsed = maxRowA/(lenE1*lenE2);
+            if ( maxROUsed < lenRO )
+            {
+                // find the peak signal of acsSrc
+                hoNDArray<T> acsSrc1stCha(RO, E1, E2, const_cast<T*>(acsSrc.begin()));
+                hoNDArray<T> acsSrc1stChaSumE2(RO, E1, 1), acsSrc1stChaSumE2E1(RO, 1, 1);
+
+                T maxSignal;
+                size_t roInd = 0;
+                const bool peakFound = Gadgetron::sumOver3rdDimension(acsSrc1stCha, acsSrc1stChaSumE2)
+                                    && Gadgetron::sumOver2ndDimension(acsSrc1stChaSumE2, acsSrc1stChaSumE2E1)
+                                    && Gadgetron::maxAbsolute(acsSrc1stChaSumE2E1, maxSignal, roInd);
+                if ( peakFound )
+                {
+                    // use at least one RO point and keep the window, centered on the peak where possible, inside the valid
+                    // range [sRO eRO]; unclamped, a peak near the edge wraps sRO around or runs the kernel out of the ACS data
+                    const size_t roUsed = ( maxROUsed < 1 ) ? 1 : maxROUsed;
+                    size_t startRO = ( roInd > sRO + roUsed/2 ) ? ( roInd - roUsed/2 ) : sRO;
+                    if ( startRO + roUsed - 1 > eRO )
+                    {
+                        startRO = eRO + 1 - roUsed;
+                    }
+                    sRO = startRO;
+                    eRO = sRO + roUsed - 1;
+                    lenRO = eRO-sRO+1;
+                    GADGET_MSG("gtPlusGRAPPA<T>::calib3D(...) - overDetermineRatio = " << overDetermineRatio << " ; RO data range used : [" << sRO << " " << eRO << "] ...");
+                }
+                else
+                {
+                    GADGET_WARN_MSG("gtPlusGRAPPA<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                }
+            }
+        }
+
+        size_t rowA = lenRO*lenE1*lenE2;
+
+        hoMatrix<T> A, B, x( colA, colB );
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - allocate matrix storage ... "));
+        hoNDArrayMemoryManaged<T> A_mem(rowA, colA, gtPlus_mem_manager_);
+        A.createMatrix( rowA, colA, A_mem.begin() );
+        T* pA = A.begin();
+
+        hoNDArrayMemoryManaged<T> B_mem(rowA, colB, gtPlus_mem_manager_);
+        B.createMatrix( rowA, colB, B_mem.begin() );
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        T* pB = B.begin();
+
+        int e2;
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - fill calib matrixes ... "));
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(e2) shared(sE2, eE2, sE1, eE1, kROhalf, sRO, eRO, lenRO, lenE1, srcCHA, kNE2, kNE1, A, rowA, pA, oNE2, oNE1, dstCHA, B, pB)
+        #else
+            #pragma omp parallel for default(none) private(e2) shared(sE2, eE2, sE1, eE1, kROhalf, sRO, eRO, lenRO, lenE1, srcCHA, kNE2, kNE1, A, rowA, pA, acsSrc, kE1, kE2, oNE2, oNE1, dstCHA, B, pB, acsDst, oE1, oE2)
+        #endif
+        for ( e2=(int)sE2; e2<=(int)eE2; e2++ )
+        {
+            int e1;
+            for ( e1=(int)sE1; e1<=(int)eE1; e1++ )
+            {
+                for ( int ro=(int)sRO; ro<=(int)eRO; ro++ )
+                {
+                    int rInd = (e2-sE2)*lenRO*lenE1 + (e1-sE1)*lenRO + ro-sRO;
+
+                    size_t src, dst, ke1, ke2, oe1, oe2;
+                    int kro;
+
+                    // fill matrix A
+                    size_t col = 0;
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        for ( ke2=0; ke2<kNE2; ke2++ )
+                        {
+                            for ( ke1=0; ke1<kNE1; ke1++ )
+                            {
+                                for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                                {
+                                    // A(rInd, col++) = acsSrc(ro+kro, e1+kE1[ke1], e2+kE2[ke2], src);
+                                    pA[rInd + col*rowA] = acsSrc(ro+kro, e1+kE1[ke1], e2+kE2[ke2], src);
+                                    col++;
+                                }
+                            }
+                        }
+                    }
+
+                    // fill matrix B
+                    col = 0;
+                    for ( oe2=0; oe2<oNE2; oe2++ )
+                    {
+                        for ( oe1=0; oe1<oNE1; oe1++ )
+                        {
+                            for ( dst=0; dst<dstCHA; dst++ )
+                            {
+                                // B(rInd, col++) = acsDst(ro, e1+oE1[oe1], e2+oE2[oe2], dst);
+                                pB[rInd + col*rowA] = acsDst(ro, e1+oE1[oe1], e2+oE2[oe2], dst);
+                                col++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - solve linear system ... "));
+        #ifdef USE_CUDA
+            // go to device
+            try
+            {
+                if ( typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_ )
+                {
+                    GADGET_MSG("grappa 3D - calling GPU kernel estimation ... ");
+                    hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+                    hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+                    int ret(0);
+                    boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+
+                    #pragma omp critical(inverse3D)
+                    {
+                        cuNDArray<float_complext> device_A(A_tmp);
+                        cuNDArray<float_complext> device_B(B_tmp);
+                        cuNDArray<float_complext> device_x;
+
+                        ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+                        if ( ret == 0 )
+                        {
+                            host_x = device_x.to_host();
+                        }
+                    }
+
+                    if ( ret != 0 )
+                    {
+                        GADGET_ERROR_MSG("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+                        GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+                    }
+                    else
+                    {
+                        memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+                    }
+                }
+                else
+                {
+                    GADGET_WARN_MSG("GPU inverse_clib_matrix for grappa is only available for single-precision, calling the CPU version ... ");
+                    GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+                }
+            }
+            catch(...)
+            {
+                GADGET_ERROR_MSG("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+                GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+            }
+        #else
+            GADGET_CHECK_RETURN_FALSE(SolveLinearSystem_Tikhonov(A, B, x, thres));
+        #endif // USE_CUDA
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        // x holds colA*colB elements, the same count as ker, so it is copied over unchanged
+        memcpy(ker.begin(), x.begin(), ker.get_number_of_bytes());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::calib3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Convert a calibrated 3D kernel ker [kRO kE1 kE2 srcCHA dstCHA oE1 oE2] into a kspace convolution kernel.
+/// convKer is created as [convE1 convE2 convRO srcCHA dstCHA] if ROis3rdDim is true, otherwise as [convRO convE1 convE2 srcCHA dstCHA].
+/// Returns false for empty patterns, for an offset outside of the convolution kernel, or when an exception is caught.
+template <typename T>
+bool gtPlusGRAPPA<T>::
+kspaceDomainConvKernel3D(const ho7DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, ho5DArray<T>& convKer, bool ROis3rdDim)
+{
+    try
+    {
+        // the first and last entries of the kernel patterns are read below, so none of them may be empty
+        GADGET_CHECK_RETURN_FALSE(!kE1.empty());
+        GADGET_CHECK_RETURN_FALSE(!kE2.empty());
+        GADGET_CHECK_RETURN_FALSE(!oE1.empty());
+        GADGET_CHECK_RETURN_FALSE(!oE2.empty());
+
+        int srcCHA = (int)(ker.get_size(3));
+        int dstCHA = (int)(ker.get_size(4));
+
+        int kNE1 = (int)(kE1.size());
+        int oNE1 = (int)(oE1.size());
+
+        int kNE2 = (int)(kE2.size());
+        int oNE2 = (int)(oE2.size());
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusGRAPPA<T>::kspaceDomainConvKernel3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - convert to conv kernel ... "));
+        /// fill the convolution kernels
+        int convKRO = 2*kRO+3;
+
+        int maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        int convKE1 = 2*maxKE1+1;
+
+        int maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        int convKE2 = 2*maxKE2+1;
+
+        /// allocate the convolution kernel
+        if ( ROis3rdDim )
+        {
+            convKer.createArray(convKE1, convKE2, convKRO, srcCHA, dstCHA);
+        }
+        else
+        {
+            convKer.createArray(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+        }
+        Gadgetron::clear(&convKer);
+
+        /// index
+        int oe1, oe2, kro, ke1, ke2, src, dst;
+
+        /// fill the convolution kernel; the coefficients are flipped around the kernel center
+        for ( oe2=0; oe2<oNE2; oe2++ )
+        {
+            for ( oe1=0; oe1<oNE1; oe1++ )
+            {
+                for ( ke2=0; ke2<kNE2; ke2++ )
+                {
+                    // refuse positions outside of the convolution kernel instead of writing past its extent
+                    const int convE2 = oE2[oe2]-kE2[ke2]+maxKE2;
+                    GADGET_CHECK_RETURN_FALSE(convE2>=0);
+                    GADGET_CHECK_RETURN_FALSE(convE2<convKE2);
+
+                    for ( ke1=0; ke1<kNE1; ke1++ )
+                    {
+                        const int convE1 = oE1[oe1]-kE1[ke1]+maxKE1;
+                        GADGET_CHECK_RETURN_FALSE(convE1>=0);
+                        GADGET_CHECK_RETURN_FALSE(convE1<convKE1);
+
+                        for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            const int convRO = -kro+kRO+1;
+                            for ( dst=0; dst<dstCHA; dst++ )
+                            {
+                                for ( src=0; src<srcCHA; src++ )
+                                {
+                                    if ( ROis3rdDim )
+                                    {
+                                        convKer(convE1, convE2, convRO, src, dst) = ker(kro+kROhalf, ke1, ke2, src, dst, oe1, oe2);
+                                    }
+                                    else
+                                    {
+                                        convKer(convRO, convE1, convE2, src, dst) = ker(kro+kROhalf, ke1, ke2, src, dst, oe1, oe2);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // offset 0 is not among the targets: keep the acquired lines through an identity at the kernel center
+        // NOTE(review): this needs both oE1[0] and oE2[0] to be non-zero - confirm that is intended when only one direction is accelerated
+        if ( (oE1[0]!=0) && (oE2[0]!=0) && (srcCHA==dstCHA) )
+        {
+            for ( dst=0; dst<dstCHA; dst++ )
+            {
+                for ( src=0; src<srcCHA; src++ )
+                {
+                    if ( ROis3rdDim )
+                    {
+                        convKer(maxKE1, maxKE2, kRO+1, src, dst) = (src==dst) ? 1.0 : 0.0;
+                    }
+                    else
+                    {
+                        convKer(kRO+1, maxKE1, maxKE2, src, dst) = (src==dst) ? 1.0 : 0.0;
+                    }
+                }
+            }
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::kspaceDomainConvKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // image domain 3D GRAPPA kernel: conv kernel -> SNR scaling -> zero-pad to [ro e1 e2] -> ifft3c into kIm; returns false on failure
+bool gtPlusGRAPPA<T>::
+imageDomainKernel3D(const ho7DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, int ro, int e1, int e2, hoNDArray<T>& kIm)
+{
+    try
+    {
+        int srcCHA = (int)(ker.get_size(3));
+        int dstCHA = (int)(ker.get_size(4));
+
+        int kNE1 = (int)(kE1.size()); // kNE1/oNE1/kNE2/oNE2 are only referenced by the disabled code further down
+        int oNE1 = (int)(oE1.size());
+
+        int kNE2 = (int)(kE2.size());
+        int oNE2 = (int)(oE2.size());
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusGRAPPA<T>::imageDomainKernel(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1; // an even kRO is bumped to the next odd value after the warning above
+
+        // allocate image domain kernel
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - create kIm array ... "));
+        if ( kIm.get_number_of_elements() < (size_t)ro*e1*e2*srcCHA*dstCHA ) // kIm is only (re)created when it holds too few elements
+        {
+            kIm.create(ro, e1, e2, srcCHA, dstCHA);
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        ho5DArray<T> convKer;
+        bool ROis3rdDim = false;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1, kE2, oE1, oE2, convKer, ROis3rdDim));
+
+        /*GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - convert to conv kernel ... "));
+        /// fill the convolution kernels
+        int convKRO = 2*kRO+3;
+
+        int maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        int convKE1 = 2*maxKE1+1;
+
+        int maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        int convKE2 = 2*maxKE2+1;
+
+        /// allocate the convolution kernel
+        ho5DArray<T> convKer(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+        Gadgetron::clear(&convKer);
+
+        /// index
+        int oe1, oe2, kro, ke1, ke2, src, dst;
+
+        /// fill the convolution kernel and sum up multiple kernels
+        for ( oe2=0; oe2<oNE2; oe2++ )
+        {
+            for ( oe1=0; oe1<oNE1; oe1++ )
+            {
+                for ( ke2=0; ke2<kNE2; ke2++ )
+                {
+                    for ( ke1=0; ke1<kNE1; ke1++ )
+                    {
+                        for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            for ( dst=0; dst<dstCHA; dst++ )
+                            {
+                                for ( src=0; src<srcCHA; src++ )
+                                {
+                                    convKer(-kro+kRO+1, oE1[oe1]-kE1[ke1]+maxKE1, oE2[oe2]-kE2[ke2]+maxKE2, src, dst) = ker(kro+kROhalf, ke1, ke2, src, dst, oe1, oe2);
+                                }
+                            }
+
+                        }
+                    }
+                }
+            }
+        }
+
+        if ( (oE1[0]!=0) && (oE2[0]!=0) && (srcCHA==dstCHA) )
+        {
+            for ( dst=0; dst<dstCHA; dst++ )
+            {
+                for ( src=0; src<srcCHA; src++ )
+                {
+                    if ( src == dst )
+                    {
+                        convKer(kRO+1, maxKE1, maxKE2, src, dst) = 1.0;
+                    }
+                    else
+                    {
+                        convKer(kRO+1, maxKE1, maxKE2, src, dst) = 0.0;
+                    }
+                }
+            }
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());*/
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - SNR unit scaling ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)ro*e1*e2) ), convKer )); // widen to double before multiplying so ro*e1*e2 cannot overflow int
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - zero padding ... "));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKer, ro, e1, e2, kIm));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - conver to image domain ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kIm));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::imageDomainKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // hybrid 3D GRAPPA kernel: only the first (RO) dimension is transformed with ifft1c, then the result is permuted into kImRO; returns false on failure
+bool gtPlusGRAPPA<T>::
+imageDomainKernelRO3D(const ho7DArray<T>& ker, int kRO, const std::vector<int>& kE1, const std::vector<int>& kE2, const std::vector<int>& oE1, const std::vector<int>& oE2, int ro, hoNDArray<T>& kImRO)
+{
+    try
+    {
+        int srcCHA = (int)(ker.get_size(3));
+        int dstCHA = (int)(ker.get_size(4));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0)); // the size arguments must agree with dimensions 0,1,2,5,6 of ker
+        GADGET_CHECK_RETURN_FALSE(kE1.size()==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2.size()==ker.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(oE1.size()==ker.get_size(5));
+        GADGET_CHECK_RETURN_FALSE(oE2.size()==ker.get_size(6));
+
+        bool ROat3rdDim = false;
+        ho5DArray<T> convKer;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1,  kE2, oE1, oE2, convKer, ROat3rdDim));
+
+        // allocate image domain kernel
+        size_t kConvE1 = convKer.get_size(1); // E1/E2 extents are taken from the convolution kernel and are not padded here
+        size_t kConvE2 = convKer.get_size(2);
+
+        kImRO.create(kConvE1, kConvE2, ro, srcCHA, dstCHA);
+
+        hoNDArray<T> kImROTemp(ro, kConvE1, kConvE2, srcCHA, dstCHA); // working buffer with RO as first dimension
+        Gadgetron::clear(kImROTemp);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - SNR unit scaling ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro)) ), convKer )); // scale by sqrt(ro) only
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKer, "convKer_scal_RO");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - zero padding only for RO ... "));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKer, ro, kConvE1, kConvE2, kImROTemp));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImROTemp, "convKer_scal_RO_zeropadded");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - conver to image domain only for RO ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kImROTemp));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - permute kernel dimensions to be [kE1 kE2 RO ...]  ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo3rdDimensionFor3DRecon(kImROTemp, kImRO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::imageDomainKernelRO3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // second stage of the hybrid kernel: scale a copy of kImRO, zero-pad its first two dimensions to e1 x e2 and apply ifft2c; result in kImE1E2RO
+bool gtPlusGRAPPA<T>::
+imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, int e1, int e2, hoNDArray<T>& kImE1E2RO)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = kImRO.get_dimensions();
+
+        std::vector<size_t> dimR(*dim); // output keeps the dimensions of kImRO except the first two
+        dimR[0] = e1;
+        dimR[1] = e2;
+
+        kImE1E2RO.create(&dimR);
+        Gadgetron::clear(kImE1E2RO);
+
+        hoNDArray<T> kImROScaled(kImRO); // work on a copy so the const input is left unscaled
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - SNR unit scaling for E1 and E2 ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(e1*e2)) ), kImROScaled ));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImROScaled, "kImROScaledE1E2");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - zero padding for E1 and E2 ... "));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(kImROScaled, e1, e2, dimR[2], kImE1E2RO)); // NOTE(review): dimR[2] assumes kImRO has at least three dimensions - confirm with callers
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImE1E2RO, "kImE1E2RO_zeropadded_E1E2");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration - conver to image domain for E1 and E2 ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kImE1E2RO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusGRAPPA<T>::imageDomainKernelE1E2RO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusOperator.h b/toolboxes/gtplus/algorithm/gtPlusOperator.h
new file mode 100644
index 0000000..b78d2ea
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusOperator.h
@@ -0,0 +1,238 @@
+/** \file       gtPlusOperator.h
+    \brief      Base class for gtPlus operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusMemoryManager.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusOperator // abstract base: forwardOperator, adjointOperator, grad and obj are pure virtual
+{
+public:
+
+    gtPlusOperator();
+    virtual ~gtPlusOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // adjoint - forward operator: applies forwardOperator to x, then adjointOperator to that result
+    virtual bool adjointforwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // compute gradient
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g) = 0;
+
+    // compute cost value
+    virtual bool obj(const hoNDArray<T>& x, T& obj) = 0;
+
+    // restore acquired kspace points to x (non-zero samples of 'acquired' overwrite the same positions of y)
+    virtual bool restoreAcquiredKSpace(const hoNDArray<T>& acquired, hoNDArray<T>& y);
+
+    // set the memory manager
+    void setMemoryManager(boost::shared_ptr<gtPlusMemoryManager>& memManager);
+
+    // set the acquired kspace, unacquired points are set to be zero
+    virtual bool setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace);
+
+    // set the coil sensitivity map
+    virtual bool setCoilSenMap(boost::shared_ptr< hoNDArray<T> >& senMap);
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_; // initialised to false by the constructor
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<T> gtPlus_util_complex_;
+
+protected:
+
+    // acquired kspace (unacquired points are zeros)
+    boost::shared_ptr< hoNDArray<T> > acquired_points_;
+    // acquired point indicator array, acquired points as 1, otherwise, 0
+    hoNDArray<T> acquired_points_indicator_;
+    // unacquired point indicator array (1 where the sample magnitude is below DBL_EPSILON, otherwise 0)
+    hoNDArray<T> unacquired_points_indicator_;
+
+    // coil map
+    boost::shared_ptr< hoNDArray<T> > coil_senMap_;
+
+    // memory manager
+    boost::shared_ptr<gtPlusMemoryManager> gtPlus_mem_manager_;
+
+    // helper memory
+    hoNDArray<T> kspace_;
+    hoNDArray<T> complexIm_;
+    hoNDArray<T> res_after_apply_kernel_;
+    hoNDArray<T> res_after_apply_kernel_sum_over_;
+
+    hoNDArrayMemoryManaged<T> kspace_Managed_; // memory-managed counterparts of the four helper arrays above
+    hoNDArrayMemoryManaged<T> complexIm_Managed_;
+    hoNDArrayMemoryManaged<T> res_after_apply_kernel_Managed_;
+    hoNDArrayMemoryManaged<T> res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusOperator<T>::gtPlusOperator() : performTiming_(false) // timing output is off by default
+{
+    gt_timer1_.set_timing_in_destruction(false); // the same setting is applied to all three timers
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+template <typename T> 
+gtPlusOperator<T>::~gtPlusOperator() // virtual destructor with an empty body
+{
+}
+
+template <typename T> // store memManager and, when it is non-empty, hand it to the four memory-managed helper arrays
+void gtPlusOperator<T>::setMemoryManager(boost::shared_ptr<gtPlusMemoryManager>& memManager)
+{
+    gtPlus_mem_manager_ = memManager; // previously never assigned, so the argument was silently ignored
+    if ( !gtPlus_mem_manager_ ) return; // an empty pointer leaves the managed arrays untouched
+
+    kspace_Managed_.setMemoryManager(gtPlus_mem_manager_);
+    complexIm_Managed_.setMemoryManager(gtPlus_mem_manager_);
+    res_after_apply_kernel_Managed_.setMemoryManager(gtPlus_mem_manager_);
+    res_after_apply_kernel_sum_over_Managed_.setMemoryManager(gtPlus_mem_manager_);
+}
+
+template <typename T> // write a fixed three-line description banner to os
+void gtPlusOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD operator ------------------" << endl;
+    os << "Operator for gtPlus ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> // y = adjointOperator(forwardOperator(x)); returns false as soon as either step fails
+bool gtPlusOperator<T>::adjointforwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    hoNDArray<T> a(x); // intermediate buffer, copy-constructed from x and then passed as the output of forwardOperator
+    GADGET_CHECK_RETURN_FALSE(this->forwardOperator(x, a));
+    GADGET_CHECK_RETURN_FALSE(this->adjointOperator(a, y));
+    return true;
+}
+
+template <typename T> // copy every non-zero sample of 'acquired' into the same position of y; other samples of y are left as they are
+bool gtPlusOperator<T>::restoreAcquiredKSpace(const hoNDArray<T>& acquired, hoNDArray<T>& y)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acquired.get_number_of_elements()==y.get_number_of_elements()); // element counts must match
+
+        size_t N = acquired.get_number_of_elements();
+
+        const T* pA = acquired.get_data_ptr();
+        T* pY = y.get_data_ptr();
+
+        long long n; // 64-bit signed index, as in setAcquiredPoints, so an N above INT_MAX is not truncated
+        #pragma omp parallel for default(none) private(n) shared(N, pA, pY)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            if ( std::abs(pA[n]) > 0 ) // a sample counts as acquired when its magnitude is non-zero
+            {
+                pY[n] = pA[n];
+            }
+        }
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusOperator<T>::restoreAcquiredKSpace(const hoNDArray<T>& acquired, hoNDArray<T>& y) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // keep the k-space pointer and build the acquired / unacquired indicator arrays from it; returns false if an exception is thrown
+bool gtPlusOperator<T>::
+setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace)
+{
+    try
+    {
+        acquired_points_ = kspace; // shares ownership; the data itself is not copied
+
+        acquired_points_indicator_.create(kspace->get_dimensions()); // NOTE(review): kspace is dereferenced without a null check - confirm callers never pass an empty pointer
+        Gadgetron::clear(acquired_points_indicator_);
+
+        unacquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(unacquired_points_indicator_);
+
+        size_t N = kspace->get_number_of_elements();
+
+        long long ii;
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(shared) private(ii) shared(N)
+        #else
+            #pragma omp parallel for default(shared) private(ii) shared(N, kspace)
+        #endif
+        for ( ii=0; ii<(long long)N; ii++ )
+        {
+            if ( std::abs( (*kspace)(ii) ) < DBL_EPSILON ) // magnitude below DBL_EPSILON is treated as "not acquired"
+            {
+                unacquired_points_indicator_(ii) = T(1.0);
+            }
+            else
+            {
+                acquired_points_indicator_(ii) = T(1.0);
+            }
+        }
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusOperator<T>::setAcquiredPoints(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // store the shared pointer to the coil sensitivity map; returns false only if the assignment throws
+bool gtPlusOperator<T>::
+setCoilSenMap(boost::shared_ptr< hoNDArray<T> >& senMap)
+{
+    try
+    {
+        coil_senMap_ = senMap; // shares ownership; the map itself is not copied
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusOperator<T>::setCoilSenMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT.h
new file mode 100644
index 0000000..def68fc
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT.h
@@ -0,0 +1,1048 @@
+
+/** \file   gtPlusSPIRIT.h
+    \brief  SPIRIT kernel estimation for 2D and 3D MRI parallel imaging
+    \author Hui Xue
+
+    References to the implementation can be found in:
+
+    Lustig M, Pauly JM. 
+    SPIRiT: Iterative self-consistent parallel imaging reconstruction from arbitrary k-space. 
+    Magnetic Resonance in Medicine 2010;64(2):457-471.
+
+    ISMRM 2013 sunrise course on Parallel Imaging
+    Michael S. Hansen, Philip Beatty
+    http://gadgetron.sourceforge.net/sunrise/
+    http://cds.ismrm.org/protected/13MPresentations/abstracts/7059.pdf
+*/
+
+#pragma once
+
+#include "gtPlusAlgorithmBase.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT : public gtPlusAlgorithmBase<T> // SPIRIT kernel calibration and kernel conversion for 2D and 3D
+{
+public:
+
+    typedef gtPlusAlgorithmBase<T> BaseClass;
+
+    typedef typename realType<T>::Type ValueType;
+
+    gtPlusSPIRIT() : BaseClass(), calib_use_gpu_(true) {} // base listed first: matches the real initialization order (avoids -Wreorder)
+    virtual ~gtPlusSPIRIT() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // SPIRIT calibration for 2D case
+    // acsSrc : [RO E1 srcCHA]
+    // acsDst : [RO E1 dstCHA]
+    // ker : [kRO kE1 srcCHA dstCHA 1 1]
+    bool calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, 
+            int kRO, int kE1, int a, int b, ho6DArray<T>& ker);
+
+    // image domain kernel for 2D kernel
+    // kIm: image domain kernel [RO E1 srcCHA dstCHA]
+    // if minusI==true, compute image domain G-I kernel
+    bool imageDomainKernel(const ho6DArray<T>& ker, int kRO, int kE1, int a, int b, int ro, int e1, hoNDArray<T>& kIm, bool minusI=false);
+
+    // SPIRIT calibration for 3D case
+    // acsSrc : [RO E1 E2 srcCHA]
+    // acsDst : [RO E1 E2 dstCHA]
+    // ker : [kRO kE1 kE2 srcCHA dstCHA 1 1 1]
+    // overDetermineRatio : over determine ratio of calib matrix, if < 1, all data are used
+    bool calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, double thres, double overDetermineRatio, 
+            int kRO, int kE1, int kE2, int a, int b, int c, hoNDArray<T>& ker);
+
+    // convert the calibrated kernel to the convolution kernel in kspace
+    // if ROis3rdDim == true, the kernel dimension is [E1 E2 RO], otherwise [RO E1 E2]
+    bool kspaceDomainConvKernel3D(const hoNDArray<T>& ker, int kRO, int kE1, int kE2, int a, int b, int c, ho5DArray<T>& convKerFlip, bool minusI=true, bool ROis3rdDim=true);
+
+    // image domain kernel for 3D kernel
+    // kIm: image domain kernel [E1 E2 RO srcCHA dstCHA]
+    // if minusI==true, compute image domain G-I kernel
+    bool imageDomainKernel3D(const hoNDArray<T>& ker, int kRO, int kE1, int kE2, 
+        int a, int b, int c, int ro, int e1, int e2, hoNDArray<T>& kIm, bool minusI=false);
+
+    // image domain kernel for 3D kernel, only RO direction is converted to image domain
+    // E1 and E2 stays in the kspace domain
+    // kImRO: kspace-image hybrid kernel [convE1 convE2 RO srcCHA dstCHA]
+    bool imageDomainKernelRO3D(const hoNDArray<T>& ker, int kRO, int kE1, int kE2, 
+        int a, int b, int c, int ro, hoNDArray<T>& kImRO, bool minusI=false);
+
+    // image domain kernel for 3D kernel, E1 and E2 directions are converted to image domain
+    // kImRO : kspace-image hybrid kernel where first two dimensions are E1 and E2 and in kspace
+    bool imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, int e1, int e2, hoNDArray<T>& kImE1E2RO);
+
+    // compute the image domain adjoint kernel
+    bool imageDomainAdjointKernel(const hoNDArray<T>& kIm, hoNDArray<T>& adjkIm);
+
+    // compute the (G-I)'*(G-I)
+    bool AdjointForwardKernel(const hoNDArray<T>& kImS2D, const hoNDArray<T>& kImD2S, hoNDArray<T>& kIm);
+
+    bool calib_use_gpu_; // request the GPU solver in calibration (only taken for single-precision T in a USE_CUDA build)
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+};
+
+template <typename T> // write a fixed description banner, including the SPIRiT literature reference, to os
+void gtPlusSPIRIT<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT reconstruction ------------------" << endl;
+    os << "Implementation of SPIRIT algorithms for ISMRMRD package" << endl;
+    os << "Both 2D and 3D version are implemented" << endl;
+    os << "Algorithms are published at:" << endl;
+    os << "Lustig, M. and Pauly, J. M. (2010), SPIRiT: Iterative self-consistent parallel imaging reconstruction from arbitrary k-space. Magn Reson Med, 64: 457-471. doi: 10.1002/mrm.22428" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> // 2D SPIRIT calibration: assemble A x = B from the ACS data, solve with Tikhonov regularisation (GPU or CPU) and unpack x into ker
+bool gtPlusSPIRIT<T>::
+calib(const ho3DArray<T>& acsSrc, const ho3DArray<T>& acsDst, double thres, int kRO, int kE1, int a, int b, ho6DArray<T>& ker)
+{ // note: parameters a and b are not used anywhere in this body
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)>=acsDst.get_size(2)); // at least as many source channels as destination channels
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t srcCHA = acsSrc.get_size(2);
+        size_t dstCHA = acsDst.get_size(2);
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1; // an even kernel size is bumped to the next odd value after the warning
+
+        int kE1half = kE1/2;
+        if ( 2*kE1half == kE1 )
+        {
+            GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib(...) - 2*kE1half == kE1 " << kE1);
+        }
+        kE1 = 2*kE1half + 1;
+
+        // allocate kernel
+        GADGET_CHECK_RETURN_FALSE(ker.createArray(kRO, kE1, srcCHA, dstCHA, 1, 1));
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+
+        size_t sRO = kROhalf; // first and last positions where the whole kernel footprint stays inside the ACS region
+        size_t eRO = RO - kROhalf -1; // NOTE(review): unsigned arithmetic, wraps around if RO <= kROhalf - no explicit size check here
+
+        size_t sE1 = kE1half;
+        size_t eE1 = E1 - kE1half -1;
+
+        size_t lenRO = eRO-sRO+1;
+        size_t lenE1 = eE1-sE1+1;
+
+        size_t rowA = lenE1*lenRO; // one equation per kernel position
+        size_t colA = (kRO*kE1-1)*srcCHA; // every kernel point except the centre, for each source channel
+        size_t colB = dstCHA;
+
+        bool useGPU = (typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_);
+        if ( useGPU )
+        {
+            GADGET_MSG("spirit 2D - calling GPU kernel estimation ... "); 
+        }
+
+        hoMatrix<T> A(rowA, colA);
+        T* pA = A.begin();
+
+        hoMatrix<T> B(rowA, colB);
+        T* pB = B.begin();
+
+        hoMatrix<T> x( A.cols(), B.cols() );
+
+        int dRO, dE1;
+
+        for ( int e1=(int)sE1; e1<=(int)eE1; e1++ )
+        {
+            dE1 = e1;
+
+            for ( int ro=sRO; ro<=(int)eRO; ro++ )
+            {
+                dRO = ro;
+
+                int rInd = (e1-sE1)*lenRO+ro-sRO; // row index of this kernel position, RO varying fastest
+
+                // fill matrix A
+                size_t col = 0;
+                for ( size_t src=0; src<srcCHA; src++ )
+                {
+                    for ( int ke1=-kE1half; ke1<=kE1half; ke1++ )
+                    {
+                        for ( int kro=-kROhalf; kro<=kROhalf; kro++ )
+                        {
+                            if ( kro!=0 || ke1!=0 ) // the centre point is excluded from the source neighbourhood
+                            {
+                                //A(rInd, col++) = acsSrc(ro+kro, e1+ke1, src);
+                                pA[rInd + col*rowA] = acsSrc(ro+kro, e1+ke1, src); // raw-pointer form of the access above, column stride rowA
+                                col++;
+                            }
+                        }
+                    }
+                }
+
+                // fill matrix B
+                for ( size_t dst=0; dst<dstCHA; dst++ )
+                {
+                    //B(rInd, dst) = acsDst(dRO, dE1, dst);
+                    pB[rInd+dst*rowA] = acsDst(dRO, dE1, dst);
+                }
+            }
+        }
+
+        #ifdef USE_CUDA
+            // go to device
+            try
+            {
+                if ( useGPU )
+                {
+                    hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin())); // built from the existing buffers of A and B, reinterpreted as float_complext
+                    hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+                    int ret(0);
+                    boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+
+                    #pragma omp critical(inverse_spirit)
+                    {
+                        cuNDArray<float_complext> device_A(A_tmp);
+                        cuNDArray<float_complext> device_B(B_tmp);
+                        cuNDArray<float_complext> device_x;
+
+                        ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+                        if ( ret == 0 )
+                        {
+                            host_x = device_x.to_host();
+                        }
+                    }
+
+                    if ( ret != 0 ) // GPU solve reported failure: fall back to the CPU solver
+                    {
+                        GADGET_ERROR_MSG("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+                        SolveLinearSystem_Tikhonov(A, B, x, thres);
+                    }
+                    else
+                    {
+                        memcpy(x.begin(), host_x->begin(), host_x->get_number_of_bytes());
+                    }
+                }
+                else // also reached when calib_use_gpu_ is false, although the message only mentions precision
+                {
+                    GADGET_WARN_MSG("GPU inverse_clib_matrix is only available for single-precision, calling the CPU version ... ");
+                    SolveLinearSystem_Tikhonov(A, B, x, thres);
+                }
+            }
+            catch(...) // any exception on the GPU path also falls back to the CPU solver
+            {
+                GADGET_ERROR_MSG("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+                SolveLinearSystem_Tikhonov(A, B, x, thres);
+            }
+        #else
+            SolveLinearSystem_Tikhonov(A, B, x, thres);
+        #endif // USE_CUDA
+
+        int ind(0); // row of x; advanced in the same src/ke1/kro order used for the columns of A
+        for ( size_t src=0; src<srcCHA; src++ )
+        {
+            for ( int ke1=-kE1half; ke1<=kE1half; ke1++ ) 
+            {
+                for ( int kro=-kROhalf; kro<=kROhalf; kro++ ) 
+                {
+                    if ( kro!=0 || ke1!=0 )
+                    {
+                        for ( size_t dst=0; dst<dstCHA; dst++ )
+                        {
+                            ker(kro+kROhalf, ke1+kE1half, src, dst, 0, 0) = x(ind, dst);
+                        }
+                        ind++;
+                    }
+                    else // centre point: not part of the solved system, stored as zero
+                    {
+                        for ( size_t dst=0; dst<dstCHA; dst++ )
+                        {
+                            ker(kro+kROhalf, ke1+kE1half, src, dst, 0, 0) = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::calib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // 2D SPIRIT image domain kernel: embed ker in a (2*kRO-1) x (2*kE1-1) grid, flip it, optionally subtract I, scale, zero-pad to [ro e1] and ifft2c into kIm
+bool gtPlusSPIRIT<T>::
+imageDomainKernel(const ho6DArray<T>& ker, int kRO, int kE1, int a, int b, int ro, int e1, hoNDArray<T>& kIm, bool minusI)
+{ // note: parameters a and b are not used anywhere in this body
+    try
+    {
+        int srcCHA = (int)(ker.get_size(2));
+        int dstCHA = (int)(ker.get_size(3));
+
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0)); // kRO and kE1 must agree with the first two dimensions of ker
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+
+        int kROhalf = kRO/2;
+        int kE1half = kE1/2;
+
+        // allocate image domain kernel
+        kIm.create(ro, e1, srcCHA, dstCHA);
+
+        /// fill the convolution kernels
+        int convKRO = 2*kRO-1;
+        int convKE1 = 2*kE1-1;
+
+        /// fill in convolution kernel
+        ho6DArray<T> convKer(convKRO, convKE1, srcCHA, dstCHA, 1, 1);
+        Gadgetron::clear(&convKer);
+
+        int kro, ke1, src, dst;
+        for ( ke1=-kE1half; ke1<=kE1half; ke1++ )
+        {
+            for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+            {
+                int iro = kro + kRO -1; // offset 0 lands on index kRO-1, the middle of the convKRO-long axis
+                int ie1 = ke1 + kE1 -1;
+
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        convKer(iro, ie1, src, dst, 0, 0) = ker(kro+kROhalf, ke1+kE1half, src, dst, 0, 0);
+                    }
+                }
+            }
+        }
+
+        hoNDArray<T> convKer2;
+        ho4DArray<T> conKerMean(convKRO, convKE1, srcCHA, dstCHA);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer, convKer2)); // the two trailing dimensions of convKer have size 1; they are summed over in two steps
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer2, conKerMean));
+
+        // flip the kernel
+        ho4DArray<T> convKerFlip(convKRO, convKE1, srcCHA, dstCHA);
+        Gadgetron::clear(&convKerFlip);
+        for ( ke1=0; ke1<convKE1; ke1++ )
+        {
+            for ( kro=0; kro<convKRO; kro++ )
+            {
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    for ( src=0; src<srcCHA; src++ )
+                    {
+                        convKerFlip( kro, ke1, src, dst) = conKerMean(convKRO-1-kro, convKE1-1-ke1, src, dst); // mirror both axes about the centre
+                    }
+                }
+            }
+        }
+
+        // minus I
+        if ( minusI )
+        {
+            for ( dst=0; dst<dstCHA; dst++ )
+            {
+                T value = convKerFlip(kRO -1, kE1 -1, dst, dst); // centre point of the src==dst entries
+                convKerFlip(kRO -1, kE1 -1, dst, dst) = value - T(1.0);
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro*e1)) ), convKerFlip )); // scale by sqrt(ro*e1) before the inverse FFT
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad2D(convKerFlip, ro, e1, kIm));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kIm));
+    }
+    catch(...) // any exception is logged and reported as a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::imageDomainKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+calib3D(const ho4DArray<T>& acsSrc, const ho4DArray<T>& acsDst, double thres, double overDetermineRatio, 
+            int kRO, int kE1, int kE2, int a, int b, int c, hoNDArray<T>& ker)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(0)==acsDst.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(1)==acsDst.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(2)==acsDst.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(acsSrc.get_size(3)>=acsDst.get_size(3));
+
+        size_t RO = acsSrc.get_size(0);
+        size_t E1 = acsSrc.get_size(1);
+        size_t E2 = acsSrc.get_size(2);
+        size_t srcCHA = acsSrc.get_size(3);
+        size_t dstCHA = acsDst.get_size(3);
+
+        int kROhalf = kRO/2;
+        if ( 2*kROhalf == kRO )
+        {
+            GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib3D(...) - 2*kROhalf == kRO " << kRO);
+        }
+        kRO = 2*kROhalf + 1;
+
+        int kE1half = kE1/2;
+        if ( 2*kE1half == kE1 )
+        {
+            GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib3D(...) - 2*kE1half == kE1 " << kE1);
+        }
+        kE1 = 2*kE1half + 1;
+
+        int kE2half = kE2/2;
+        if ( 2*kE2half == kE2 )
+        {
+            GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib3D(...) - 2*kE2half == kE2 " << kE2);
+        }
+        kE2 = 2*kE2half + 1;
+
+        // allocate kernel
+        ker.create(kRO, kE1, kE2, srcCHA, dstCHA, 1, 1, 1);
+
+        // loop over the calibration region and assemble the equation
+        // Ax = b
+
+        size_t sRO = kROhalf;
+        size_t eRO = RO - kROhalf -1;
+        size_t lenRO = eRO-sRO+1;
+
+        size_t sE1 = kE1half;
+        size_t eE1 = E1 - kE1half -1;
+        size_t lenE1 = eE1-sE1+1;
+
+        size_t sE2 = kE2half;
+        size_t eE2 = E2 - kE2half -1;
+        size_t lenE2 = eE2-sE2+1;
+
+        size_t colA = (kRO*kE1*kE2-1)*srcCHA;
+        if ( overDetermineRatio > 1.0 )
+        {
+            size_t maxRowA = std::ceil(overDetermineRatio*colA);
+            size_t maxROUsed = maxRowA/(lenE1*lenE2);
+            if ( maxROUsed < lenRO )
+            {
+                // find the peak signal of acsSrc
+                hoNDArray<T> acsSrc1stCha(RO, E1, E2, const_cast<T*>(acsSrc.begin()));
+                hoNDArray<T> acsSrc1stChaSumE2(RO, E1, 1), acsSrc1stChaSumE2E1(RO, 1, 1);
+
+                if ( Gadgetron::sumOver3rdDimension(acsSrc1stCha, acsSrc1stChaSumE2) )
+                {
+                    if ( Gadgetron::sumOver2ndDimension(acsSrc1stChaSumE2, acsSrc1stChaSumE2E1) )
+                    {
+                        T maxSignal;
+                        size_t roInd;
+                        if ( Gadgetron::maxAbsolute(acsSrc1stChaSumE2E1, maxSignal, roInd) )
+                        {
+                            sRO = roInd - maxROUsed/2;
+                            eRO = sRO + maxROUsed - 1;
+                            lenRO = eRO-sRO+1;
+                            GADGET_MSG("gtPlusSPIRIT<T>::calib3D(...) - overDetermineRatio = " << overDetermineRatio << " ; RO data range used : [" << sRO << " " << eRO << "] ...");
+                        }
+                        else
+                        {
+                            GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                        }
+                    }
+                }
+                else
+                {
+                    GADGET_WARN_MSG("gtPlusSPIRIT<T>::calib3D(...) - overDetermineRatio is ignored ... ");
+                }
+            }
+        }
+
+        size_t rowA = lenRO*lenE1*lenE2;
+        size_t colB = dstCHA;
+
+        bool useGPU = (typeid(typename realType<T>::Type)==typeid(float) && calib_use_gpu_);
+        if ( useGPU )
+        {
+            GADGET_MSG("spirit 3D - calling GPU kernel estimation ... ");
+        }
+
+        hoMatrix<T> A(rowA, colA);
+        T* pA = A.begin();
+
+        hoMatrix<T> B(rowA, colB);
+        T* pB = B.begin();
+
+        hoMatrix<T> x( A.cols(), B.cols() );
+
+        int dRO, dE1, dE2;
+
+        for ( int e2=(int)sE2; e2<=(int)eE2; e2++ )
+        {
+            dE2 = e2;
+
+            for ( int e1=(int)sE1; e1<=(int)eE1; e1++ )
+            {
+                dE1 = e1;
+
+                for ( int ro=sRO; ro<=(int)eRO; ro++ )
+                {
+                    dRO = ro;
+
+                    int rInd = (e2-sE2)*lenRO*lenE1 + (e1-sE1)*lenRO + ro-sRO;
+
+                    // fill matrix A
+                    size_t col = 0;
+                    for ( size_t src=0; src<srcCHA; src++ )
+                    {
+                        for ( int ke2=-kE2half; ke2<=kE2half; ke2++ )
+                        {
+                            for ( int ke1=-kE1half; ke1<=kE1half; ke1++ )
+                            {
+                                for ( int kro=-kROhalf; kro<=kROhalf; kro++ )
+                                {
+                                    if ( kro!=0 || ke1!=0 || ke2!=0 )
+                                    {
+                                        //A(rInd, col++) = acsSrc(ro+kro, e1+ke1, e2+ke2, src);
+                                        pA[rInd+col*rowA] = acsSrc(ro+kro, e1+ke1, e2+ke2, src);
+                                        col++;
+                                    }
+                                }
+                            }
+                        }
+                    }
+
+                    // fill matrix B
+                    for ( size_t dst=0; dst<dstCHA; dst++ )
+                    {
+                        //B(rInd, dst) = acsDst(dRO, dE1, dE2, dst);
+                        pB[rInd+dst*rowA] = acsDst(dRO, dE1, dE2, dst);
+                    }
+                }
+            }
+        }
+
+        #ifdef USE_CUDA
+            // go to device
+            try
+            {
+                if ( useGPU )
+                {
+                    hoNDArray<float_complext> A_tmp(A.get_dimensions(), reinterpret_cast<float_complext*>(A.begin()));
+                    hoNDArray<float_complext> B_tmp(B.get_dimensions(), reinterpret_cast<float_complext*>(B.begin()));
+
+                    int ret(0);
+                    boost::shared_ptr< hoNDArray<complext<float> > > host_x;
+                    #pragma omp critical(inverse_spirit3D)
+                    {
+                        cuNDArray<float_complext> device_A(A_tmp);
+                        cuNDArray<float_complext> device_B(B_tmp);
+                        cuNDArray<float_complext> device_x;
+
+                        ret = Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres);
+                        if ( ret == 0 )
+                        {
+                            host_x = device_x.to_host();
+                        }
+                    }
+
+                    if ( ret != 0 )
+                    {
+                        GADGET_ERROR_MSG("failed in Gadgetron::inverse_clib_matrix(&device_A, &device_B, &device_x, thres) ... ");
+                        SolveLinearSystem_Tikhonov(A, B, x, thres);
+                    }
+                    else
+                    {
+                        memcpy(x.begin(), host_x->begin(), x.get_number_of_bytes());
+                    }
+                }
+                else
+                {
+                    GADGET_WARN_MSG("GPU inverse_clib_matrix is only available for single-precision, calling the CPU version ... ");
+                    SolveLinearSystem_Tikhonov(A, B, x, thres);
+                }
+            }
+            catch(...)
+            {
+                GADGET_ERROR_MSG("failed in GPU inverse_clib_matrix for grappa, calling the CPU version ... ");
+                SolveLinearSystem_Tikhonov(A, B, x, thres);
+            }
+        #else
+            SolveLinearSystem_Tikhonov(A, B, x, thres);
+        #endif // USE_CUDA
+
+        int ind(0);
+
+        std::vector<size_t> kerInd(8);
+        kerInd[7] = 0;
+        kerInd[6] = 0;
+        kerInd[5] = 0;
+
+        for ( size_t src=0; src<srcCHA; src++ )
+        {
+            kerInd[3] = src;
+            for ( int ke2=-kE2half; ke2<=kE2half; ke2++ ) 
+            {
+                kerInd[2] = ke2+kE2half;
+                for ( int ke1=-kE1half; ke1<=kE1half; ke1++ ) 
+                {
+                    kerInd[1] = ke1+kE1half;
+                    for ( int kro=-kROhalf; kro<=kROhalf; kro++ ) 
+                    {
+                        kerInd[0] = kro+kROhalf;
+
+                        if ( kro!=0 || ke1!=0 || ke2!=0 )
+                        {
+                            for ( size_t dst=0; dst<dstCHA; dst++ )
+                            {
+                                kerInd[4] = dst;
+                                size_t offset = ker.calculate_offset(kerInd);
+                                ker(offset) = x(ind, dst);
+                            }
+                            ind++;
+                        }
+                        else
+                        {
+                            for ( size_t dst=0; dst<dstCHA; dst++ )
+                            {
+                                kerInd[4] = dst;
+                                size_t offset = ker.calculate_offset(kerInd);
+                                ker(offset) = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::calib3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Builds the flipped k-space convolution kernel convKerFlip (2*k-1 samples per kernel axis) from the SPIRIT kernel ker; a, b and c are not used in this function. Returns false if a check fails or an exception is thrown.
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+kspaceDomainConvKernel3D(const hoNDArray<T>& ker, int kRO, int kE1, int kE2, int a, int b, int c, ho5DArray<T>& convKerFlip, bool minusI, bool ROis3rdDim)
+{
+    try
+    {
+        int srcCHA = (int)(ker.get_size(3));
+        int dstCHA = (int)(ker.get_size(4));
+        // the kernel extents given by the caller must equal the first three dimensions of ker
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2==ker.get_size(2));
+        // NOTE(review): the loops below visit 2*half+1 samples per axis, so kRO/kE1/kE2 look like they must be odd (calib3D forces this) - confirm for other callers
+        int kROhalf = kRO/2;
+        int kE1half = kE1/2;
+        int kE2half = kE2/2;
+
+        /// every axis of the convolution kernel holds 2*k-1 samples
+        int convKRO = 2*kRO-1;
+        int convKE1 = 2*kE1-1;
+        int convKE2 = 2*kE2-1;
+
+        /// copy ker into the freshly cleared convolution kernel, placing kernel offset 0 at index k-1 of every axis
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - convert to conv kernel ... "));
+
+        hoNDArray<T> convKer(convKRO, convKE1, convKE2, srcCHA, dstCHA, 1, 1, 1);
+        Gadgetron::clear(&convKer);
+
+        int kro, ke1, ke2, src, dst;
+        std::vector<size_t> kerInd(8), convKerInd(8);
+        // entries 5..7 of both index vectors are fixed to 0
+        kerInd[7] = 0;
+        convKerInd[7] = 0;
+
+        kerInd[6] = 0;
+        convKerInd[6] = 0;
+
+        kerInd[5] = 0;
+        convKerInd[5] = 0;
+
+        for ( ke2=-kE2half; ke2<=kE2half; ke2++ )
+        {
+            kerInd[2] = ke2+kE2half;
+
+            for ( ke1=-kE1half; ke1<=kE1half; ke1++ )
+            {
+                kerInd[1] = ke1+kE1half;
+
+                for ( kro=-kROhalf; kro<=kROhalf; kro++ )
+                {
+                    int iro = kro + kRO -1;
+                    int ie1 = ke1 + kE1 -1;
+                    int ie2 = ke2 + kE2 -1;
+
+                    kerInd[0] = kro+kROhalf;
+
+                    convKerInd[0] = iro;
+                    convKerInd[1] = ie1;
+                    convKerInd[2] = ie2;
+
+                    for ( dst=0; dst<dstCHA; dst++ )
+                    {
+                        kerInd[4] = dst;
+                        convKerInd[4] = dst;
+
+                        for ( src=0; src<srcCHA; src++ )
+                        {
+                            kerInd[3] = src;
+                            convKerInd[3] = src;
+
+                            size_t offsetKer = ker.calculate_offset(kerInd);
+                            size_t offsetConvKer = convKer.calculate_offset(convKerInd);
+
+                            convKer(offsetConvKer) = ker(offsetKer);
+                        }
+                    }
+                }
+            }
+        }
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        // NOTE(review): the three calls below presumably collapse the trailing singleton dimensions of the 8-D convKer into the 5-D convKernMean - confirm against sumOverLastDimension
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - sum over output dimensions ... "));
+        hoNDArray<T> convKer2, convKer3;
+        ho5DArray<T> convKernMean(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer, convKer2));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer2, convKer3));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(convKer3, convKernMean));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - flip along dimensions ... "));
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKernMean, "convKernMean");
+
+        // flip the kernel along RO, E1 and E2; the two branches differ only in the dimension order of convKerFlip
+        if ( ROis3rdDim ) // E1, E2, RO
+        {
+            convKerFlip.createArray(convKE1, convKE2, convKRO, srcCHA, dstCHA);
+            Gadgetron::clear(&convKerFlip);
+
+            for ( ke2=0; ke2<convKE2; ke2++ )
+            {
+                for ( ke1=0; ke1<convKE1; ke1++ )
+                {
+                    for ( kro=0; kro<convKRO; kro++ )
+                    {
+                        for ( dst=0; dst<dstCHA; dst++ )
+                        {
+                            for ( src=0; src<srcCHA; src++ )
+                            {
+                                T value = convKernMean(convKRO-1-kro, convKE1-1-ke1, convKE2-1-ke2, src, dst);
+                                convKerFlip(ke1, ke2, kro, src, dst) = value;
+                            }
+                        }
+                    }
+                }
+            }
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKerFlip, "convKerFlip");
+
+            // minus I : subtract 1 at the centre sample (index k-1 per axis) of each (dst, dst) channel pair; NOTE(review): dst also indexes the src dimension, so this seems to assume dstCHA <= srcCHA
+            if ( minusI )
+            {
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    T value = convKerFlip(kE1 -1, kE2 -1, kRO -1, dst, dst);
+                    convKerFlip(kE1 -1, kE2 -1, kRO -1, dst, dst) = value - T(1.0);
+                }
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKerFlip, "convKerFlip_minusI");
+            }
+        }
+        else
+        {
+            // RO, E1, E2
+            convKerFlip.createArray(convKRO, convKE1, convKE2, srcCHA, dstCHA);
+            Gadgetron::clear(&convKerFlip);
+
+            for ( ke2=0; ke2<convKE2; ke2++ )
+            {
+                for ( ke1=0; ke1<convKE1; ke1++ )
+                {
+                    for ( kro=0; kro<convKRO; kro++ )
+                    {
+                        for ( dst=0; dst<dstCHA; dst++ )
+                        {
+                            for ( src=0; src<srcCHA; src++ )
+                            {
+                                T value = convKernMean(convKRO-1-kro, convKE1-1-ke1, convKE2-1-ke2, src, dst);
+                                convKerFlip(kro, ke1, ke2, src, dst) = value;
+                            }
+                        }
+                    }
+                }
+            }
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKerFlip, "convKerFlip");
+
+            // minus I : same centre-sample subtraction as in the other branch, with indices in [RO E1 E2] order
+            if ( minusI )
+            {
+                for ( dst=0; dst<dstCHA; dst++ )
+                {
+                    T value = convKerFlip(kRO -1, kE1 -1, kE2 -1, dst, dst);
+                    convKerFlip(kRO -1, kE1 -1, kE2 -1, dst, dst) = value - T(1.0);
+                }
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKerFlip, "convKerFlip_minusI");
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::kspaceDomainConvKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Computes the image-domain kernel kIm [e1 e2 ro srcCHA dstCHA] from the k-space SPIRIT kernel ker; a, b, c and minusI are forwarded to kspaceDomainConvKernel3D. Returns false on any failed step or exception.
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernel3D(const hoNDArray<T>& ker, int kRO, int kE1, int kE2, int a, int b, int c, int ro, int e1, int e2, hoNDArray<T>& kIm, bool minusI)
+{
+    try
+    {
+        int srcCHA = (int)(ker.get_size(3));
+        int dstCHA = (int)(ker.get_size(4));
+        // kRO, kE1 and kE2 must equal the first three dimensions of ker
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2==ker.get_size(2));
+
+        // allocate image domain kernel
+        kIm.create(e1, e2, ro, srcCHA, dstCHA);
+        // build the flipped convolution kernel with RO as third dimension, matching the [e1 e2 ro] layout of kIm
+        bool ROat3rdDim = true;
+        ho5DArray<T> convKerFlip;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1,  kE2, a, b, c, convKerFlip, minusI, ROat3rdDim));
+        // scale by sqrt(ro*e1*e2); the product is formed in double because the int product can overflow for large matrices
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - SNR unit scaling ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)ro*(double)e1*(double)e2) ), convKerFlip ));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKerFlip, "convKerFlip_scal");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - zero padding ... "));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKerFlip, e1, e2, ro, kIm));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kIm, "convKerFlip_scal_zeropadded");
+        // kIm is transformed in place by the 3D inverse FFT
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - conver to image domain ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kIm));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::imageDomainKernel3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Computes kImRO [kConvE1 kConvE2 ro srcCHA dstCHA]: the convolution kernel is zero-padded and inverse-transformed along RO only, then RO is permuted to the third dimension. Returns false on any failed step or exception.
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernelRO3D(const hoNDArray<T>& ker, int kRO, int kE1, int kE2, int a, int b, int c, int ro, hoNDArray<T>& kImRO, bool minusI)
+{
+    try
+    {
+        int srcCHA = (int)(ker.get_size(3));
+        int dstCHA = (int)(ker.get_size(4));
+        // kRO, kE1 and kE2 must equal the first three dimensions of ker
+        GADGET_CHECK_RETURN_FALSE(kRO==ker.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kE1==ker.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kE2==ker.get_size(2));
+        // keep [RO E1 E2] order here; RO is moved to the third dimension at the end of this function
+        bool ROat3rdDim = false;
+        ho5DArray<T> convKerFlip;
+        GADGET_CHECK_RETURN_FALSE(this->kspaceDomainConvKernel3D(ker, kRO, kE1,  kE2, a, b, c, convKerFlip, minusI, ROat3rdDim));
+
+        // allocate image domain kernel
+        size_t kConvE1 = convKerFlip.get_size(1);
+        size_t kConvE2 = convKerFlip.get_size(2);
+
+        kImRO.create(kConvE1, kConvE2, ro, srcCHA, dstCHA);
+        // temporary result in [ro kConvE1 kConvE2] order, cleared before zero padding
+        hoNDArray<T> kImROTemp(ro, kConvE1, kConvE2, srcCHA, dstCHA);
+        Gadgetron::clear(kImROTemp);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - SNR unit scaling ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)(ro)) ), convKerFlip ));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, convKerFlip, "convKerFlip_scal_RO");
+        // only RO grows to ro; the E1 and E2 extents stay at the convolution kernel size
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - zero padding only for RO ... "));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(convKerFlip, ro, kConvE1, kConvE2, kImROTemp));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImROTemp, "convKerFlip_scal_RO_zeropadded");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - conver to image domain only for RO ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kImROTemp));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - permute kernel dimensions to be [kE1 kE2 RO ...]  ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo3rdDimensionFor3DRecon(kImROTemp, kImRO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::imageDomainKernelRO3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Extends the RO-only image-domain kernel kImRO to kImE1E2RO: scales a copy by sqrt(e1*e2), zero-pads the first two dimensions to e1 x e2 and applies ifft2c. Returns false on any failed step or exception.
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainKernelE1E2RO(const hoNDArray<T>& kImRO, int e1, int e2, hoNDArray<T>& kImE1E2RO)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = kImRO.get_dimensions();
+        GADGET_CHECK_RETURN_FALSE(dim->size()>=3); // dimR[0], dimR[1] and dimR[2] are accessed below
+        std::vector<size_t> dimR(*dim);
+        dimR[0] = e1;
+        dimR[1] = e2;
+
+        kImE1E2RO.create(&dimR);
+        Gadgetron::clear(kImE1E2RO);
+        // kImRO is const, so the scaling is applied to a copy
+        hoNDArray<T> kImROScaled(kImRO);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - SNR unit scaling for E1 and E2 ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)( std::sqrt((double)e1*(double)e2) ), kImROScaled ));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImROScaled, "kImROScaledE1E2");
+        // the third dimension keeps its size (dimR[2] is taken from kImRO); only the first two are padded
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - zero padding for E1 and E2 ... "));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().zeropad3DNoPresetZeros(kImROScaled, e1, e2, dimR[2], kImE1E2RO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImE1E2RO, "kImE1E2RO_zeropadded_E1E2");
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D calibration - conver to image domain for E1 and E2 ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kImE1E2RO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::imageDomainKernelE1E2RO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Forms the adjoint image-domain kernel adjkIm from kIm: permute the last two dimensions, then conjugate. Returns false if either step fails or an exception is thrown.
+template <typename T> 
+bool gtPlusSPIRIT<T>::
+imageDomainAdjointKernel(const hoNDArray<T>& kIm, hoNDArray<T>& adjkIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteLastTwoDimensions(kIm, adjkIm));
+        // adjkIm is passed as both input and output of the conjugation
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::conjugate(adjkIm, adjkIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::imageDomainAdjointKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Computes kIm(:, d, d') = sum_s kImS2D(:, s, d') * kImD2S(:, d, s) (assuming Gadgetron::multiply/add are element-wise), where the last two dimensions of kImS2D are [srcCHA dstCHA]. Returns false on a failed check or exception.
+template <typename T> 
+bool gtPlusSPIRIT<T>::AdjointForwardKernel(const hoNDArray<T>& kImS2D, const hoNDArray<T>& kImD2S, hoNDArray<T>& kIm)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimS2D = kImS2D.get_dimensions();
+
+        size_t NDim = kImS2D.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2); // NDim-2 and NDim-1 are used as indices below and would wrap around otherwise
+        long long srcCHA = (*dimS2D)[NDim-2];
+        long long dstCHA = (*dimS2D)[NDim-1];
+        GADGET_CHECK_RETURN_FALSE(srcCHA>0 && dstCHA>0); // both are divisors in the computation of N
+        GADGET_CHECK_RETURN_FALSE(kImD2S.get_number_of_dimensions()==NDim);
+        GADGET_CHECK_RETURN_FALSE(kImD2S.get_number_of_elements()==kImS2D.get_number_of_elements());
+
+        std::vector<size_t> dimRes(*dimS2D);
+        dimRes[NDim-2] = dstCHA;
+
+        kIm.create(&dimRes);
+        Gadgetron::clear(&kIm);
+        // N : number of elements belonging to one (src, dst) channel pair
+        size_t N = kImS2D.get_number_of_elements()/srcCHA/dstCHA;
+
+        long long d;
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(d) shared(N, dstCHA, srcCHA) num_threads(dstCHA) if (dstCHA > 4)
+        #else
+            #pragma omp parallel default(none) private(d) shared(N, dstCHA, srcCHA, kIm, kImS2D, kImD2S) num_threads(dstCHA) if (dstCHA > 4)
+        #endif
+        {
+            hoNDArray<T> ker(N); // declared inside the parallel region, so every thread has its own product buffer
+
+            #pragma omp for
+            for ( d=0; d<dstCHA; d++ )
+            {
+                for ( size_t dprime=0; dprime<dstCHA; dprime++ )
+                {
+                    hoNDArray<T> dKer(N, kIm.begin()+d*N+dprime*N*dstCHA);
+                    // NOTE(review): dKer, kerS2D and kerD2S are built on pointers into existing arrays; presumably they wrap that memory without copying - confirm
+                    for ( size_t s=0; s<srcCHA; s++ )
+                    {
+                        hoNDArray<T> kerS2D(N, const_cast<T*>(kImS2D.begin())+s*N+dprime*N*srcCHA);
+                        hoNDArray<T> kerD2S(N, const_cast<T*>(kImD2S.begin())+d*N+s*N*dstCHA);
+
+                        Gadgetron::multiply(kerS2D, kerD2S, ker);
+                        Gadgetron::add(dKer, ker, dKer);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT<T>::AdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DOperator.h
new file mode 100644
index 0000000..5ecac83
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DOperator.h
@@ -0,0 +1,206 @@
+/** \file       gtPlusSPIRIT2DOperator.h
+    \brief      SPIRIT operator for 2D data (gtPlus), derived from gtPlusSPIRITOperator
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+/// SPIRIT 2D operator derived from gtPlusSPIRITOperator<T>; declares the image/k-space conversions and the forward/adjoint operators that are defined further down in this file.
+template <typename T> 
+class gtPlusSPIRIT2DOperator : public gtPlusSPIRITOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITOperator<T> BaseClass;
+
+    gtPlusSPIRIT2DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRIT2DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert k-space x to image domain (inverse 2D FFT) or image im back to k-space (forward 2D FFT); both return false on failure
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+    // forward operator: computes y from x, returns false on failure
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator: computes y from x; calls forwardOperator when use_symmetric_spirit_ is set
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+    // base-class option flags made visible for unqualified use in this class template (declared in the public section)
+    using BaseClass::use_symmetric_spirit_;
+    using BaseClass::use_non_centered_fft_;
+    using BaseClass::calib_use_gpu_;
+
+protected:
+
+    // [RO E1 srcCHA dstCHA] - layout noted by the original author, presumably for the kernels below
+    using BaseClass::forward_kernel_;
+    using BaseClass::adjoint_kernel_;
+    using BaseClass::adjoint_forward_kernel_;
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+
+    // helper memory: base-class buffers that the operators below reuse and only re-create when their size does not fit
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+/// Writes a three-line banner identifying this operator to os,
+/// flushing the stream after every line.
+template <typename T> 
+void gtPlusSPIRIT2DOperator<T>::printInfo(std::ostream& os)
+{
+    // std::endl is kept (rather than '\n') so that each line is still followed by a flush
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2D operator ------------------" << std::endl
+       << "Implementation of SPIRIT 2D operator for ISMRMRD package" << std::endl
+       << "----------------------------------------------------------------------" << std::endl;
+}
+/// Inverse 2D FFT of x into im: ifft2 when use_non_centered_fft_ is set, ifft2c otherwise. Returns false if the transform fails.
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !this->use_non_centered_fft_ )
+    {
+        // centered transform: the managed buffer handed to ifft2c is re-created whenever its dimensions differ from x
+        const bool sameDims = complexIm_Managed_.dimensions_equal(&x);
+        if ( !sameDims )
+        {
+            complexIm_Managed_.create(x.get_dimensions());
+        }
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(x, im, complexIm_Managed_));
+        return true;
+    }
+
+    // non-centered transform: no managed buffer is involved
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2(x, im));
+    return true;
+}
+/// Forward 2D FFT of im into x: fft2 when use_non_centered_fft_ is set, fft2c otherwise. Returns false if the transform fails.
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !this->use_non_centered_fft_ )
+    {
+        // centered transform: the managed buffer handed to fft2c is re-created whenever its dimensions differ from im
+        const bool sameDims = kspace_Managed_.dimensions_equal(&im);
+        if ( !sameDims )
+        {
+            kspace_Managed_.create(im.get_dimensions());
+        }
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(im, x, kspace_Managed_));
+        return true;
+    }
+
+    // non-centered transform: no managed buffer is involved
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2(im, x));
+    return true;
+}
+/// Applies the forward SPIRIT operator to x and stores the result in y: multiply x by unacquired_points_indicator_, go to image domain,
+/// apply the selected kernel, return to k-space. Returns false if a conversion fails, the kernel is not set, or an exception is thrown.
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        // y = unacquired_points_indicator_ * x
+        Gadgetron::multiply(unacquired_points_indicator_, x, y);
+
+        // x to image domain; a failed transform aborts the operator instead of being silently ignored
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(y, complexIm_));
+
+        size_t ro = x.get_size(0);
+        size_t e1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+
+        // the result buffer is re-created only when it holds fewer than ro*e1*CHA elements
+        if ( res_after_apply_kernel_sum_over_.get_number_of_elements() < ro*e1*CHA )
+        {
+            res_after_apply_kernel_sum_over_.create(ro, e1, CHA);
+        }
+
+        // symmetric SPIRIT uses adjoint_forward_kernel_, otherwise forward_kernel_ is applied
+        hoNDArray<T>* kerArray = use_symmetric_spirit_ ? this->adjoint_forward_kernel_.get() : this->forward_kernel_.get();
+
+        // the selected kernel may not have been set; do not dereference a null pointer
+        GADGET_CHECK_RETURN_FALSE(kerArray!=NULL);
+
+        Gadgetron::imageDomainUnwrapping2D(complexIm_, *kerArray, res_after_apply_kernel_sum_over_, y);
+
+        // back to k-space, written into res_after_apply_kernel_sum_over_
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(y, res_after_apply_kernel_sum_over_));
+
+        // apply Dc
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multiply(unacquired_points_indicator_, res_after_apply_kernel_sum_over_, y);
+        }
+        else
+        {
+            memcpy(y.begin(), res_after_apply_kernel_sum_over_.begin(), sizeof(T)*ro*e1*CHA);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DOperator<T>::forwardOperator(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Applies the adjoint SPIRIT operator to x and stores the result in y; with use_symmetric_spirit_ the forward operator is called instead. Returns false if a step fails or an exception is thrown.
+template <typename T> 
+inline bool gtPlusSPIRIT2DOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        if ( use_symmetric_spirit_ )
+        {
+            // Dc(G-I)'(G-I)Dc' is symmetric
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(x, y));
+        }
+        else
+        {
+            // Dc(G-I)'x
+
+            // x to image domain; a failed transform aborts the operator instead of being silently ignored
+            GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+
+            // apply kernel and sum
+            size_t ro = x.get_size(0);
+            size_t e1 = x.get_size(1);
+            size_t CHA = x.get_size(2);
+            // the result buffer is re-created only when it holds fewer than ro*e1*CHA elements
+            if ( res_after_apply_kernel_sum_over_.get_number_of_elements() < ro*e1*CHA )
+            {
+                res_after_apply_kernel_sum_over_.create(ro, e1, CHA);
+            }
+
+            Gadgetron::imageDomainUnwrapping2D(complexIm_, *adjoint_kernel_, res_after_apply_kernel_sum_over_, y);
+
+            // go back to kspace, written into res_after_apply_kernel_sum_over_
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(y, res_after_apply_kernel_sum_over_));
+
+            // apply Dc
+            Gadgetron::multiply(unacquired_points_indicator_, res_after_apply_kernel_sum_over_, y);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DOperator<T>::adjointOperator(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DTOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DTOperator.h
new file mode 100644
index 0000000..e99a89d
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT2DTOperator.h
@@ -0,0 +1,329 @@
+/** \file       gtPlusSPIRIT2DTOperator.h
+    \brief      Base class for gtPlus 2DT operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRIT2DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT2DTOperator : public gtPlusSPIRIT2DOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRIT2DOperator<T> BaseClass;
+
+    gtPlusSPIRIT2DTOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRIT2DTOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // set forward kernel, compute the adjoint and adjoint_forward kernel
+    bool setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel=true);
+    bool setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel);
+    // set the acquired kspace, unacquired points are set to be zero
+    bool setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace);
+
+    // compute gradient of ||(G-I)(Dc'x+D'y)||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)(Dc'x+D'y)||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    //using BaseClass::gt_timer1_;
+    //using BaseClass::gt_timer2_;
+    //using BaseClass::gt_timer3_;
+    //using BaseClass::performTiming_;
+    //using BaseClass::gt_exporter_;
+    //using BaseClass::debugFolder_;
+    //using BaseClass::gtPlus_util_;
+    //using BaseClass::gtPlus_util_complex_;
+    //using BaseClass::gtPlus_mem_manager_;
+    //using BaseClass::use_symmetric_spirit_;
+
+protected:
+
+    // G-I, [RO E1 srcCHA dstCHA N]
+    //using BaseClass::forward_kernel_;
+    //using BaseClass::adjoint_kernel_;
+    //using BaseClass::adjoint_forward_kernel_;
+    //using BaseClass::acquired_points_;
+    //using BaseClass::acquired_points_indicator_;
+    //using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    //using BaseClass::kspace_;
+    //using BaseClass::complexIm_;
+    //using BaseClass::res_after_apply_kernel_;
+    //using BaseClass::res_after_apply_kernel_sum_over_;
+
+    //using BaseClass::kspace_Managed_;
+    //using BaseClass::complexIm_Managed_;
+    //using BaseClass::res_after_apply_kernel_Managed_;
+    //using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRIT2DTOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2DT operator ------------------" << endl;
+    os << "Implementation of SPIRIT 2DT operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::
+setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel)
+{   // Stores the forward kernel and fills the adjoint (and, when needed, adjoint-forward) kernel for each of its N entries; returns false on failure.
+    try
+    {
+        this->forward_kernel_ = forward_kernel;
+        // dimensions 0..4 of the forward kernel are read as RO, E1, srcCHA, dstCHA, N
+        size_t RO = this->forward_kernel_->get_size(0);
+        size_t E1 = this->forward_kernel_->get_size(1);
+        size_t srcCHA = this->forward_kernel_->get_size(2);
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t N = this->forward_kernel_->get_size(4);
+        // the adjoint kernel is allocated with the two channel dimensions swapped
+        this->adjoint_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, srcCHA, N));
+        // symmetric SPIRIT forces the adjoint-forward kernel even if the caller passed false
+        bool computeAdjointForwardKernel = (computeAdjForwardKernel || this->use_symmetric_spirit_);
+
+        if ( computeAdjointForwardKernel )
+        {
+            this->adjoint_forward_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, dstCHA, N));
+        }
+        // one pass per kernel n; the arrays below are constructed on pointers into the full buffers
+        size_t n;
+        for ( n=0; n<N; n++ )
+        {
+            hoNDArray<T> kerCurr(RO, E1, srcCHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*srcCHA*dstCHA);
+            hoNDArray<T> adjKerCurr(RO, E1, dstCHA, srcCHA, this->adjoint_kernel_->begin()+n*RO*E1*dstCHA*srcCHA);
+            // NOTE(review): presumably these wrap the memory without copying, so results land in the member arrays -- confirm
+            GADGET_CHECK_RETURN_FALSE(this->imageDomainAdjointKernel(kerCurr, adjKerCurr));
+
+            if ( computeAdjointForwardKernel )
+            {
+                hoNDArray<T> adjForwardKerCurr(RO, E1, dstCHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*dstCHA*dstCHA);
+                GADGET_CHECK_RETURN_FALSE(this->AdjointForwardKernel(adjKerCurr, kerCurr, adjForwardKerCurr));
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DTOperator<T>::setForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::
+setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel)
+{
+    try
+    {
+        this->adjoint_forward_kernel_ = adjoint_forward_kernel;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DTOperator<T>::setAdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::
+setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace)
+{   // Stores the acquired k-space, builds the acquired/unacquired indicator arrays and sizes the helper buffers; returns false on failure.
+    try
+    {
+        this->acquired_points_ = kspace;
+        // NOTE(review): the 4th dimension is named E2 here; presumably it is the per-frame dimension that grad()/obj() loop over -- confirm
+        size_t RO = this->acquired_points_->get_size(0);
+        size_t E1 = this->acquired_points_->get_size(1);
+        size_t srcCHA = this->acquired_points_->get_size(2);
+        size_t E2 = this->acquired_points_->get_size(3);
+        // both indicators get the dimensions of kspace and are cleared before being filled
+        this->acquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->acquired_points_indicator_);
+
+        this->unacquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->unacquired_points_indicator_);
+        // N is the total number of elements here, not a dimension size
+        size_t N = kspace->get_number_of_elements();
+
+        long long ii;
+        // classify every sample: magnitude below DBL_EPSILON marks it unacquired, anything else acquired
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(ii) shared(N)
+        #else
+            #pragma omp parallel for default(none) private(ii) shared(N, kspace)
+        #endif
+        for ( ii=0; ii<(long long)N; ii++ )
+        {
+            if ( std::abs( (*kspace)(ii) ) < DBL_EPSILON )
+            {
+                this->unacquired_points_indicator_(ii) = 1.0;
+            }
+            else
+            {
+                this->acquired_points_indicator_(ii) = 1.0;
+            }
+        }
+
+        // allocate the helper memory
+        this->kspace_.create(RO, E1, srcCHA, E2);
+        this->complexIm_.create(RO, E1, srcCHA, E2);
+        // the kernel-dependent buffers are only sized here if a forward kernel has already been set
+        if ( this->forward_kernel_ )
+        {
+            size_t dstCHA = this->forward_kernel_->get_size(3);
+            this->res_after_apply_kernel_.create(RO, E1, srcCHA, dstCHA);
+            this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, E2);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DTOperator<T>::setAcquiredPoints(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{   // Computes the gradient of the SPIRIT cost at x and writes it into g; returns false on failure.
+    try
+    {
+        // gradient of L2 norm is
+        // 2*Dc*(G-I)'(G-I)(D'y+Dc'x)
+
+        // D'y+Dc'x: multiply x by the unacquired-points indicator, then add the stored acquired points
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(this->unacquired_points_indicator_, x, this->kspace_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(*this->acquired_points_, this->kspace_, this->kspace_));
+
+        // combined k-space (kspace_) to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(this->kspace_, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+        // NOTE(review): adjoint_forward_kernel_ is dereferenced without a null check; setForwardKernel only allocates it when requested
+        size_t dstCHA = this->adjoint_forward_kernel_->get_size(3);
+        size_t kernelN = this->adjoint_forward_kernel_->get_size(4);
+        // NOTE(review): the kernel slices below use CHA from x as their 3rd dimension -- assumes it matches the kernel's; confirm
+        this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+        // one pass per frame n along the 4th dimension of x
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_adjoint_forward_kernel;
+            // frames at or beyond kernelN reuse the last kernel (index kernelN-1)
+            if ( n < kernelN )
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+            // multiply into res_after_apply_kernel_, then sum it into frame n of res_after_apply_kernel_sum_over_
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(currComplexIm, curr_adjoint_forward_kernel, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, dstCHA, this->res_after_apply_kernel_sum_over_.begin()+n*RO*E1*dstCHA);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(this->res_after_apply_kernel_, sumResCurr));
+        }
+
+        // go back to kspace, result written into g
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(this->res_after_apply_kernel_sum_over_, g));
+
+        // apply Dc: multiply g in place by the unacquired-points indicator
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(this->unacquired_points_indicator_, g, g));
+
+        // multiply by 2
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(T(2.0), g));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DTOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRIT2DTOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{   // Computes the SPIRIT cost value for x and stores it in obj; returns false on failure.
+    try
+    {
+        // L2 norm
+        // ||(G-I)(D'y+Dc'x)||2
+
+        // D'y+Dc'x: multiply x by the unacquired-points indicator, then add the stored acquired points
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(this->unacquired_points_indicator_, x, this->kspace_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(*this->acquired_points_, this->kspace_, this->kspace_));
+
+        // combined k-space (kspace_) to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(this->kspace_, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+        // NOTE(review): forward_kernel_ is dereferenced without a null check -- setForwardKernel must have been called first
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t kernelN = this->forward_kernel_->get_size(4);
+
+        this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+        // one pass per frame n along the 4th dimension of x
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_forward_kernel;
+            // frames at or beyond kernelN reuse the last kernel (index kernelN-1)
+            if ( n < kernelN )
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+            // multiply into res_after_apply_kernel_, then sum it into frame n of res_after_apply_kernel_sum_over_
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(currComplexIm, curr_forward_kernel, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, dstCHA, this->res_after_apply_kernel_sum_over_.begin()+n*RO*E1*dstCHA);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(this->res_after_apply_kernel_, sumResCurr));
+        }
+
+        // L2 norm: dotc of the summed result with itself, written into obj
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::dotc(this->res_after_apply_kernel_sum_over_, this->res_after_apply_kernel_sum_over_, obj));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRIT2DTOperator<T>::obj(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRIT3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRIT3DOperator.h
new file mode 100644
index 0000000..f52cdac
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRIT3DOperator.h
@@ -0,0 +1,98 @@
+/** \file       gtPlusSPIRIT3DOperator.h
+    \brief      Base class for gtPlus 3D operators
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRIT3DOperator : public gtPlusSPIRITOperator<T>
+{   // SPIRIT operator whose image/k-space conversions use the 3D FFT calls (ifft3c / fft3c).
+public:
+
+    typedef gtPlusSPIRITOperator<T> BaseClass;
+
+    gtPlusSPIRIT3DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRIT3DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace; both return false if the FFT call fails
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+    // using-declarations let the member functions name these dependent base members without this->
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+    using BaseClass::use_symmetric_spirit_;
+    using BaseClass::use_non_centered_fft_;
+
+protected:
+
+    // [RO E1 srcCHA dstCHA]
+    using BaseClass::forward_kernel_;
+    using BaseClass::adjoint_kernel_; // was misspelled "djoint_kernel_", which names no base member
+    using BaseClass::adjoint_forward_kernel_;
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+void gtPlusSPIRIT3DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 3D operator ------------------" << endl;
+    os << "Implementation of SPIRIT 3D operator for ISMRMRD package" << endl;
+    os << "----------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT3DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !complexIm_Managed_.dimensions_equal(&x) )
+    {
+        complexIm_Managed_.create(x.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(x, im, complexIm_Managed_));
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRIT3DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !kspace_Managed_.dimensions_equal(&im) )
+    {
+        kspace_Managed_.create(im.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(im, x, kspace_Managed_));
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DOperator.h
new file mode 100644
index 0000000..2ed22a3
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DOperator.h
@@ -0,0 +1,68 @@
+/** \file       gtPlusSPIRITNoNullSpace2DOperator.h
+    \brief      Implement SPIRIT 2D operator without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITNoNullSpaceOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpace2DOperator : public gtPlusSPIRITNoNullSpaceOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITNoNullSpaceOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpace2DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpace2DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+protected:
+
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpace2DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2D operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT 2D operator for ISMRMRD package" << endl;
+    os << "-------------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace2DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    if ( !this->complexIm_Managed_.dimensions_equal(&x) )
+    {
+        this->complexIm_Managed_.create(x.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(x, im, this->complexIm_Managed_));
+
+    return true;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace2DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    if ( !this->kspace_Managed_.dimensions_equal(&im) )
+    {
+        this->kspace_Managed_.create(im.get_dimensions());
+    }
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(im, x, this->kspace_Managed_));
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h
new file mode 100644
index 0000000..132f172
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace2DTOperator.h
@@ -0,0 +1,289 @@
+/** \file       gtPlusSPIRITNoNullSpace2DTOperator.h
+    \brief      Implement SPIRIT 2DT operator without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpace2DTOperator : public gtPlusSPIRITNoNullSpace2DOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITNoNullSpace2DOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpace2DTOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpace2DTOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // set forward kernel, compute the adjoint and adjoint_forward kernel
+    bool setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel=true);
+    bool setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel);
+    // set the acquired kspace, unacquired points are set to be zero
+    bool setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace);
+
+    // compute gradient of ||(G-I)x||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)x||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+protected:
+
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpace2DTOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 2DT operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT 2DT operator for ISMRMRD package" << endl;
+    os << "--------------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::
+setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel)
+{
+    try
+    {
+        this->forward_kernel_ = forward_kernel;
+
+        size_t RO = this->forward_kernel_->get_size(0);
+        size_t E1 = this->forward_kernel_->get_size(1);
+        size_t srcCHA = this->forward_kernel_->get_size(2);
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t N = this->forward_kernel_->get_size(4);
+
+        this->adjoint_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, srcCHA, N));
+
+        bool computeAdjointForwardKernel = (computeAdjForwardKernel || this->use_symmetric_spirit_);
+
+        if ( computeAdjointForwardKernel )
+        {
+            this->adjoint_forward_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, dstCHA, N));
+        }
+
+        size_t n;
+        for ( n=0; n<N; n++ )
+        {
+            hoNDArray<T> kerCurr(RO, E1, srcCHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*srcCHA*dstCHA);
+            hoNDArray<T> adjKerCurr(RO, E1, dstCHA, srcCHA, this->adjoint_kernel_->begin()+n*RO*E1*dstCHA*srcCHA);
+
+            GADGET_CHECK_RETURN_FALSE(this->imageDomainAdjointKernel(kerCurr, adjKerCurr));
+
+            if ( computeAdjointForwardKernel )
+            {
+                hoNDArray<T> adjForwardKerCurr(RO, E1, dstCHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*dstCHA*dstCHA);
+                GADGET_CHECK_RETURN_FALSE(this->AdjointForwardKernel(adjKerCurr, kerCurr, adjForwardKerCurr));
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::setForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::
+setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel)
+{
+    try
+    {
+        this->adjoint_forward_kernel_ = adjoint_forward_kernel;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::setAdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::
+setAcquiredPoints(boost::shared_ptr< hoNDArray<T> >& kspace)
+{
+    try
+    {
+        this->acquired_points_ = kspace;
+
+        size_t RO = this->acquired_points_->get_size(0);
+        size_t E1 = this->acquired_points_->get_size(1);
+        size_t srcCHA = this->acquired_points_->get_size(2);
+        size_t E2 = this->acquired_points_->get_size(3);
+
+        this->acquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->acquired_points_indicator_);
+
+        this->unacquired_points_indicator_.create(kspace->get_dimensions());
+        Gadgetron::clear(this->unacquired_points_indicator_);
+
+        size_t N = kspace->get_number_of_elements();
+
+        long long ii;
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(ii) shared(N)
+        #else
+            #pragma omp parallel for default(none) private(ii) shared(N, kspace)
+        #endif
+        for ( ii=0; ii<(long long)N; ii++ )
+        {
+            if ( std::abs( (*kspace)(ii) ) < DBL_EPSILON )
+            {
+                this->unacquired_points_indicator_(ii) = 1.0;
+            }
+            else
+            {
+                this->acquired_points_indicator_(ii) = 1.0;
+            }
+        }
+
+        // allocate the helper memory
+        this->kspace_.create(RO, E1, srcCHA, E2);
+        this->complexIm_.create(RO, E1, srcCHA, E2);
+
+        if ( this->forward_kernel_ )
+        {
+            size_t dstCHA = this->forward_kernel_->get_size(3);
+            this->res_after_apply_kernel_.create(RO, E1, srcCHA, dstCHA);
+            this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, E2);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::setAcquiredPoints(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{   // Computes the gradient of the SPIRIT cost at x and writes it into g; returns false on failure.
+    try
+    {
+        // gradient of L2 norm is
+        // 2*(G-I)'(G-I)x
+
+        // x to image domain (x is used directly, no acquired/unacquired masking in this operator)
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+        // NOTE(review): adjoint_forward_kernel_ is dereferenced without a null check; setForwardKernel only allocates it when requested
+        size_t dstCHA = this->adjoint_forward_kernel_->get_size(3);
+        size_t kernelN = this->adjoint_forward_kernel_->get_size(4);
+        // NOTE(review): the kernel slices below use CHA from x as their 3rd dimension -- assumes it matches the kernel's; confirm
+        this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+        // one pass per frame n along the 4th dimension of x
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_adjoint_forward_kernel;
+            // frames at or beyond kernelN reuse the last kernel (index kernelN-1)
+            if ( n < kernelN )
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_adjoint_forward_kernel.create(RO, E1, CHA, dstCHA, this->adjoint_forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+            // multiply into res_after_apply_kernel_, then sum it into frame n of res_after_apply_kernel_sum_over_
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(currComplexIm, curr_adjoint_forward_kernel, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, dstCHA, this->res_after_apply_kernel_sum_over_.begin()+n*RO*E1*dstCHA);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(this->res_after_apply_kernel_, sumResCurr));
+        }
+
+        // go back to kspace, result written into g
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(this->res_after_apply_kernel_sum_over_, g));
+
+        // multiply by 2
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(T(2.0), g));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusSPIRITNoNullSpace2DTOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{   // Computes the SPIRIT cost value for x and stores it in obj; returns false on failure.
+    try
+    {
+        // L2 norm
+        // ||(G-I)x||2
+
+        // x to image domain (x is used directly, no acquired/unacquired masking in this operator)
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply kernel and sum
+        size_t RO = x.get_size(0);
+        size_t E1 = x.get_size(1);
+        size_t CHA = x.get_size(2);
+        size_t N = x.get_size(3);
+        // NOTE(review): forward_kernel_ is dereferenced without a null check -- setForwardKernel must have been called first
+        size_t dstCHA = this->forward_kernel_->get_size(3);
+        size_t kernelN = this->forward_kernel_->get_size(4);
+
+        this->res_after_apply_kernel_sum_over_.create(RO, E1, dstCHA, N);
+        // one pass per frame n along the 4th dimension of x
+        size_t n;
+        for ( n=0; n<N; n++)
+        {
+            hoNDArray<T> currComplexIm(RO, E1, CHA, this->complexIm_.begin()+n*RO*E1*CHA);
+
+            hoNDArray<T> curr_forward_kernel;
+            // frames at or beyond kernelN reuse the last kernel (index kernelN-1)
+            if ( n < kernelN )
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+n*RO*E1*CHA*dstCHA);
+            }
+            else
+            {
+                curr_forward_kernel.create(RO, E1, CHA, dstCHA, this->forward_kernel_->begin()+(kernelN-1)*RO*E1*CHA*dstCHA);
+            }
+            // multiply into res_after_apply_kernel_, then sum it into frame n of res_after_apply_kernel_sum_over_
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(currComplexIm, curr_forward_kernel, this->res_after_apply_kernel_));
+
+            hoNDArray<T> sumResCurr(RO, E1, dstCHA, this->res_after_apply_kernel_sum_over_.begin()+n*RO*E1*dstCHA);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(this->res_after_apply_kernel_, sumResCurr));
+        }
+
+        // L2 norm: dotc of the summed result with itself, written into obj
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::dotc(this->res_after_apply_kernel_sum_over_, this->res_after_apply_kernel_sum_over_, obj));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpace2DTOperator<T>::obj(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace3DOperator.h
new file mode 100644
index 0000000..062bc35
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpace3DOperator.h
@@ -0,0 +1,64 @@
+/** \file       gtPlusSPIRITNoNullSpace3DOperator.h
+    \brief      Implement SPIRIT 3D operator without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITNoNullSpaceOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpace3DOperator : public gtPlusSPIRITNoNullSpaceOperator<T>
+{
+public:
+
+    typedef gtPlusSPIRITNoNullSpaceOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpace3DOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpace3DOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+protected:
+
+};
+
+template <typename T> 
+void gtPlusSPIRITNoNullSpace3DOperator<T>::printInfo(std::ostream& os)
+{
+    using namespace std;
+
+    os << "-------------- GTPlus ISMRMRD SPIRIT 3D operator without null space constraint ------------------" << endl;
+    os << "Implementation of SPIRIT 3D operator for ISMRMRD package" << endl;
+    os << "-------------------------------------------------------------------------------------------------" << endl;
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace3DOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{   // Runs ifft3c on x, writing the result into im; returns false if the FFT call fails, true otherwise.
+    if ( !this->complexIm_Managed_.dimensions_equal(&x) )
+    {
+        this->complexIm_Managed_.create(x.get_dimensions()); // keep the array handed to ifft3c the same size as x
+    }
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(x, im, this->complexIm_Managed_));
+    return true; // previously missing: flowing off the end of a bool function is undefined behaviour
+}
+
+template <typename T> 
+inline bool gtPlusSPIRITNoNullSpace3DOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{   // Runs fft3c on im, writing the result into x; returns false if the FFT call fails, true otherwise.
+    if ( !this->kspace_Managed_.dimensions_equal(&im) )
+    {
+        this->kspace_Managed_.create(im.get_dimensions()); // keep the array handed to fft3c the same size as im
+    }
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(im, x, this->kspace_Managed_));
+    return true; // previously missing: flowing off the end of a bool function is undefined behaviour
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpaceOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpaceOperator.h
new file mode 100644
index 0000000..578e3ac
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITNoNullSpaceOperator.h
@@ -0,0 +1,130 @@
+/** \file       gtPlusSPIRITNoNullSpaceOperator.h
+    \brief      Base class for SPIRIT operators without Null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRITOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITNoNullSpaceOperator : public gtPlusSPIRITOperator<T>
+{
+public:
+    // SPIRIT operator whose grad()/obj() work on ||(G-I)x||2, with x used as given
+    typedef gtPlusSPIRITOperator<T> BaseClass;
+
+    gtPlusSPIRITNoNullSpaceOperator() : BaseClass() {}
+    virtual ~gtPlusSPIRITNoNullSpaceOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // compute gradient of ||(G-I)x||2 into g; returns false on failure
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)x||2 into obj; returns false on failure
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+    // the using-declarations below are commented out; grad()/obj() reach inherited members via this->
+    //using BaseClass::gt_timer1_;
+    //using BaseClass::gt_timer2_;
+    //using BaseClass::gt_timer3_;
+    //using BaseClass::performTiming_;
+    //using BaseClass::gt_exporter_;
+    //using BaseClass::debugFolder_;
+    //using BaseClass::gtPlus_util_;
+    //using BaseClass::gtPlus_util_complex_;
+    //using BaseClass::gtPlus_mem_manager_;
+    //using BaseClass::use_symmetric_spirit_;
+
+protected:
+
+    // [... srcCHA dstCHA]
+    //using BaseClass::forward_kernel_;
+    //using BaseClass::adjoint_kernel_;
+    //using BaseClass::adjoint_forward_kernel_;
+    //using BaseClass::acquired_points_;
+    //using BaseClass::acquired_points_indicator_;
+    //using BaseClass::unacquired_points_indicator_;
+
+    // helper memory
+    //using BaseClass::kspace_;
+    //using BaseClass::complexIm_;
+    //using BaseClass::res_after_apply_kernel_;
+    //using BaseClass::res_after_apply_kernel_sum_over_;
+
+    //using BaseClass::kspace_Managed_;
+    //using BaseClass::complexIm_Managed_;
+    //using BaseClass::res_after_apply_kernel_Managed_;
+    //using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+// Print a three-line banner naming this operator to os.
+template <typename T>
+void gtPlusSPIRITNoNullSpaceOperator<T>::printInfo(std::ostream& os)
+{
+    // one chained insertion; every line still ends in std::endl, so os is flushed after each line
+    os << "-------------- GTPlus ISMRMRD SPIRIT operator without null space constraint ------------------" << std::endl
+       << "Implementation of SPIRIT operator for ISMRMRD package" << std::endl
+       << "----------------------------------------------------------------------------------------------" << std::endl;
+}
+
+// Gradient of ||(G-I)x||2 at kspace point x, written to g; returns false if a checked step fails or anything throws.
+template <typename T> 
+bool gtPlusSPIRITNoNullSpaceOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // gradient of L2 norm is
+        // 2*(G-I)'(G-I)x
+
+        // x to image domain (result kept in the helper buffer complexIm_)
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply the (G-I)'(G-I) kernel, then sum over the second last dimension
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(this->complexIm_, *this->adjoint_forward_kernel_, this->res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(this->res_after_apply_kernel_, this->res_after_apply_kernel_sum_over_));
+
+        // go back to kspace; the result lands in g
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(this->res_after_apply_kernel_sum_over_, g));
+
+        // multiply by 2
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(T(2.0), g));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpaceOperator<T>::grad(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Cost value ||(G-I)x||2 for kspace point x, written to obj; returns false if a checked step fails or anything throws.
+template <typename T> 
+bool gtPlusSPIRITNoNullSpaceOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // L2 norm
+        // ||(G-I)x||2
+
+        // x to image domain (result kept in the helper buffer complexIm_)
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, this->complexIm_));
+
+        // apply the forward kernel G-I, then sum over the second last dimension
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(this->complexIm_, *this->forward_kernel_, this->res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(this->res_after_apply_kernel_, this->res_after_apply_kernel_sum_over_));
+
+        // L2 norm: dotc of the summed result with itself
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::dotc(this->res_after_apply_kernel_sum_over_, this->res_after_apply_kernel_sum_over_, obj));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITNoNullSpaceOperator<T>::obj(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h b/toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h
new file mode 100644
index 0000000..89eb95a
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusSPIRITOperator.h
@@ -0,0 +1,422 @@
+/** \file       gtPlusSPIRITOperator.h
+    \brief      Implement SPIRIT operator functionalities
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSPIRIT.h"
+#include "gtPlusOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusSPIRITOperator : public gtPlusSPIRIT<T>, public gtPlusOperator<T>
+{
+public:
+    // SPIRIT operator on kspace data; subclasses must supply convertToImage()/convertToKSpace()
+    typedef gtPlusOperator<T> BaseClass;
+    // initializers listed base class first, then members in declaration order -- the order C++ runs them in
+    gtPlusSPIRITOperator() : BaseClass(), use_symmetric_spirit_(false), use_non_centered_fft_(false) {}
+    virtual ~gtPlusSPIRITOperator() {}
+
+    virtual void printInfo(std::ostream& os);
+
+    // set forward kernel, compute the adjoint and adjoint_forward kernel
+    bool setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel=true);
+    bool setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel);
+    // raw pointers to the stored kernels; the shared_ptr members keep ownership
+    hoNDArray<T>* getAdjointKernel();
+    hoNDArray<T>* getAdjointForwardKernel();
+
+    // apply Dc(G-I)'(G-I)Dc' to x (symmetric equation) or (G-I)Dc' to x (non-symmetric equation)
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // compute right hand side
+    // b = -Dc(G-I)'(G-I)D'x (symmetric equation) or -(G-I)D'x (non-symmetric equation)
+    virtual bool computeRighHandSide(const hoNDArray<T>& x, hoNDArray<T>& b);
+
+    // compute gradient of ||(G-I)(Dc'x+D'y)||2
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute cost value of L2 norm ||(G-I)(Dc'x+D'y)||2
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    // convert to image domain or back to kspace; pure virtual, implemented by subclasses
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im) = 0;
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x) = 0;
+
+    // whether to use symmetric spirit equation
+    // symmetric equation: A = Dc(G-I)'(G-I)Dc'
+    // non-symmetric equation: A = (G-I)Dc'
+    bool use_symmetric_spirit_;
+
+    // if true, use the fft, not the fftc
+    bool use_non_centered_fft_;
+
+    using gtPlusSPIRIT<T>::calib_use_gpu_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // G-I, [... srcCHA dstCHA]
+    boost::shared_ptr< hoNDArray<T> > forward_kernel_;
+    // (G-I)', [... dstCHA srcCHA]
+    boost::shared_ptr< hoNDArray<T> > adjoint_kernel_;
+    // (G-I)'(G-I), [... dstCHA dstCHA]
+    boost::shared_ptr< hoNDArray<T> > adjoint_forward_kernel_;
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+
+    // helper memory, sized by setForwardKernel()/setAdjointForwardKernel()
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+// Print a three-line banner naming this operator to os.
+template <typename T>
+void gtPlusSPIRITOperator<T>::printInfo(std::ostream& os)
+{
+    // one chained insertion; every line still ends in std::endl, so os is flushed after each line
+    os << "-------------- GTPlus ISMRMRD SPIRIT operator ------------------" << std::endl
+       << "Implementation of SPIRIT operator for ISMRMRD package" << std::endl
+       << "----------------------------------------------------------------------" << std::endl;
+}
+
+// Accessor for the adjoint kernel (G-I)'.
+// Hands out the raw pointer held by adjoint_kernel_; ownership is not transferred.
+template <typename T>
+inline hoNDArray<T>* gtPlusSPIRITOperator<T>::getAdjointKernel()
+{ return this->adjoint_kernel_.get(); }
+
+// Accessor for the product kernel (G-I)'(G-I).
+// Hands out the raw pointer held by adjoint_forward_kernel_; ownership is not transferred.
+template <typename T>
+inline hoNDArray<T>* gtPlusSPIRITOperator<T>::getAdjointForwardKernel()
+{ return this->adjoint_forward_kernel_.get(); }
+
+// Store the forward kernel G-I, derive (G-I)' and optionally (G-I)'(G-I), then size the helper buffers.
+// Returns false if the kernel has fewer than two dimensions, a kernel computation fails, or anything throws.
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::
+setForwardKernel(boost::shared_ptr< hoNDArray<T> >& forward_kernel, bool computeAdjForwardKernel)
+{
+    try
+    {
+        forward_kernel_ = forward_kernel;
+
+        adjoint_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>());
+        GADGET_CHECK_RETURN_FALSE(this->imageDomainAdjointKernel(*forward_kernel_, *adjoint_kernel_));
+
+        // (G-I)'(G-I) is used by grad() and by the symmetric form of the operator
+        if ( computeAdjForwardKernel || use_symmetric_spirit_ )
+        {
+            adjoint_forward_kernel_ = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>());
+            GADGET_CHECK_RETURN_FALSE(this->AdjointForwardKernel(*adjoint_kernel_, *forward_kernel_, *adjoint_forward_kernel_));
+        }
+
+        // allocate the helper memory; the kernel is [... srcCHA dstCHA]
+        boost::shared_ptr< std::vector<size_t> > dims = forward_kernel->get_dimensions();
+        const size_t NDim = dims->size();
+        if ( NDim < 2 )
+        {   // NDim-1 / NDim-2 are unsigned and would wrap around, indexing out of bounds
+            GADGET_ERROR_MSG("gtPlusSPIRITOperator<T>::setForwardKernel(...) : kernel must have at least two dimensions ... ");
+            return false;
+        }
+        // dimSrc = kernel dims without the last one; dimDst = the same, but ending in the last kernel dim
+        std::vector<size_t> dimSrc(dims->begin(), dims->end()-1), dimDst(dimSrc);
+        dimDst[NDim-2] = (*dims)[NDim-1];
+
+        kspace_.create(dimSrc);
+        complexIm_.create(dimSrc);
+        res_after_apply_kernel_.create(dims);
+        res_after_apply_kernel_sum_over_.create(dimDst);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::setForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Store a precomputed (G-I)'(G-I) kernel and size the helper buffers from its dimensions.
+// Returns false if the kernel has fewer than two dimensions or if an exception is caught.
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::
+setAdjointForwardKernel(boost::shared_ptr< hoNDArray<T> >& adjoint_forward_kernel)
+{
+    try
+    {
+        adjoint_forward_kernel_ = adjoint_forward_kernel;
+
+        // allocate the helper memory
+        boost::shared_ptr< std::vector<size_t> > dims = adjoint_forward_kernel_->get_dimensions();
+        const size_t NDim = dims->size();
+        if ( NDim < 2 )
+        {   // NDim-1 / NDim-2 are unsigned and would wrap around, indexing out of bounds
+            GADGET_ERROR_MSG("gtPlusSPIRITOperator<T>::setAdjointForwardKernel(...) : kernel must have at least two dimensions ... ");
+            return false;
+        }
+
+        // dimSrc = kernel dims without the last one; dimDst = the same, but ending in the last kernel dim
+        std::vector<size_t> dimSrc(dims->begin(), dims->end()-1), dimDst(dimSrc);
+        dimDst[NDim-2] = (*dims)[NDim-1];
+
+        kspace_.create(dimSrc);
+        complexIm_.create(dimSrc);
+        res_after_apply_kernel_.create(dims);
+        res_after_apply_kernel_sum_over_.create(dimDst);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::setAdjointForwardKernel(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Apply the SPIRIT operator to kspace x, result in y; returns false only when an exception is caught.
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        // symmetric equation: Dc(G-I)'(G-I)Dc'x ; non-symmetric equation: (G-I)Dc'x
+        // the commented-out block below appears to be an earlier variant with timers and result checks
+        // Dc'x
+        //gt_timer1_.start("1");
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, x, y));
+        //gt_timer1_.stop();
+
+        //// x to image domain
+        //gt_timer1_.start("2");
+        //GADGET_CHECK_RETURN_FALSE(this->convertToImage(y, complexIm_));
+        //gt_timer1_.stop();
+
+        //// apply kernel and sum
+        //gt_timer1_.start("3");
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexIm_, *adjoint_forward_kernel_, res_after_apply_kernel_));
+        //gt_timer1_.stop();
+
+        //gt_timer1_.start("4");
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+        //gt_timer1_.stop();
+
+        //// go back to kspace 
+        //gt_timer1_.start("5");
+        //GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(res_after_apply_kernel_sum_over_, y));
+        //gt_timer1_.stop();
+
+        //// apply Dc
+        //gt_timer1_.start("6");
+        //GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, y, y));
+        //gt_timer1_.stop();
+        // live code starts here with Dc'x; NOTE(review): the return values of the calls below are ignored
+        Gadgetron::multiply(unacquired_points_indicator_, x, y);
+
+        // x to image domain
+        this->convertToImage(y, complexIm_);
+
+        // apply kernel and sum: (G-I)'(G-I) for the symmetric equation, G-I otherwise
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multipleMultiply(complexIm_, *adjoint_forward_kernel_, res_after_apply_kernel_);
+        }
+        else
+        {
+            Gadgetron::multipleMultiply(complexIm_, *forward_kernel_, res_after_apply_kernel_);
+        }
+
+        Gadgetron::sumOverSecondLastDimension(res_after_apply_kernel_, res_after_apply_kernel_sum_over_);
+
+        // go back to kspace 
+        this->convertToKSpace(res_after_apply_kernel_sum_over_, y);
+
+        // apply Dc (symmetric equation only)
+        if ( use_symmetric_spirit_ )
+        {
+            Gadgetron::multiply(unacquired_points_indicator_, y, y);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::forwardOperator(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Adjoint operator: y = Dc(G-I)'x, or forwardOperator(x, y) when the symmetric equation is in use.
+template <typename T>
+bool gtPlusSPIRITOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        if ( !use_symmetric_spirit_ )
+        {
+            // non-symmetric equation: Dc(G-I)'x (return values of the calls below are not checked)
+            // kspace -> image domain
+            this->convertToImage(x, complexIm_);
+
+            // multiply with the adjoint kernel (G-I)', then sum
+            Gadgetron::multipleMultiply(complexIm_, *adjoint_kernel_, res_after_apply_kernel_);
+            Gadgetron::sumOverSecondLastDimension(res_after_apply_kernel_, res_after_apply_kernel_sum_over_);
+
+            // image domain -> kspace
+            this->convertToKSpace(res_after_apply_kernel_sum_over_, y);
+
+            // apply Dc in place
+            Gadgetron::multiply(unacquired_points_indicator_, y, y);
+        }
+        else
+        {
+            // Dc(G-I)'(G-I)Dc' is symmetric, so the forward operator is reused
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(x, y));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::adjointOperator(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Right hand side b of the SPIRIT equation for kspace x; returns false if a checked step fails or anything throws.
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::computeRighHandSide(const hoNDArray<T>& x, hoNDArray<T>& b)
+{
+    try
+    {
+        // symmetric: -Dc(G-I)'(G-I)D'x
+        // non-symmetric: -(G-I)D'x
+
+        // D'x, need to do nothing, acquired points are already in place
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+
+        // apply kernel and sum: (G-I)'(G-I) for the symmetric equation, G-I otherwise
+        if ( use_symmetric_spirit_ )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexIm_, *adjoint_forward_kernel_, res_after_apply_kernel_));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexIm_, *forward_kernel_, res_after_apply_kernel_));
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+
+        // go back to kspace; the result lands in b
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(res_after_apply_kernel_sum_over_, b));
+
+        // apply Dc (symmetric equation only)
+        if ( use_symmetric_spirit_ )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, b, b));
+        }
+
+        // multiply by -1
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)(-1.0), b));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::computeRighHandSide(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Gradient of ||(G-I)(Dc'x+D'y)||2, written to g; returns false if a checked step fails or anything throws.
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // gradient of L2 norm is
+        // 2*Dc*(G-I)'(G-I)(D'y+Dc'x)
+
+        // D'y+Dc'x: multiply x by the unacquired-points indicator, then add the acquired points
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, x, kspace_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(*acquired_points_, kspace_, kspace_));
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(kspace_, complexIm_));
+
+        // apply the (G-I)'(G-I) kernel and sum
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexIm_, *adjoint_forward_kernel_, res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+
+        // go back to kspace; the result lands in g
+        GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(res_after_apply_kernel_sum_over_, g));
+
+        // apply Dc
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, g, g));
+
+        // multiply by 2
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (typename realType<T>::Type)(2.0), g));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::grad(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Cost value ||(G-I)(Dc'x+D'y)||2, written to obj; returns false if a checked step fails or anything throws.
+template <typename T> 
+bool gtPlusSPIRITOperator<T>::obj(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // L2 norm
+        // ||(G-I)(D'y+Dc'x)||2
+
+        // D'y+Dc'x: multiply x by the unacquired-points indicator, then add the acquired points
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, x, kspace_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(*acquired_points_, kspace_, kspace_));
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(kspace_, complexIm_));
+
+        // apply the forward kernel G-I and sum
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexIm_, *forward_kernel_, res_after_apply_kernel_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(res_after_apply_kernel_, res_after_apply_kernel_sum_over_));
+
+        // L2 norm: dotc of the summed result with itself
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::dotc(res_after_apply_kernel_sum_over_, res_after_apply_kernel_sum_over_, obj));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusSPIRITOperator<T>::obj(...) ... ");
+        return false;
+    }
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWavelet2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWavelet2DOperator.h
new file mode 100644
index 0000000..79a63dd
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWavelet2DOperator.h
@@ -0,0 +1,370 @@
+/** \file       gtPlusWavelet2DOperator.h
+    \brief      Implement 2D wavelet operator for L1 regularization
+    \author     Hui Xue
+
+    Redundant haar wavelet transformation is implemented here.
+*/
+
+#pragma once
+
+#include "gtPlusWaveletOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWavelet2DOperator : public gtPlusWaveletOperator<T>
+{
+public:
+    // 2D redundant Haar wavelet operator, applied slice by slice over the first two dimensions
+    typedef gtPlusWaveletOperator<T> BaseClass;
+
+    gtPlusWavelet2DOperator();
+    virtual ~gtPlusWavelet2DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    // x: [RO E1 ...]
+    // y: [RO E1 W ...]
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    // x: [RO E1 W ...]
+    // y: [RO E1 ...]
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // perform the redundant haar wavelet forward transform
+    // in : [RO E1], out : [RO E1 1+3*level]
+    bool dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    // perform the redundant haar wavelet inverse transform
+    // in : [RO E1 1+3*level], out : [RO E1]
+    bool idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+    // using-declarations: let the member functions name these inherited members without this->
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+// Default constructor: only invokes the default constructor of the gtPlusWaveletOperator<T> base.
+template <typename T>
+gtPlusWavelet2DOperator<T>::gtPlusWavelet2DOperator()
+    : BaseClass()
+{}
+
+// Destructor: the class declares no data members of its own, so the body is empty.
+template <typename T>
+gtPlusWavelet2DOperator<T>::~gtPlusWavelet2DOperator()
+{}
+
+// Forward transform: runs dwtRedundantHaar on every [RO E1] slice of x; y becomes [RO E1 W ...]. False only on a caught exception.
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+        // the first two dimensions of x are read unconditionally; W = 1 approximation + 3 detail planes per level
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t W = 1+3*numOfWavLevels_;
+        // output shape: W inserted as the third dimension, remaining dimensions of x shifted up by one
+        std::vector<size_t> dimR(NDim+1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+        dimR[2] = W;
+
+        size_t n;
+        for ( n=2; n<NDim; n++ )
+        {
+            dimR[n+1] = (*dims)[n];
+        }
+        // (re)create y only when its shape differs from dimR
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+        // number of [RO E1] slices in x
+        size_t num = x.get_number_of_elements()/(RO*E1);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+        int t;
+        // one transform per slice; NOTE(review): t is signed while num is unsigned, and the result of dwtRedundantHaar is dropped
+        #pragma omp parallel for default(none) private(t) shared(num, RO, E1, W, pX, pY)
+        for ( t=0; t<num; t++ )
+        {
+            hoNDArray<T> in(RO, E1, pX+t*RO*E1);
+            hoNDArray<T> out(RO, E1, W, pY+t*RO*E1*W);
+            this->dwtRedundantHaar(in, out, numOfWavLevels_);
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet2DOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Adjoint transform: runs idwtRedundantHaar on every [RO E1 W] block of x; y becomes [RO E1 ...]. False only on a caught exception.
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+        // the first three dimensions of x are read unconditionally; W is taken from x, not recomputed
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t W = (*dims)[2];
+        // output shape: the W dimension is removed, remaining dimensions of x shifted down by one
+        std::vector<size_t> dimR(NDim-1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+
+        size_t n;
+        for ( n=2; n<NDim-1; n++ )
+        {
+            dimR[n] = (*dims)[n+1];
+        }
+        // (re)create y only when its shape differs from dimR
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+        // number of [RO E1 W] blocks in x
+        size_t num = x.get_number_of_elements()/(RO*E1*W);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+        int t;
+        // NOTE(review): numOfWavLevels_ is passed as level, so this presumably expects W == 1+3*numOfWavLevels_ -- confirm
+        #pragma omp parallel for default(none) private(t) shared(num, RO, E1, W, pX, pY)
+        for ( t=0; t<num; t++ )
+        {
+            hoNDArray<T> in(RO, E1, W, pX+t*RO*E1*W);
+            hoNDArray<T> out(RO, E1, pY+t*RO*E1);
+            this->idwtRedundantHaar(in, out, numOfWavLevels_);
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet2DOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Redundant Haar forward transform: in [RO E1] -> out [RO E1 1+3*level]; returns false only on a caught exception.
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        size_t RO = in.get_size(0);
+        size_t E1 = in.get_size(1);
+        T scaleFactor = 0.5;
+        // plane 0 of out starts as a copy of in and is updated in place at every level
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1);
+
+        for (size_t n=0; n<level; n++)
+        {
+            T* LH = pOut + (3*n+1)*RO*E1;
+            // pass 1, along E1 for every ro: LH = sample - next sample, pOut = sample + next sample
+            int ro;
+            #pragma omp parallel for default(none) private(ro) shared(RO, E1, pOut, LH)
+            for (ro=0; ro<RO; ro++)
+            {
+                T v1 = pOut[ro];
+                // v1 keeps the e1=0 sample, which is overwritten before the last e1 pairs with it
+                int ii=ro, e1;
+                for (e1=0; e1<E1-1; e1++)
+                {
+                    LH[ii] = pOut[ii] - pOut[ii+RO];
+                    pOut[ii] += pOut[ii+RO];
+                    ii+=RO;
+                }
+                // last e1 wraps around to the saved first sample
+                LH[ii] = pOut[ii] - v1;
+                pOut[ii] += v1;
+            }
+
+            Gadgetron::scal( scaleFactor, pOut, RO*E1);
+            Gadgetron::scal( scaleFactor, LH, RO*E1);
+            // planes 3n+1, 3n+2, 3n+3 of out hold LH, HL, HH of level n
+            T* HL = LH + RO*E1;
+            T* HH = HL + RO*E1;
+            // pass 2, along RO for every e1, applied to both pOut (difference -> HL) and LH (difference -> HH)
+            int e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, pOut, LH, HL, HH)
+            for (e1=0; e1<E1; e1++)
+            {
+                T v1 = pOut[e1*RO];
+                T v2 = LH[e1*RO];
+
+                size_t ii = e1*RO;
+                for (int ro=0; ro<RO-1; ro++)
+                {
+                    HH[ii] = LH[ii] - LH[ii+1];
+                    LH[ii] += LH[ii+1];
+
+                    HL[ii] = pOut[ii] - pOut[ii+1];
+                    pOut[ii] += pOut[ii+1];
+
+                    ii++;
+                }
+                // last ro wraps around to the saved first samples of this row
+                HH[ii] = LH[ii] - v2;
+                LH[ii] += v2;
+
+                HL[ii] = pOut[ii] - v1;
+                pOut[ii] += v1;
+            }
+
+            Gadgetron::scal( scaleFactor, pOut, RO*E1);
+            Gadgetron::scal( scaleFactor, LH, RO*E1);
+            Gadgetron::scal( scaleFactor, HL, RO*E1);
+            Gadgetron::scal( scaleFactor, HH, RO*E1);
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet2DOperator<T>::dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Redundant Haar inverse transform: in [RO E1 1+3*level] -> out [RO E1], undoing the levels from last to first.
+template <typename T> 
+bool gtPlusWavelet2DOperator<T>::
+idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        size_t RO = in.get_size(0);
+        size_t E1 = in.get_size(1);
+        // out starts as a copy of plane 0 of in; the detail planes are only read, through pIn
+        T* pIn = const_cast<T*>(in.begin());
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1);
+        // per-call scratch plane of RO*E1 elements
+        hoNDArray<T> tmp(RO*E1);
+        T* pTmp = tmp.begin();
+
+        T scaleFactor = 0.5;
+        int n;
+        for (n=level-1; n>=0; n--)
+        {
+            T* LH = pIn + (3*n+1)*RO*E1;
+            T* HL = LH + RO*E1;
+            T* HH = HL + RO*E1;
+            // pass 1, along RO for every e1: pOut is combined with HL, pTmp is built from LH and HH
+            int e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, pOut, LH, HL, HH, pTmp)
+            for (e1=0; e1<E1; e1++)
+            {
+                size_t ii = e1*RO+RO-1;
+                // values at the last ro of this row, saved for the wrap-around step at ro=0
+                T vLL = pOut[ii];
+                T vLH = LH[ii];
+                T vHL = HL[ii];
+                T vHH = HH[ii];
+                // ro runs downwards, so pOut[ii-1] is read before it is modified
+                for (int ro=RO-1; ro>0; ro--)
+                {
+                    // ii = e1*RO + ro;
+                    pOut[ii] += pOut[ii-1] + HL[ii] - HL[ii-1];
+                    pTmp[ii] = LH[ii] + LH[ii-1] + HH[ii] - HH[ii-1];
+
+                    ii--;
+                }
+
+                // ii -= 1;
+                /*pOut[ii] += HL[ii] + vLL - vLH;
+                pTmp [ii] = LH[ii] + HH[ii] + vHL - vHH;*/
+                // ro=0 pairs with the saved last-ro values
+                pOut[ii] += vLL + HL[ii] - vHL;
+                pTmp [ii] = LH[ii] + vLH + HH[ii] - vHH;
+            }
+
+            Gadgetron::scal( scaleFactor, pOut, RO*E1);
+            Gadgetron::scal( scaleFactor, pTmp, RO*E1);
+            // pass 2, along E1 for every ro: pOut is combined with pTmp
+            int ro;
+            #pragma omp parallel for default(none) private(ro) shared(RO, E1, pOut, pTmp)
+            for (ro=0; ro<RO; ro++)
+            {
+                size_t ii = (E1-1)*RO+ro;
+                T vLL = pOut[ii];
+                T vLH = pTmp [ii];
+                // e1 runs downwards, so the previous e1 (ii-RO) is read before it is modified
+                for (int e1=E1-1; e1>0; e1--)
+                {
+                    // ii = e1*RO + ro;
+                    pOut[ii] += pTmp[ii] + pOut[ii-RO] - pTmp[ii-RO];
+                    ii -= RO;
+                }
+                // e1=0 pairs with the values saved from the last e1
+                pOut[ro] += pTmp[ro] + vLL - vLH;
+            }
+
+            Gadgetron::scal( scaleFactor, pOut, RO*E1);
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet2DOperator<T>::idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Print a three-line banner naming this operator to os.
+template <typename T>
+void gtPlusWavelet2DOperator<T>::printInfo(std::ostream& os)
+{
+    // one chained insertion; every line still ends in std::endl, so os is flushed after each line
+    os << "-------------- GTPlus ISMRMRD wavelet 2D operator --------------------" << std::endl
+       << "Wavelet 2D operator for gtPlus ISMRMRD package" << std::endl
+       << "----------------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWavelet3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWavelet3DOperator.h
new file mode 100644
index 0000000..c9d4b5d
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWavelet3DOperator.h
@@ -0,0 +1,1170 @@
+/** \file       gtPlusWavelet3DOperator.h
+    \brief      Implement 3D wavelet operator for L1 regularization
+    \author     Hui Xue
+
+    Redundant haar wavelet transformation is implemented here.
+*/
+
+#pragma once
+
+#include "gtPlusWaveletOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+// 3D wavelet operator for L1 regularization.
+// The transform itself is a redundant Haar wavelet (dwtRedundantHaar / idwtRedundantHaar);
+// forwardOperator / adjointOperator apply it to every [RO E1 E2] volume of a larger array.
+template <typename T> 
+class gtPlusWavelet3DOperator : public gtPlusWaveletOperator<T>
+{
+public:
+
+    typedef gtPlusWaveletOperator<T> BaseClass;
+
+    gtPlusWavelet3DOperator();
+    virtual ~gtPlusWavelet3DOperator();
+
+    // print a short description of this operator to os
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator (wavelet decomposition)
+    // x : [RO E1 CHA E2 ...]
+    // y : [RO E1 E2 W CHA ...], W = 1+7*numOfWavLevels_
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // adjoint operator
+    // x : [RO E1 E2 W CHA ...]
+    // y : [RO E1 CHA E2 ...]
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y);
+
+    // perform the redundant haar wavelet forward transform
+    // in : [RO E1 E2], out : [RO E1 E2 1+7*level]
+    // out is written through its raw pointer, so it must already hold RO*E1*E2*(1+7*level) elements
+    bool dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    // perform the redundant haar wavelet inverse transform
+    // in : [RO E1 E2 1+7*level], out : [RO E1 E2]
+    // out is written through its raw pointer, so it must already hold RO*E1*E2 elements
+    bool idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level);
+
+    // compute L1 norm of wavelet coefficients across CHA
+    // waveCoeff: [RO E1 E2 W CHA ...], W is the wavelet coefficient dimension (e.g. for 1 level wavelet decomposition, W=4 for 2D and W=8 for 3D)
+    // the first entry along W (W=1) holds the lowest-frequency (approximation) coefficients
+    virtual bool L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm);
+
+    // to compute the gradient of wavelet term, divide the wavelet coefficients by its norm
+    // if processApproxCoeff = true, the lowest-frequency coefficients are changed as well; otherwise, they remain unchanged
+    virtual bool divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff=false);
+
+    // soft-threshold or shrink the wavelet coefficients
+    // the threshold that is actually applied is mask.*thres
+    virtual bool shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff=false);
+
+    // if the sensitivity S is set, compute gradient of ||wav*F'*S'*(Dc'x+D'y)||1
+    // if not, compute gradient of ||wav*F'*(Dc'x+D'y)||1
+    // x represents the unacquired kspace points [RO E1 CHA E2]
+    // virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute cost value of L1 norm ||wav*F'*S'*(Dc'x+D'y)||1
+    // if not, compute cost value of L1 norm ||wav*F'*(Dc'x+D'y)||1
+    // virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    // scaling along RO
+    bool firstDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor);
+    // scaling along E1
+    bool secondDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor);
+    // scaling along E2
+    bool thirdDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor);
+
+    // because the spatial resolution of images is often different in the through-plane dimension than in the other two dimensions,
+    // sometimes it is good to take this into account, so the regularization effects are more isotropic
+    // Here only simple scaling factors are used
+    // More generally, a weighting matrix can be concatenated with wavelet coefficients to enhance or suppress regularization effects as needed
+    // the regularization term can become ||W*wav*F'*(Dc'x+D'y)||1, W is the general weighting matrix
+    // in the next version, we shall extend this class with a more general weighting strategy
+    T scale_factor_first_dimension_;
+    T scale_factor_second_dimension_;
+    T scale_factor_third_dimension_;
+
+    // members inherited from the (dependent) base class, made visible without this->
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // compute gradient on the assembled kspace
+    virtual bool gradTask(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute the obj on the assembled kspace
+    virtual bool objTask(const hoNDArray<T>& x, T& obj);
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::complexIm_norm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::wav_coeff_norm_approx_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+// Default constructor: the three per-dimension scale factors start at 1.0.
+// The base class is named first in the initializer list because base sub-objects are
+// always constructed before data members; writing the list in the real initialization
+// order avoids -Wreorder warnings and misleading reading order.
+template <typename T> 
+gtPlusWavelet3DOperator<T>::gtPlusWavelet3DOperator() : 
+        BaseClass(), 
+        scale_factor_first_dimension_(1.0), 
+        scale_factor_second_dimension_(1.0), 
+        scale_factor_third_dimension_(1.0)
+{
+
+}
+
+// Destructor; the body is intentionally empty.
+template <typename T> 
+gtPlusWavelet3DOperator<T>::~gtPlusWavelet3DOperator() {}
+
+// Forward operator: run dwtRedundantHaar on every [RO E1 E2] volume contained in x.
+//
+// x : [RO E1 CHA E2 ...], must have at least 4 dimensions
+// y : [RO E1 E2 W CHA ...] with W = 1+7*numOfWavLevels_; y is (re)created when its
+//     dimensions do not already match
+//
+// Returns false when x has fewer than 4 dimensions or when an exception is caught,
+// true otherwise. The return value of dwtRedundantHaar is not inspected inside the
+// parallel loops.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+
+        // (*dims)[3] is read and dimR[4] is written below; both are out of bounds for NDim < 4
+        GADGET_CHECK_RETURN_FALSE(NDim >= 4);
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t CHA = (*dims)[2];
+        size_t E2 = (*dims)[3];
+        size_t W = 1+7*numOfWavLevels_;
+
+        // output has one more dimension (W) than the input, with E2 moved in front of CHA
+        std::vector<size_t> dimR(NDim+1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+        dimR[2] = E2;
+        dimR[3] = W;
+        dimR[4] = CHA;
+
+        // dimensions beyond the 4th are carried over, shifted by one
+        size_t n;
+        for ( n=4; n<NDim; n++ )
+        {
+            dimR[n+1] = (*dims)[n];
+        }
+
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+
+        // number of [RO E1 CHA E2] blocks in x
+        size_t num = x.get_number_of_elements()/(RO*E1*E2*CHA);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+
+        int t;
+
+        if ( CHA == 1 )
+        {
+            // single channel: [RO E1 1 E2] has the same memory layout as [RO E1 E2], no permutation needed
+            #pragma omp parallel for default(none) private(t) shared(num, RO, E1, E2, W, pX, pY) if ( num > 1)
+            for ( t=0; t<num; t++ )
+            {
+                hoNDArray<T> in(RO, E1, E2, pX+t*RO*E1*E2);
+                hoNDArray<T> out(RO, E1, E2, W, pY+t*RO*E1*E2*W);
+                this->dwtRedundantHaar(in, out, numOfWavLevels_);
+            }
+        }
+        else
+        {
+            #pragma omp parallel default(none) private(t) shared(num, RO, E1, CHA, E2, W, pX, pY) if ( num > 1 )
+            {
+                // one permutation buffer per thread, reused for every block that thread processes
+                hoNDArray<T> inPermute(RO, E1, E2, CHA);
+
+                #pragma omp for
+                for ( t=0; t<num; t++ )
+                {
+                    // [RO E1 CHA E2] -> [RO E1 E2 CHA], so each channel is a contiguous [RO E1 E2] volume
+                    hoNDArray<T> in(RO, E1, CHA, E2, pX+t*RO*E1*CHA*E2);
+                    Gadgetron::permuteLastTwoDimensions(in, inPermute);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> in_dwt(RO, E1, E2, inPermute.begin()+cha*RO*E1*E2);
+                        hoNDArray<T> out(RO, E1, E2, W, pY+t*RO*E1*E2*W*CHA+cha*RO*E1*E2*W);
+
+                        this->dwtRedundantHaar(in_dwt, out, numOfWavLevels_);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Adjoint operator: run idwtRedundantHaar on every [RO E1 E2 W] coefficient block in x.
+//
+// x : [RO E1 E2 W CHA ...], must have at least 5 dimensions
+// y : [RO E1 CHA E2 ...]; y is (re)created when its dimensions do not already match
+//
+// Returns false when x has fewer than 5 dimensions or when an exception is caught,
+// true otherwise. The return value of idwtRedundantHaar is not inspected inside the
+// parallel loops.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = x.get_dimensions();
+        size_t NDim = dims->size();
+
+        // (*dims)[4] is read and dimR[3] is written below; both need NDim >= 5
+        GADGET_CHECK_RETURN_FALSE(NDim >= 5);
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t E2 = (*dims)[2];
+        size_t W = (*dims)[3];
+        size_t CHA = (*dims)[4];
+
+        // output drops the W dimension and has CHA in front of E2 again
+        std::vector<size_t> dimR(NDim-1);
+        dimR[0] = RO;
+        dimR[1] = E1;
+        dimR[2] = CHA;
+        dimR[3] = E2;
+
+        // dimensions beyond the 5th are carried over, shifted by one
+        size_t n;
+        for ( n=4; n<NDim-1; n++ )
+        {
+            dimR[n] = (*dims)[n+1];
+        }
+
+        if ( !y.dimensions_equal(&dimR) )
+        {
+            y.create(&dimR);
+        }
+
+        // number of [RO E1 E2 W CHA] blocks in x
+        size_t num = x.get_number_of_elements()/(RO*E1*E2*W*CHA);
+
+        T* pX = const_cast<T*>(x.begin());
+        T* pY = y.begin();
+
+        int t;
+
+        if ( CHA == 1 )
+        {
+            // single channel: [RO E1 1 E2] has the same memory layout as [RO E1 E2], no permutation needed
+            #pragma omp parallel for default(none) private(t) shared(num, RO, E1, E2, W, pX, pY) if ( num > 1)
+            for ( t=0; t<num; t++ )
+            {
+                hoNDArray<T> in(RO, E1, E2, W, pX+t*RO*E1*E2*W);
+                hoNDArray<T> out(RO, E1, E2, pY+t*RO*E1*E2);
+                this->idwtRedundantHaar(in, out, numOfWavLevels_);
+            }
+        }
+        else
+        {
+            #pragma omp parallel default(none) private(t) shared(num, RO, E1, CHA, E2, W, pX, pY) if ( num > 1 ) num_threads( ((num>16) ? 16 : num))
+            {
+                // one [RO E1 E2 CHA] buffer per thread, reused for every block that thread processes
+                hoNDArray<T> outPermute(RO, E1, E2, CHA);
+
+                #pragma omp for
+                for ( t=0; t<num; t++ )
+                {
+                    hoNDArray<T> out(RO, E1, CHA, E2, pY+t*RO*E1*CHA*E2);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        // the input offset must include the block index t (as the output offset and
+                        // forwardOperator do); without it every block re-reads the coefficients of block 0
+                        hoNDArray<T> in(RO, E1, E2, W, pX+t*RO*E1*E2*W*CHA+cha*RO*E1*E2*W);
+                        hoNDArray<T> out_idwt(RO, E1, E2, outPermute.begin()+cha*RO*E1*E2);
+
+                        this->idwtRedundantHaar(in, out_idwt, numOfWavLevels_);
+                    }
+
+                    // [RO E1 E2 CHA] -> [RO E1 CHA E2]
+                    Gadgetron::permuteLastTwoDimensions(outPermute, out);
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Combine the wavelet coefficients across the 5th (CHA) dimension.
+//
+// wavCoeff     : [RO E1 E2 W CHA ...], must have at least 5 dimensions
+// wavCoeffNorm : same dimensions as wavCoeff but with the 5th set to 1; (re)created if needed
+//
+// The result is multiplyConj(wavCoeff, wavCoeff) summed over the 5th dimension; no square
+// root is taken here. complexIm_norm_ is used as scratch storage for the product.
+//
+// Returns false when wavCoeff has fewer than 5 dimensions, when one of the helper calls
+// fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = wavCoeff.get_dimensions();
+
+        // dimR[4] is written below, which is out of bounds for fewer than 5 dimensions
+        GADGET_CHECK_RETURN_FALSE(dims->size() >= 5);
+
+        std::vector<size_t> dimR(*dims);
+        dimR[4] = 1;
+
+        if ( !wavCoeffNorm.dimensions_equal(&dimR) )
+        {
+            wavCoeffNorm.create(&dimR);
+        }
+
+        // square the coefficients
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(wavCoeff, wavCoeff, complexIm_norm_));
+        // sum over CHA
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver5thDimension(complexIm_norm_, wavCoeffNorm));
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Scale the wavelet coefficients in place by a weight derived from their norm.
+//
+// wavCoeff           : [RO E1 E2 W CHA ...], modified in place
+// wavCoeffNorm       : norm values; only the real part of each element is used
+// mu                 : added to the norm before the weight is computed (real part only)
+// p                  : exponent selector (real part only); |p| within 0.001 of 1 selects 1/sqrt(norm+mu),
+//                      otherwise the weight is (norm+mu)^(p/2-1)
+// processApproxCoeff : if true all W entries are weighted, otherwise the first entry along W is left untouched
+//
+// The weights are stored in the member buffer wav_coeff_norm_approx_.
+// T is expected to provide .real() (it is called on mu, p and the norm elements).
+// Returns false if a helper call fails or an exception is caught.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t E2 = wavCoeff.get_size(2);
+        size_t W = wavCoeff.get_size(3);
+        size_t CHA = wavCoeff.get_size(4);
+
+        // weight buffer gets the same dimensions as the norm array
+        if ( !wav_coeff_norm_approx_.dimensions_equal( &wavCoeffNorm ) )
+        {
+            wav_coeff_norm_approx_.create( wavCoeffNorm.get_dimensions() );
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pBuf = wav_coeff_norm_approx_.begin();
+
+        if ( GT_ABS(std::abs(p) - 1.0) < 0.001 )
+        {
+            // p ~= 1 : weight = 1/sqrt(norm + mu)
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = 1.0 / std::sqrt( pCoeffNorm[ii].real() + mu.real() );
+            }
+        }
+        else
+        {
+            // general p : weight = (norm + mu)^(p/2 - 1), evaluated in double precision
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu, p)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = std::pow( (double)(pCoeffNorm[ii].real() + mu.real()), (double)(p.real()/2.0-1.0) );
+            }
+        }
+
+        if ( processApproxCoeff )
+        {
+            // weight every coefficient, including the first W entry
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver5thDimension(wav_coeff_norm_approx_, wavCoeff, wavCoeff));
+        }
+        else
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver5thDimensionExcept(wav_coeff_norm_approx_, wavCoeff, 0, wavCoeff, true));
+            // number of [RO E1 E2 W CHA] blocks
+            size_t num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W*CHA);
+
+            // the two pragma variants differ only in whether the reference parameters are listed as shared
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, W, CHA) if ( num > 1 )
+            #else
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            #endif
+            {
+
+                #pragma omp for
+                for ( ii=0; ii<num; ii++ )
+                {
+                    // weights for the last W-1 entries; the "+RO*E1*E2" skips the first W entry
+                    hoNDArray<T> wavCoeffNormCurr(RO, E1, E2, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        // same W-1 entries of channel cha, multiplied in place by the shared weights
+                        hoNDArray<T> wavCoeffCurr(RO, E1, E2, W-1, wavCoeff.begin()+ii*RO*E1*E2*W*CHA+cha*RO*E1*E2*W+RO*E1*E2);
+                        Gadgetron::multiply(wavCoeffNormCurr, wavCoeffCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Soft-threshold (shrink) the wavelet coefficients in place.
+//
+// wavCoeff           : [RO E1 E2 W CHA ...], modified in place
+// wavCoeffNorm       : squared magnitude per coefficient (a square root is taken here); real part only
+// thres              : threshold
+// mask               : if it has the same dimensions as wavCoeffNorm, the applied threshold is thres*mask
+//                      element by element; otherwise thres is applied uniformly
+// processApproxCoeff : if true the first W entry (approximation coefficients) is shrunk as well,
+//                      otherwise it is left untouched
+//
+// Scratch members used: wav_coeff_norm_approx_ (magnitude), res_after_apply_kernel_ (1/magnitude)
+// and complexIm_ (wavCoeff multiplied by 1/magnitude, described as the phase by the in-code comment).
+// Returns false if a helper call fails or an exception is caught.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = wavCoeff.get_dimensions();
+
+        size_t RO = (*dims)[0];
+        size_t E1 = (*dims)[1];
+        size_t E2 = (*dims)[2];
+        size_t W = (*dims)[3];
+        size_t CHA = (*dims)[4];
+
+        if ( !wav_coeff_norm_approx_.dimensions_equal(&wavCoeffNorm) )
+        {
+            wav_coeff_norm_approx_.create(wavCoeffNorm.get_dimensions());
+        }
+
+        if ( !res_after_apply_kernel_.dimensions_equal(&wavCoeffNorm) )
+        {
+            res_after_apply_kernel_.create(wavCoeffNorm.get_dimensions());
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+        long long N4D = RO*E1*E2*W;
+
+        // number of [RO E1 E2 W] blocks in the norm array
+        long long num = N/N4D;
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pMag = wav_coeff_norm_approx_.begin();
+        T* pMagInv = res_after_apply_kernel_.begin();
+
+        // magnitude and its inverse; DBL_EPSILON avoids a division by zero
+        #pragma omp parallel for default(none) private(ii) shared(N, pMag, pMagInv, pCoeffNorm)
+        for ( ii=0; ii<N; ii++ )
+        {
+            pMag[ii] = std::sqrt( pCoeffNorm[ii].real() );
+            pMagInv[ii] = 1.0/(pMag[ii].real()+DBL_EPSILON);
+        }
+
+        // phase does not change
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver5thDimension(res_after_apply_kernel_, wavCoeff, complexIm_));
+
+        // shrink the magnitude
+        if ( mask.dimensions_equal(&wavCoeffNorm) )
+        {
+            const T* pMask = mask.begin();
+
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                // the approximation coefficients are the first RO*E1*E2 elements of every
+                // [RO E1 E2 W] block; skip all of them unless they are to be processed
+                long long s=RO*E1*E2; 
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                const T* pMaskCurr = pMask + n*N4D;
+                T* pMagCurr = pMag + n*N4D;
+
+                long long nn;
+
+                #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, pMaskCurr, thres)
+                for ( nn=s; nn<N4D; nn++ )
+                {
+                    if ( std::abs(pMagCurr[nn]) < std::abs(thres*pMaskCurr[nn]) )
+                    {
+                        pMagCurr[nn] = 0;
+                    }
+                    else
+                    {
+                        // subtract the same masked threshold that was used in the comparison;
+                        // subtracting the unmasked thres could drive the magnitude negative
+                        pMagCurr[nn] -= thres*pMaskCurr[nn];
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                // skip the whole approximation volume (RO*E1*E2 elements) unless requested
+                long long s=RO*E1*E2; 
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                T* pMagCurr = pMag + n*N4D;
+
+                long long nn;
+                #pragma omp parallel for private(nn) shared(s, N4D, pMagCurr, thres)
+                for ( nn=s; nn<N4D; nn++ )
+                {
+                    if ( std::abs(pMagCurr[nn]) < std::abs(thres) )
+                    {
+                        pMagCurr[nn] = 0;
+                    }
+                    else
+                    {
+                        pMagCurr[nn] -= thres;
+                    }
+                }
+            }
+        }
+
+        // recombine the shrunk magnitude with the unchanged phase
+        if ( processApproxCoeff )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver5thDimension(wav_coeff_norm_approx_, complexIm_, wavCoeff));
+        }
+        else
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver5thDimensionExcept(wav_coeff_norm_approx_, complexIm_, 0, wavCoeff, false));
+
+            // number of [RO E1 E2 W CHA] blocks (shadows the outer num on purpose of the original code)
+            size_t num = wavCoeff.get_number_of_elements()/(RO*E1*E2*W*CHA);
+
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, W, CHA) if ( num > 1 )
+            #else
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, E2, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            #endif
+            {
+
+                #pragma omp for
+                for ( ii=0; ii<num; ii++ )
+                {
+                    // only the last W-1 entries are written back; "+RO*E1*E2" skips the approximation coefficients,
+                    // which therefore stay untouched in wavCoeff
+                    hoNDArray<T> magCurr(RO, E1, E2, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*E2*W+RO*E1*E2);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> phaseCurr(RO, E1, E2, W-1, complexIm_.begin()+ii*RO*E1*E2*W*CHA+cha*RO*E1*E2*W+RO*E1*E2);
+                        hoNDArray<T> wavCoeffCurr(RO, E1, E2, W-1, wavCoeff.begin()+ii*RO*E1*E2*W*CHA+cha*RO*E1*E2*W+RO*E1*E2);
+
+                        Gadgetron::multiply(magCurr, phaseCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Redundant Haar forward transform of one 3D volume.
+//
+// in    : [RO E1 E2]
+// out   : [RO E1 E2 1+7*level]; written through its raw pointer and never resized here,
+//         so it must already hold RO*E1*E2*(1+7*level) elements
+// level : number of decomposition levels
+//
+// Layout of out: the first RO*E1*E2 elements hold the running approximation band (lll),
+// updated in place at every level; level n writes its seven detail bands, in the order
+// llh, lhl, lhh, hll, hlh, hhl, hhh, starting at element offset (1+7*n)*RO*E1*E2.
+//
+// Each level runs three passes (along E1, then RO, then E2). In every pass a band is split
+// into "current - next" (stored in a new band) and "current + next" (stored in place), with
+// wrap-around from the last sample to the first, followed by a scaling call with 0.5.
+//
+// Returns false only if an exception is caught.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        size_t RO = in.get_size(0);
+        size_t E1 = in.get_size(1);
+        size_t E2 = in.get_size(2);
+
+        // the input volume becomes the initial approximation band
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1*E2);
+
+        size_t N2D = RO*E1;
+        size_t N3D = RO*E1*E2;
+
+        for (size_t n=0; n<level; n++)
+        {
+            // band pointers for this level
+            T* lll = pOut;
+            T* llh = lll + n*7*N3D + N3D;
+            T* lhl = llh + N3D;
+            T* lhh = lhl + N3D;
+            T* hll = lhh + N3D;
+            T* hlh = hll + N3D;
+            T* hhl = hlh + N3D;
+            T* hhh = hhl + N3D;
+
+            // pass 1: along E1 (stride RO); lll -> (lll, llh)
+            long long e2;
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, lll, llh)
+            for (e2=0; e2<E2; e2++)
+            {
+                long long ind3D = e2 * N2D;
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    // first sample of the line, saved before it is overwritten; needed for the wrap-around
+                    T v1 = lll[ro + ind3D];
+
+                    long long ind = ro + ind3D;
+                    for (long long e1=0; e1<E1-1; e1++)
+                    {
+                        llh[ind] = lll[ind] - lll[ind+RO];
+                        lll[ind] += lll[ind+RO];
+                        ind += RO;
+                    }
+
+                    // last sample pairs with the original first sample
+                    llh[ind] = lll[ind] - v1;
+                    lll[ind] += v1;
+                }
+            }
+
+            Gadgetron::scal( 0.5, lll, N3D);
+            Gadgetron::scal( 0.5, llh, N3D);
+
+            // pass 2: along RO (stride 1); lll -> (lll, lhl) and llh -> (llh, lhh)
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, lll, llh, lhh, lhl)
+            for (e2=0; e2<E2; e2++)
+            {
+                long long ind3D = e2*N2D;
+                for (long long e1=0; e1<E1; e1++)
+                {
+                    // original first samples of the two lines, for the wrap-around
+                    T v1 = lll[e1*RO + ind3D];
+                    T v2 = llh[e1*RO + ind3D];
+
+                    long long ind = e1*RO + ind3D;
+                    for (long long ro=0; ro<RO-1; ro++)
+                    {
+                        lhh[ind] = llh[ind] - llh[ind + 1];
+                        llh[ind] += llh[ind + 1];
+
+                        lhl[ind] = lll[ind] - lll[ind + 1];
+                        lll[ind] += lll[ind + 1];
+
+                        ind++;
+                    }
+
+                    lhl[ind] = lll[ind] - v1;
+                    lll[ind] += v1;
+
+                    lhh[ind] = llh[ind] - v2;
+                    llh[ind] += v2;
+                }
+            }
+
+            // the four bands are independent, so they are scaled in parallel sections
+            #pragma omp parallel sections
+            {
+                #pragma omp section
+                Gadgetron::scal( 0.5, lll, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, lhl, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, llh, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, lhh, N3D);
+            }
+
+            // pass 3: along E2 (stride N2D); lll -> (lll, hll), lhl -> (lhl, hhl), llh -> (llh, hlh), lhh -> (lhh, hhh)
+            long long e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, E2, N2D, lll, hll, lhl, hhl, llh, hlh, lhh, hhh)
+            for (e1=0; e1<E1; e1++)
+            {
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    long long ind2D = e1*RO + ro;
+
+                    // original samples of the first E2 slice, for the wrap-around
+                    T v1 = lll[ind2D];
+                    T v2 = lhl[ind2D];
+                    T v3 = llh[ind2D];
+                    T v4 = lhh[ind2D];
+
+                    long long ind = ind2D;
+                    for (long long e2=0; e2<E2-1; e2++)
+                    {
+                        hll[ind] = lll[ind] - lll[ind + N2D];
+                        lll[ind] += lll[ind + N2D];
+
+                        hhl[ind] = lhl[ind] - lhl[ind + N2D];
+                        lhl[ind] += lhl[ind + N2D];
+
+                        hlh[ind] = llh[ind] - llh[ind + N2D];
+                        llh[ind] += llh[ind + N2D];
+
+                        hhh[ind] = lhh[ind] - lhh[ind + N2D];
+                        lhh[ind] += lhh[ind + N2D];
+
+                        ind += N2D;
+                    }
+
+                    // NOTE(review): for E2 == 1 neither the loop above nor this block runs, so
+                    // hll/hhl/hlh/hhh are not written by this pass (they are still scaled below) - confirm intended
+                    if ( E2 > 1 )
+                    {
+                        hll[ind] = lll[ind] - v1;
+                        lll[ind] += v1;
+
+                        hhl[ind] = lhl[ind] - v2;
+                        lhl[ind] += v2;
+
+                        hlh[ind] = llh[ind] - v3;
+                        llh[ind] += v3;
+
+                        hhh[ind] = lhh[ind] - v4;
+                        lhh[ind] += v4;
+                    }
+                }
+            }
+
+            // scale all eight bands of this level
+            #pragma omp parallel sections
+            {
+                #pragma omp section
+                Gadgetron::scal( 0.5, lll, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, hll, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, lhl, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, hhl, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, llh, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, hlh, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, lhh, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, hhh, N3D);
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::dwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
+
+// Redundant Haar inverse transform of one 3D coefficient set.
+//
+// in    : [RO E1 E2 1+7*level]; first RO*E1*E2 elements are the approximation band, level n
+//         stores its seven detail bands (llh, lhl, lhh, hll, hlh, hhl, hhh) starting at
+//         element offset (1+7*n)*RO*E1*E2
+// out   : [RO E1 E2]; written through its raw pointer and never resized here, so it must
+//         already hold RO*E1*E2 elements
+// level : number of decomposition levels
+//
+// Levels are processed from the coarsest (level-1) down to 0. Each level undoes the three
+// forward passes in reverse order (E2, then RO, then E1); every pass combines
+// "low[i] + low[i-1] + high[i] - high[i-1]" with wrap-around from the first sample to the
+// last, followed by a scaling call with 0.5.
+//
+// All linear indices are long long, matching the forward transform, so that they are not
+// narrowed to int for large volumes.
+//
+// Returns false only if an exception is caught.
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level)
+{
+    try
+    {
+        size_t RO = in.get_size(0);
+        size_t E1 = in.get_size(1);
+        size_t E2 = in.get_size(2);
+
+        // the approximation band of the input is the starting point of the reconstruction
+        T* pIn = const_cast<T*>(in.begin());
+        T* pOut = out.begin();
+        memcpy(pOut, in.begin(), sizeof(T)*RO*E1*E2);
+
+        size_t N2D = RO*E1;
+        size_t N3D = RO*E1*E2;
+
+        // four intermediate bands, reused for every level
+        hoNDArray<T> LL(N3D);
+        T* pLL = LL.begin();
+
+        hoNDArray<T> HL(N3D);
+        T* pHL = HL.begin();
+
+        hoNDArray<T> LH(N3D);
+        T* pLH = LH.begin();
+
+        hoNDArray<T> HH(N3D);
+        T* pHH = HH.begin();
+
+        // signed counter so that the descending loop can terminate at n < 0
+        int n;
+        for (n=level-1; n>=0; n--)
+        {
+            // approximation band comes from the running output, detail bands from the input
+            T* lll = pOut;
+            T* llh = pIn + n*7*N3D + N3D;
+            T* lhl = llh + N3D;
+            T* lhh = lhl + N3D;
+            T* hll = lhh + N3D;
+            T* hlh = hll + N3D;
+            T* hhl = hlh + N3D;
+            T* hhh = hhl + N3D;
+
+            // pass 1: along E2 (stride N2D); (lll,hll)->LL, (lhl,hhl)->HL, (llh,hlh)->LH, (lhh,hhh)->HH
+            long long e1;
+            #pragma omp parallel for default(none) private(e1) shared(RO, E1, E2, N2D, lll, hll, lhl, hhl, llh, hlh, lhh, hhh, pLL, pHL, pLH, pHH) 
+            for (e1=0; e1<E1; e1++)
+            {
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    long long ind2D = e1*RO + ro;
+
+                    long long ind;
+                    for (long long e2=E2-1; e2>0; e2--)
+                    {
+                        ind = ind2D + e2*N2D;
+                        pLL[ind] = lll[ind]+lll[ind-N2D] + hll[ind]-hll[ind-N2D];
+                        pHL[ind] = lhl[ind]+lhl[ind-N2D] + hhl[ind]-hhl[ind-N2D];
+                        pLH[ind] = llh[ind]+llh[ind-N2D] + hlh[ind]-hlh[ind-N2D];
+                        pHH[ind] = lhh[ind]+lhh[ind-N2D] + hhh[ind]-hhh[ind-N2D];
+                    }
+
+                    // NOTE(review): for E2 == 1 neither the loop above nor this block runs, so the
+                    // intermediate bands are never written before they are used below - confirm intended
+                    if ( E2 > 1 )
+                    {
+                        // first slice pairs with the last slice (wrap-around)
+                        ind = ind2D + (E2-1)*N2D;
+                        pLL[ind2D] = lll[ind2D]+lll[ind] + hll[ind2D]-hll[ind];
+                        pHL[ind2D] = lhl[ind2D]+lhl[ind] + hhl[ind2D]-hhl[ind];
+                        pLH[ind2D] = llh[ind2D]+llh[ind] + hlh[ind2D]-hlh[ind];
+                        pHH[ind2D] = lhh[ind2D]+lhh[ind] + hhh[ind2D]-hhh[ind];
+                    }
+                }
+            }
+
+            #pragma omp parallel sections
+            {
+                #pragma omp section
+                Gadgetron::scal( 0.5, pLL, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, pHL, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, pLH, N3D);
+
+                #pragma omp section
+                Gadgetron::scal( 0.5, pHH, N3D);
+            }
+
+            // pass 2: along RO (stride 1), in place; (LL,HL)->LL and (LH,HH)->LH
+            long long e2;
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, pLL, pHL, pLH, pHH) 
+            for (e2=0; e2<E2; e2++)
+            {
+                long long ind3D = e2*N2D;
+                for (long long e1=0; e1<E1; e1++)
+                {
+                    // start at the last sample of the line and walk backwards, so that
+                    // pLL[ind-1] / pLH[ind-1] are still unmodified when they are read
+                    long long ind = e1*RO + RO-1 + ind3D;
+
+                    // original last samples, needed for the wrap-around at the first sample
+                    T v1 = pLL[ind];
+                    T v2 = pLH[ind];
+
+                    for (long long ro=RO-1; ro>0; ro--)
+                    {
+                        pLL[ind] = pLL[ind]+pLL[ind-1] + pHL[ind]-pHL[ind-1];
+                        pLH[ind] = pLH[ind]+pLH[ind-1] + pHH[ind]-pHH[ind-1];
+                        ind--;
+                    }
+
+                    // first sample pairs with the last one; pHL/pHH are not modified in this pass
+                    pLL[ind] = pLL[ind]+v1 + pHL[ind]-pHL[ind+RO-1];
+                    pLH[ind] = pLH[ind]+v2 + pHH[ind]-pHH[ind+RO-1];
+                }
+            }
+
+            Gadgetron::scal( 0.5, pLL, N3D);
+            Gadgetron::scal( 0.5, pLH, N3D);
+
+            // pass 3: along E1 (stride RO); (LL,LH)->out
+            #pragma omp parallel for default(none) private(e2) shared(RO, E1, E2, N2D, pLL,pLH, pOut) 
+            for (e2=0; e2<E2; e2++)
+            {
+                long long ind3D = e2*N2D;
+                for (long long ro=0; ro<RO; ro++)
+                {
+                    long long ind = (E1-1)*RO + ro + ind3D;
+                    for (long long e1=E1-1; e1>0; e1--)
+                    {
+                        pOut[ind] = pLL[ind]+pLL[ind-RO] + pLH[ind]-pLH[ind-RO];
+                        ind -= RO;
+                    }
+
+                    // first sample pairs with the last one; pLL/pLH are only read in this pass
+                    pOut[ind] = pLL[ind]+pLL[ind+(E1-1)*RO] + pLH[ind]-pLH[ind+(E1-1)*RO];
+                }
+            }
+
+            Gadgetron::scal( 0.5, pOut, N3D);
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::idwtRedundantHaar(const hoNDArray<T>& in, hoNDArray<T>& out, size_t level) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+gradTask(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        // Gradient of the wavelet L1 term for kspace x, written to g. Step 1: x to image domain (kept in complexIm_)
+        //gt_timer2_.start("3");
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+        //gt_timer2_.stop();
+        // the image dimensions are read as [RO E1 CHA E2]
+        size_t RO = complexIm_.get_size(0);
+        size_t E1 = complexIm_.get_size(1);
+        size_t CHA = complexIm_.get_size(2);
+        size_t E2 = complexIm_.get_size(3);
+
+        // Step 2: wavelet coefficients; channels are combined first when the coil map matches RO, E1 and CHA
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // perform coil combination
+            //gt_timer2_.start("4");
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+            //gt_timer2_.stop();
+            // view the combined data as [RO E1 1 E2]; presumably wraps the buffer without copying - TODO confirm
+            //gt_timer2_.start("5");
+            hoNDArray<T> combined(RO, E1, 1, E2, res_after_apply_kernel_.begin());
+            //gt_timer2_.stop();
+
+            // compute wavelet transform
+            //gt_timer2_.start("6");
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+            //gt_timer2_.stop();
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+
+        // Step 3: compute the coefficient norm, then divide the coefficients by it (mu=1e-15, p=1)
+        //gt_timer2_.start("7");
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(res_after_apply_kernel_sum_over_, wav_coeff_norm_));
+        //gt_timer2_.stop();
+
+        //gt_timer2_.start("8");
+        GADGET_CHECK_RETURN_FALSE(this->divideWavCoeffByNorm(res_after_apply_kernel_sum_over_, wav_coeff_norm_, T(1e-15), T(1.0), with_approx_coeff_));
+        //gt_timer2_.stop();
+
+        // Step 4: first dimension scaling (each scaling below is skipped when |factor| is within 1e-6 of 1)
+        //gt_timer2_.start("9");
+        if ( GT_ABS(std::abs(scale_factor_first_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->firstDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_first_dimension_));
+        }
+
+        // second dimension scaling
+        if ( GT_ABS(std::abs(scale_factor_second_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->secondDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_second_dimension_));
+        }
+
+        // third dimension scaling
+        if ( GT_ABS(std::abs(scale_factor_third_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->thirdDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_third_dimension_));
+        }
+        //gt_timer2_.stop();
+
+        // Step 5: go back to image (adjoint wavelet transform into complexIm_wav_)
+        //gt_timer2_.start("10");
+        GADGET_CHECK_RETURN_FALSE(this->adjointOperator(res_after_apply_kernel_sum_over_, complexIm_wav_));
+        //gt_timer2_.stop();
+        // Step 6: re-apply the coil map when it was used above, then convert to kspace
+        if ( coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            // apply coil sensitivity
+            //gt_timer2_.start("11");
+            if ( !kspace_wav_.dimensions_equal(&complexIm_) )
+            {
+                kspace_wav_.create(RO, E1, CHA, E2);
+            }
+
+            for ( size_t e2=0; e2<E2; e2++ )
+            {
+                hoNDArray<T> complexImE2(RO, E1, complexIm_wav_.begin()+e2*RO*E1);
+                hoNDArray<T> kspace_wavE2(RO, E1, CHA, kspace_wav_.begin()+e2*RO*E1*CHA);
+                // a coil map whose 4th size equals E2 supplies one map per e2; otherwise the whole map is reused
+                if ( coil_senMap_->get_size(3) == E2 )
+                {
+                    hoNDArray<T> coilMapE2(RO, E1, CHA, coil_senMap_->begin()+e2*RO*E1*CHA);
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexImE2, coilMapE2, kspace_wavE2));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexImE2, *coil_senMap_, kspace_wavE2));
+                }
+            }
+            //gt_timer2_.stop();
+
+            // go to kspace
+            //gt_timer2_.start("12");
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(kspace_wav_, g));
+            //gt_timer2_.stop();
+        }
+        else
+        {
+            // go to kspace
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(complexIm_wav_, g));
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::gradTask(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+objTask(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        // Cost of the wavelet L1 term for the kspace x, written to obj; returns false on any failure.
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+        size_t RO = complexIm_.get_size(0);
+        size_t E1 = complexIm_.get_size(1);
+        size_t CHA = complexIm_.get_size(2);
+        size_t E2 = complexIm_.get_size(3);
+        // with a coil map matching RO, E1 and CHA the channels are combined first and the combined volume is transformed
+        if (  coil_senMap_ && coil_senMap_->get_size(0)==RO && coil_senMap_->get_size(1)==E1 && coil_senMap_->get_size(2)==CHA )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+            hoNDArray<T> combined(RO, E1, 1, E2, res_after_apply_kernel_.begin());
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+        // weight the coefficients with the same three factors as gradTask(), so obj and its gradient describe one cost
+        if ( GT_ABS(std::abs(scale_factor_first_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->firstDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_first_dimension_));
+        }
+        if ( GT_ABS(std::abs(scale_factor_second_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->secondDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_second_dimension_));
+        }
+        if ( GT_ABS(std::abs(scale_factor_third_dimension_)-1.0) > 1e-6 )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->thirdDimensionScale(res_after_apply_kernel_sum_over_, scale_factor_third_dimension_));
+        }
+        GADGET_CHECK_RETURN_FALSE(this->L1NormTotal(res_after_apply_kernel_sum_over_, wav_coeff_norm_, obj));
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::objTask(const hoNDArray<T>& x, T& obj) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+firstDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor)
+{
+    // Multiplies the RO high-frequency sub-bands (2, 3, 6, 7 of every wavelet level) by scaleFactor.
+    // wavCoeff is read as [RO E1 E2 W ...]; returns false if an exception is raised.
+    try
+    {
+        const size_t RO = wavCoeff.get_size(0);
+        const size_t E1 = wavCoeff.get_size(1);
+        const size_t E2 = wavCoeff.get_size(2);
+        const size_t W = wavCoeff.get_size(3);
+
+        const size_t volumeSize = RO*E1*E2;        // number of elements in one sub-band
+        const size_t numOfSets = wavCoeff.get_number_of_elements()/(volumeSize*W);
+
+        for ( size_t s=0; s<numOfSets; s++ )
+        {
+            const size_t setOffset = s*volumeSize*W;
+
+            for ( size_t level=0; level<numOfWavLevels_; level++ )
+            {
+                // bands 2, 3 are adjacent along W, and so are 6, 7: two views of two bands each
+                hoNDArray<T> band23(RO, E1, E2, 2, wavCoeff.begin()+setOffset+(7*level+2)*volumeSize);
+                Gadgetron::scal(scaleFactor, band23);
+
+                hoNDArray<T> band67(RO, E1, E2, 2, wavCoeff.begin()+setOffset+(7*level+6)*volumeSize);
+                Gadgetron::scal(scaleFactor, band67);
+            }
+        }
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::firstDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+secondDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor)
+{
+    // Multiplies the E1 high-frequency sub-bands (1, 3, 5, 7 of every wavelet level) by scaleFactor.
+    // wavCoeff is read as [RO E1 E2 W ...]; band k of level n sits at index 7*n+k along W.
+    // Returns false if an exception is raised.
+    try
+    {
+        const size_t RO = wavCoeff.get_size(0);
+        const size_t E1 = wavCoeff.get_size(1);
+        const size_t E2 = wavCoeff.get_size(2);
+        const size_t W = wavCoeff.get_size(3);
+
+        const size_t volumeSize = RO*E1*E2;        // number of elements in one sub-band
+        const size_t numOfSets = wavCoeff.get_number_of_elements()/(volumeSize*W);
+
+        for ( size_t s=0; s<numOfSets; s++ )
+        {
+            const size_t setOffset = s*volumeSize*W;
+
+            for ( size_t level=0; level<numOfWavLevels_; level++ )
+            {
+                // the E1 high-pass bands are the odd ones and are not adjacent,
+                // so each of them is wrapped and scaled on its own
+                for ( size_t band=1; band<=7; band+=2 )
+                {
+                    hoNDArray<T> coeff(RO, E1, E2, 1, wavCoeff.begin()+setOffset+(7*level+band)*volumeSize);
+                    Gadgetron::scal(scaleFactor, coeff);
+                }
+            }
+        }
+
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::secondDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor) ... ");
+    }
+
+    return false;
+}
+
+template <typename T> 
+bool gtPlusWavelet3DOperator<T>::
+thirdDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor)
+{
+    // Multiplies the E2 high-frequency sub-bands (4, 5, 6, 7 of every wavelet level) by scaleFactor.
+    // wavCoeff is read as [RO E1 E2 W ...]; returns false if an exception is raised.
+    try
+    {
+        const size_t RO = wavCoeff.get_size(0);
+        const size_t E1 = wavCoeff.get_size(1);
+        const size_t E2 = wavCoeff.get_size(2);
+        const size_t W = wavCoeff.get_size(3);
+
+        const size_t volumeSize = RO*E1*E2;        // number of elements in one sub-band
+        const size_t numOfSets = wavCoeff.get_number_of_elements()/(volumeSize*W);
+        for ( size_t s=0; s<numOfSets; s++ )
+        {
+            for ( size_t level=0; level<numOfWavLevels_; level++ )
+            {
+                // bands 4..7 are adjacent along W, so a single view over four bands covers them
+                hoNDArray<T> band4567(RO, E1, E2, 4, wavCoeff.begin()+s*volumeSize*W+(7*level+4)*volumeSize);
+                Gadgetron::scal(scaleFactor, band4567);
+            }
+        }
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWavelet3DOperator<T>::thirdDimensionScale(hoNDArray<T>& wavCoeff, T& scaleFactor) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+void gtPlusWavelet3DOperator<T>::printInfo(std::ostream& os)
+{
+    // Writes a three-line banner naming this operator to os.
+    // std::endl is spelled out, so no using-directive is needed.
+    os << "-------------- GTPlus ISMRMRD wavelet 3D operator -----------------------" << std::endl;
+    os << "Wavelet operator for gtPlus ISMRMRD package" << std::endl;
+    os << "-------------------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace2DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace2DOperator.h
new file mode 100644
index 0000000..3ca1628
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace2DOperator.h
@@ -0,0 +1,118 @@
+/** \file       gtPlusWaveletNoNullSpace2DOperator.h
+    \brief      Implement the 2D wavelet operator for cases without a null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusWavelet2DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWaveletNoNullSpace2DOperator : public gtPlusWavelet2DOperator<T>
+{
+public:
+    // 2D wavelet operator whose grad()/obj() forward x straight to gradTask()/objTask()
+    typedef gtPlusWavelet2DOperator<T> BaseClass;
+
+    gtPlusWaveletNoNullSpace2DOperator();
+    virtual ~gtPlusWaveletNoNullSpace2DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // if the sensitivity S is set, compute gradient of ||wav*F'*S'*x||1
+    // if not, compute gradient of ||wav*F'*x||1
+    // x represents the unacquired kspace points [RO E1 CHA]
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute cost value of the L1 norm ||wav*F'*S'*x||1
+    // if not, compute cost value of the L1 norm ||wav*F'*x||1
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+    // members re-exported from the (dependent) base class so they can be named without this->
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWaveletNoNullSpace2DOperator<T>::gtPlusWaveletNoNullSpace2DOperator() : BaseClass()
+{
+    // the base-class constructor does all the work; this class adds no state of its own
+}
+
+template <typename T> 
+gtPlusWaveletNoNullSpace2DOperator<T>::~gtPlusWaveletNoNullSpace2DOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusWaveletNoNullSpace2DOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    // x is used as it is: gradTask() evaluates the gradient without any kspace assembly step here
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->gradTask(x, g));
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletNoNullSpace2DOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+bool gtPlusWaveletNoNullSpace2DOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    // x is used as it is: objTask() evaluates the cost without any kspace assembly step here
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->objTask(x, obj));
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletNoNullSpace2DOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+void gtPlusWaveletNoNullSpace2DOperator<T>::printInfo(std::ostream& os)
+{
+    // Writes a three-line banner naming this operator to os.
+    // std::endl is spelled out, so no using-directive is needed.
+    os << "-------------- GTPlus ISMRMRD wavelet 2D operator --------------------" << std::endl;
+    os << "Wavelet 2D operator for gtPlus ISMRMRD package" << std::endl;
+    os << "----------------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace3DOperator.h b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace3DOperator.h
new file mode 100644
index 0000000..fcd0c47
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWaveletNoNullSpace3DOperator.h
@@ -0,0 +1,119 @@
+/** \file       gtPlusWaveletNoNullSpace3DOperator.h
+    \brief      Implement the 3D wavelet operator for cases without a null space
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusWavelet3DOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWaveletNoNullSpace3DOperator : public gtPlusWavelet3DOperator<T>
+{
+public:
+    // 3D wavelet operator whose grad()/obj() forward x straight to gradTask()/objTask()
+    typedef gtPlusWavelet3DOperator<T> BaseClass;
+
+    gtPlusWaveletNoNullSpace3DOperator();
+    virtual ~gtPlusWaveletNoNullSpace3DOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // if the sensitivity S is set, compute gradient of ||wav*F'*S'*x||1
+    // if not, compute gradient of ||wav*F'*x||1
+    // x represents the unacquired kspace points [RO E1 CHA E2]
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute cost value of the L1 norm ||wav*F'*S'*x||1
+    // if not, compute cost value of the L1 norm ||wav*F'*x||1
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+    // members re-exported from the (dependent) base class so they can be named without this->
+    using BaseClass::scale_factor_third_dimension_;
+    using BaseClass::numOfWavLevels_;
+    using BaseClass::with_approx_coeff_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    using BaseClass::wav_coeff_norm_;
+    using BaseClass::kspace_wav_;
+    using BaseClass::complexIm_wav_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWaveletNoNullSpace3DOperator<T>::gtPlusWaveletNoNullSpace3DOperator() : BaseClass()
+{
+    // the base-class constructor does all the work; this class adds no state of its own
+}
+
+template <typename T> 
+gtPlusWaveletNoNullSpace3DOperator<T>::~gtPlusWaveletNoNullSpace3DOperator()
+{
+}
+
+template <typename T> 
+inline bool gtPlusWaveletNoNullSpace3DOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    // x is used as it is: gradTask() evaluates the gradient without any kspace assembly step here
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->gradTask(x, g));
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletNoNullSpace3DOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+inline bool gtPlusWaveletNoNullSpace3DOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    // x is used as it is: objTask() evaluates the cost without any kspace assembly step here
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->objTask(x, obj));
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletNoNullSpace3DOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+void gtPlusWaveletNoNullSpace3DOperator<T>::printInfo(std::ostream& os)
+{
+    // Writes a three-line banner naming this operator to os.
+    // std::endl is spelled out, so no using-directive is needed.
+    os << "-------------- GTPlus ISMRMRD wavelet 3D operator -----------------------" << std::endl;
+    os << "Wavelet operator for gtPlus ISMRMRD package" << std::endl;
+    os << "-------------------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h b/toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h
new file mode 100644
index 0000000..f16ab24
--- /dev/null
+++ b/toolboxes/gtplus/algorithm/gtPlusWaveletOperator.h
@@ -0,0 +1,616 @@
+/** \file       gtPlusWaveletOperator.h
+    \brief      Implement wavelet operator for L1 regularization
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusWaveletOperator : public gtPlusOperator<T>
+{
+public:
+    // Base class of the wavelet L1 regularization operators; subclasses supply the forward/adjoint transforms
+    typedef gtPlusOperator<T> BaseClass;
+
+    gtPlusWaveletOperator();
+    virtual ~gtPlusWaveletOperator();
+
+    virtual void printInfo(std::ostream& os);
+
+    // forward operator
+    virtual bool forwardOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // adjoint operator
+    virtual bool adjointOperator(const hoNDArray<T>& x, hoNDArray<T>& y) = 0;
+
+    // L1Norm: sum of the squared coefficient magnitudes over the 4th dimension (CHA); L1NormTotal: sum of their square roots
+    // waveCoeff: [RO E1 W CHA ...], W is the wavelet coefficient dimension (e.g. for 1 level wavelet decomposition, W=4 for 2D and W=8 for 3D)
+    // the first coefficient along W holds the lowest-frequency (approximation) coefficients
+    virtual bool L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm);
+    virtual bool L1NormTotal(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm, T& L1CoeffNorm);
+
+    // to compute the gradient of wavelet term, divide the wavelet coefficients by its norm
+    // if processApproxCoeff = true, the lowest-frequency coefficients are changed; otherwise they remain unchanged
+    virtual bool divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff=false);
+
+    // soft-threshold or shrink the wavelet coefficients
+    // the threshold actually applied is mask.*thres
+    virtual bool shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff=false);
+
+    // if the sensitivity S is set, compute gradient of ||wav*F'*S'*(Dc'x+D'y)||1
+    // if not, compute gradient of ||wav*F'*(Dc'x+D'y)||1
+    // x represents the unacquired kspace points [RO E1 CHA]
+    virtual bool grad(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // if the sensitivity S is set, compute cost value of the L1 norm ||wav*F'*S'*(Dc'x+D'y)||1
+    // if not, compute cost value of the L1 norm ||wav*F'*(Dc'x+D'y)||1
+    virtual bool obj(const hoNDArray<T>& x, T& obj);
+
+    // number of transformation levels
+    size_t numOfWavLevels_;
+
+    // whether to include low frequency approximation coefficients
+    bool with_approx_coeff_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // convert to image domain or back to kspace
+    virtual bool convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im);
+    virtual bool convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x);
+
+    // compute gradient on the assembled kspace
+    virtual bool gradTask(const hoNDArray<T>& x, hoNDArray<T>& g);
+
+    // compute the obj on the assembled kspace
+    virtual bool objTask(const hoNDArray<T>& x, T& obj);
+
+    using BaseClass::acquired_points_;
+    using BaseClass::acquired_points_indicator_;
+    using BaseClass::unacquired_points_indicator_;
+    using BaseClass::coil_senMap_;
+
+    // helper memory
+    using BaseClass::kspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::res_after_apply_kernel_;
+    using BaseClass::res_after_apply_kernel_sum_over_;
+
+    hoNDArray<T> wav_coeff_norm_;
+    hoNDArray<T> wav_coeff_norm_approx_;
+
+    hoNDArray<T> kspace_wav_;
+    hoNDArray<T> complexIm_wav_;
+    hoNDArray<T> complexIm_norm_;
+
+    using BaseClass::kspace_Managed_;
+    using BaseClass::complexIm_Managed_;
+    using BaseClass::res_after_apply_kernel_Managed_;
+    using BaseClass::res_after_apply_kernel_sum_over_Managed_;
+};
+
+template <typename T> 
+gtPlusWaveletOperator<T>::gtPlusWaveletOperator() : BaseClass(), numOfWavLevels_(1), with_approx_coeff_(false)
+{
+    // defaults: one wavelet level, approximation coefficients excluded; the base class is listed first to match the real initialisation order
+}
+
+template <typename T> 
+gtPlusWaveletOperator<T>::~gtPlusWaveletOperator()
+{
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm)
+{
+    // Sums wavCoeff times its conjugate over the 4th dimension into wavCoeffNorm (same dimensions, 4th set to 1).
+    // No square root is taken here; L1NormTotal() and the other callers apply it. Returns false on failure.
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dims = wavCoeff.get_dimensions();
+
+        // dimR[3] is written below: with fewer than 4 dimensions that would be a write past the end of the vector
+        GADGET_CHECK_RETURN_FALSE(dims->size() >= 4);
+
+        std::vector<size_t> dimR(*dims);
+        dimR[3] = 1;
+
+        if ( !wavCoeffNorm.dimensions_equal(&dimR) )
+        {
+            wavCoeffNorm.create(&dimR);
+        }
+
+        // square the coefficients
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(wavCoeff, wavCoeff, complexIm_norm_));
+        // sum over CHA
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver4thDimension(complexIm_norm_, wavCoeffNorm));
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::L1Norm(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+L1NormTotal(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm, T& L1CoeffNorm)
+{
+    // Fills wavCoeffNorm through L1Norm() and stores in L1CoeffNorm the asum of its element-wise square root.
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(wavCoeff, wavCoeffNorm));
+        // the square roots go to the scratch member wav_coeff_norm_approx_; wavCoeffNorm itself is left as L1Norm() wrote it
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sqrt(wavCoeffNorm, wav_coeff_norm_approx_));
+        L1CoeffNorm = Gadgetron::asum(&wav_coeff_norm_approx_);
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::L1NormTotal(const hoNDArray<T>& wavCoeff, hoNDArray<T>& wavCoeffNorm, T& L1CoeffNorm) ... ");
+    }
+    return false;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t W = wavCoeff.get_size(2);
+        size_t CHA = wavCoeff.get_size(3);
+        // wav_coeff_norm_approx_ receives the per-element weight (wavCoeffNorm+mu)^(p/2-1), applied to wavCoeff in place below
+        if ( !wav_coeff_norm_approx_.dimensions_equal( &wavCoeffNorm ) )
+        {
+            wav_coeff_norm_approx_.create( wavCoeffNorm.get_dimensions() );
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pBuf = wav_coeff_norm_approx_.begin();
+        // p within 0.001 of 1 takes the sqrt shortcut; the .real() calls require T to be a complex type
+        if ( GT_ABS(std::abs(p) - 1.0) < 0.001 )
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = 1.0 / std::sqrt( pCoeffNorm[ii].real() + mu.real() );
+            }
+        }
+        else
+        {
+            #pragma omp parallel for default(none) private(ii) shared(N, pBuf, pCoeffNorm, mu, p)
+            for ( ii=0; ii<N; ii++ )
+            {
+                pBuf[ii] = std::pow( (double)(pCoeffNorm[ii].real() + mu.real()), (double)(p.real()/2.0-1.0) );
+            }
+        }
+
+        if ( processApproxCoeff )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver4thDimension(wav_coeff_norm_approx_, wavCoeff, wavCoeff));
+        }
+        else
+        {
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver4thDimensionExcept(wav_coeff_norm_approx_, wavCoeff, 0, wavCoeff, true));
+            size_t num = wavCoeff.get_number_of_elements()/(RO*E1*W*CHA);
+            // the first RO*E1 block along W (approximation coefficients) is skipped: only the remaining W-1 are weighted
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, num, W, CHA) if ( num > 1 )
+            #else
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeffNorm, wavCoeff, W, CHA) if ( num > 1 )
+            #endif
+            {
+
+                #pragma omp for
+                for ( ii=0; ii<num; ii++ )
+                {
+                    hoNDArray<T> wavCoeffNormCurr(RO, E1, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*W+RO*E1);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> wavCoeffCurr(RO, E1, W-1, wavCoeff.begin()+ii*RO*E1*W*CHA+cha*RO*E1*W+RO*E1);
+                        Gadgetron::multiply(wavCoeffNormCurr, wavCoeffCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::divideWavCoeffByNorm(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T mu, T p, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff)
+{
+    try
+    {
+        size_t RO = wavCoeff.get_size(0);
+        size_t E1 = wavCoeff.get_size(1);
+        size_t W = wavCoeff.get_size(2);
+        size_t CHA = wavCoeff.get_size(3);
+        // Soft-thresholds wavCoeff in place: magnitudes are shrunk, phases are kept. Returns false on failure.
+        if ( !wav_coeff_norm_approx_.dimensions_equal(&wavCoeffNorm) )
+        {
+            wav_coeff_norm_approx_.create(wavCoeffNorm.get_dimensions());
+        }
+        // res_after_apply_kernel_ is reused here as scratch space for the inverse magnitudes
+        if ( !res_after_apply_kernel_.dimensions_equal(&wavCoeffNorm) )
+        {
+            res_after_apply_kernel_.create(wavCoeffNorm.get_dimensions());
+        }
+
+        long long ii;
+        long long N = (long long)wavCoeffNorm.get_number_of_elements();
+        long long N3D = RO*E1*W;
+
+        long long num = N/N3D;
+
+        const T* pCoeffNorm = wavCoeffNorm.begin();
+        T* pMag = wav_coeff_norm_approx_.begin();
+        T* pMagInv = res_after_apply_kernel_.begin();
+        // wavCoeffNorm is expected to hold squared magnitudes (as L1Norm() writes them): take the root and a guarded inverse
+        #pragma omp parallel for default(none) private(ii) shared(N, pMag, pMagInv, pCoeffNorm)
+        for ( ii=0; ii<N; ii++ )
+        {
+            pMag[ii] = std::sqrt( pCoeffNorm[ii].real() );
+            pMagInv[ii] = 1.0/(pMag[ii].real()+DBL_EPSILON);
+        }
+
+        // phase does not change
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver4thDimension(res_after_apply_kernel_, wavCoeff, complexIm_));
+
+        // shrink the magnitude
+        if ( mask.dimensions_equal(&wavCoeffNorm) )
+        {
+            const T* pMask = mask.begin();
+
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                long long s=RO*E1;
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+                // starting at s=RO*E1 leaves the approximation coefficients untouched
+                const T* pMaskCurr = pMask + n*N3D;
+                T* pMagCurr = pMag + n*N3D;
+
+                long long nn;
+                // the threshold applied at each position is thres*mask, for the comparison and for the subtraction alike
+                #pragma omp parallel for private(nn) shared(s, N3D, pMagCurr, pMaskCurr, thres)
+                for ( nn=s; nn<N3D; nn++ )
+                {
+                    if ( std::abs(pMagCurr[nn]) < std::abs(thres*pMaskCurr[nn]) )
+                    {
+                        pMagCurr[nn] = 0;
+                    }
+                    else
+                    {
+                        pMagCurr[nn] -= thres*pMaskCurr[nn];
+                    }
+                }
+            }
+        }
+        else
+        {
+            long long n = 0;
+            for ( n=0; n<num; n++ )
+            {
+                long long s=RO*E1;
+                if ( processApproxCoeff )
+                {
+                    s = 0;
+                }
+
+                T* pMagCurr = pMag + n*N3D;
+
+                long long nn;
+                #pragma omp parallel for private(nn) shared(s, N3D, pMagCurr, thres)
+                for ( nn=s; nn<N3D; nn++ )
+                {
+                    if ( std::abs(pMagCurr[nn]) < std::abs(thres) )
+                    {
+                        pMagCurr[nn] = 0;
+                    }
+                    else
+                    {
+                        pMagCurr[nn] -= thres;
+                    }
+                }
+            }
+        }
+
+        if ( processApproxCoeff )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyOver4thDimension(wav_coeff_norm_approx_, complexIm_, wavCoeff));
+        }
+        else
+        {
+            // rebuild magnitude*phase for all coefficients along W except the first (approximation) one
+            num = wavCoeff.get_number_of_elements()/(RO*E1*W*CHA);
+
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, num, W, CHA) if ( num > 1 )
+            #else
+                #pragma omp parallel default(none) private(ii) shared(RO, E1, num, wavCoeff, W, CHA) if ( num > 1 )
+            #endif
+            {
+                #pragma omp for
+                for ( ii=0; ii<num; ii++ )
+                {
+                    hoNDArray<T> MagCurr(RO, E1, W-1, wav_coeff_norm_approx_.begin()+ii*RO*E1*W+RO*E1);
+
+                    for ( size_t cha=0; cha<CHA; cha++ )
+                    {
+                        hoNDArray<T> phaseCurr(RO, E1, W-1, complexIm_.begin()+ii*RO*E1*W*CHA+cha*RO*E1*W+RO*E1);
+                        hoNDArray<T> wavCoeffCurr(RO, E1, W-1, wavCoeff.begin()+ii*RO*E1*W*CHA+cha*RO*E1*W+RO*E1);
+
+                        Gadgetron::multiply(MagCurr, phaseCurr, wavCoeffCurr);
+                    }
+                }
+            }
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::shrinkWavCoeff(hoNDArray<T>& wavCoeff, const hoNDArray<T>& wavCoeffNorm, T thres, const hoNDArray<T>& mask, bool processApproxCoeff) ... ");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+grad(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    // Gradient with respect to the unacquired kspace points x:
+    //   1) assemble the full kspace Dc'x + D'y in kspace_
+    //   2) evaluate gradTask() on the assembled kspace
+    //   3) keep only the unacquired points of the result
+    // (notation of the class comment: x are the unacquired points, y the acquired samples)
+    // Returns false if a step fails or throws.
+
+    try
+    {
+        // 1a) keep only the unacquired positions of x
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, x, kspace_));
+
+        // 1b) add the acquired samples in place
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(*acquired_points_, kspace_, kspace_));
+
+        // 2) compute the gradient on assembled kspace
+        GADGET_CHECK_RETURN_FALSE(this->gradTask(kspace_, g));
+
+        // 3) only unacquired points are kept: g is multiplied in place
+        //    by the same indicator that was applied to x
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, g, g));
+
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::grad(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+    }
+
+    return false;
+}
+
+template <typename T> 
+bool gtPlusWaveletOperator<T>::
+obj(const hoNDArray<T>& x, T& obj)
+{
+    // Cost value for the unacquired kspace points x: the full kspace Dc'x + D'y is
+    // assembled in kspace_ and handed to objTask(). Returns false if a step fails or throws.
+    try
+    {
+        // keep only the unacquired positions of x ...
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(unacquired_points_indicator_, x, kspace_));
+
+        // ... and add the acquired samples in place
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(*acquired_points_, kspace_, kspace_));
+
+        // compute the objective function on assembled kspace
+        GADGET_CHECK_RETURN_FALSE(this->objTask(kspace_, obj));
+
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::obj(const hoNDArray<T>& x, T& obj) ... ");
+    }
+
+    return false;
+}
+
+// Gradient computation on an assembled kspace x; the gradient is returned in g as kspace.
+// Stages: x -> image (convertToImage) -> coil combination (optional) -> forwardOperator
+//         -> L1Norm and divideWavCoeffByNorm on the coefficients -> adjointOperator
+//         -> multiplication with the coil map (optional) -> convertToKSpace.
+// The optional stages run only when coil_senMap_ is set and its sizes along dimensions
+// 0, 1 and 2 equal those of x.
+// complexIm_, res_after_apply_kernel_, res_after_apply_kernel_sum_over_, wav_coeff_norm_,
+// complexIm_wav_ and kspace_wav_ are member arrays handed to the stages as intermediates.
+// Returns false if a stage fails or an exception is caught, true otherwise.
+template <typename T>
+bool gtPlusWaveletOperator<T>::
+gradTask(const hoNDArray<T>& x, hoNDArray<T>& g)
+{
+    try
+    {
+        const size_t sizeRO = x.get_size(0);
+        const size_t sizeE1 = x.get_size(1);
+        const size_t sizeCHA = x.get_size(2);
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+
+        // coil map test, evaluated once and reused for the way back to kspace
+        // (assumes none of the calls in between changes coil_senMap_ -- TODO confirm)
+        const bool applyCoilMap = coil_senMap_
+                               && coil_senMap_->get_size(0)==sizeRO
+                               && coil_senMap_->get_size(1)==sizeE1
+                               && coil_senMap_->get_size(2)==sizeCHA;
+
+        // compute the wavelet coefficients
+        if ( !applyCoilMap )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+        else
+        {
+            // perform coil combination
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+
+            // RO x E1 x 1 array built from the data pointer of res_after_apply_kernel_ (presumably a view, not a copy)
+            hoNDArray<T> combined(sizeRO, sizeE1, 1, res_after_apply_kernel_.begin());
+
+            // compute wavelet transform
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+        }
+
+        // modify coefficients: L1Norm first, then divideWavCoeffByNorm with that norm
+        GADGET_CHECK_RETURN_FALSE(this->L1Norm(res_after_apply_kernel_sum_over_, wav_coeff_norm_));
+
+        GADGET_CHECK_RETURN_FALSE(this->divideWavCoeffByNorm(res_after_apply_kernel_sum_over_, wav_coeff_norm_, T(1e-15), T(1.0), with_approx_coeff_));
+
+        // go back to image
+        GADGET_CHECK_RETURN_FALSE(this->adjointOperator(res_after_apply_kernel_sum_over_, complexIm_wav_));
+
+        if ( !applyCoilMap )
+        {
+            // go to kspace
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(complexIm_wav_, g));
+        }
+        else
+        {
+            // apply coil sensitivity
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(complexIm_wav_, *coil_senMap_, kspace_wav_));
+
+            // go to kspace
+            GADGET_CHECK_RETURN_FALSE(this->convertToKSpace(kspace_wav_, g));
+        }
+
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::gradTask(const hoNDArray<T>& x, hoNDArray<T>& g) ... ");
+    }
+
+    return false;
+}
+
+// Objective computation on an assembled kspace x; the value is returned in obj.
+// x is converted to the image domain, coil-combined when coil_senMap_ is set and its sizes
+// along dimensions 0, 1 and 2 equal those of x, transformed with forwardOperator, and the
+// coefficients go to L1NormTotal together with wav_coeff_norm_ and obj. False on any failure.
+template <typename T>
+bool gtPlusWaveletOperator<T>::
+objTask(const hoNDArray<T>& x, T& obj)
+{
+    try
+    {
+        const size_t sizeRO = x.get_size(0);
+        const size_t sizeE1 = x.get_size(1);
+        const size_t sizeCHA = x.get_size(2);
+
+        // x to image domain
+        GADGET_CHECK_RETURN_FALSE(this->convertToImage(x, complexIm_));
+
+        // apply sensitivity only when a coil map with the sizes of x is set
+        const bool applyCoilMap = coil_senMap_
+                               && coil_senMap_->get_size(0)==sizeRO
+                               && coil_senMap_->get_size(1)==sizeE1
+                               && coil_senMap_->get_size(2)==sizeCHA;
+
+        if ( !applyCoilMap )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(complexIm_, res_after_apply_kernel_sum_over_));
+        }
+        else
+        {
+            // perform coil combination
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_complex_.coilCombine(complexIm_, *coil_senMap_, res_after_apply_kernel_));
+            // RO x E1 x 1 array built from the data pointer of res_after_apply_kernel_ (presumably a view, not a copy)
+            hoNDArray<T> combined(sizeRO, sizeE1, 1, res_after_apply_kernel_.begin());
+
+            // compute wavelet transform
+            GADGET_CHECK_RETURN_FALSE(this->forwardOperator(combined, res_after_apply_kernel_sum_over_));
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->L1NormTotal(res_after_apply_kernel_sum_over_, wav_coeff_norm_, obj));
+
+        return true;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusWaveletOperator<T>::objTask(const hoNDArray<T>& x, T& obj) ... ");
+    }
+
+    return false;
+}
+
+// Calls hoNDFFT::ifft2c(x, im, complexIm_Managed_) to take kspace x to image im;
+// complexIm_Managed_ is re-created first whenever its dimensions differ from those of x.
+template <typename T>
+inline bool gtPlusWaveletOperator<T>::convertToImage(const hoNDArray<T>& x, hoNDArray<T>& im)
+{
+    const bool bufferMatchesInput = complexIm_Managed_.dimensions_equal(&x);
+    if ( !bufferMatchesInput ) complexIm_Managed_.create(x.get_dimensions());
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(x, im, complexIm_Managed_));
+
+    return true;
+}
+
+// Calls hoNDFFT::fft2c(im, x, kspace_Managed_) to take image im to kspace x;
+// kspace_Managed_ is re-created first whenever its dimensions differ from those of im.
+template <typename T>
+inline bool gtPlusWaveletOperator<T>::convertToKSpace(const hoNDArray<T>& im, hoNDArray<T>& x)
+{
+    const bool bufferMatchesInput = kspace_Managed_.dimensions_equal(&im);
+    if ( !bufferMatchesInput ) kspace_Managed_.create(im.get_dimensions());
+
+    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(im, x, kspace_Managed_));
+
+    return true;
+}
+
+// Writes a three-line banner identifying this operator to os;
+// every line is terminated with std::endl.
+template <typename T>
+void gtPlusWaveletOperator<T>::printInfo(std::ostream& os)
+{
+    os << "-------------- GTPlus ISMRMRD wavelet operator -----------------------" << std::endl
+       << "Wavelet operator for gtPlus ISMRMRD package" << std::endl
+       << "----------------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian.xml
new file mode 100644
index 0000000..fb0185e
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian.xml
@@ -0,0 +1,798 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D or 2D+T cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!--Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>10240</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as follows: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml
new file mode 100644
index 0000000..3be2ad1
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction on GtPlus Cloud
+        This configuration file configures one gadget to perform the reconstruction for
+        2DT job packages
+
+        Depending on the incoming algorithm parameters, both linear and non-linear reconstruction
+        can be performed
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT/3DT cases, process one job
+    a gtPlusReconJob2DT job consists of kspace, kernel and parameters
+    kspace: [RO E1 CHA E2/PHS]
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob2DTGadget</classname>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml
new file mode 100644
index 0000000..32d6ae5
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_L1SPIRIT.xml
@@ -0,0 +1,808 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using L1 SPIRIT
+        The GtPlus cloud computing by default is turned on in this configuration file
+        The dual-layer cloud topology is used here, therefore every incoming SLICE is sent
+        to one first layer GtPlus cloud node. This first layer node can further split the job and
+        process the SLICE with one or more second layer nodes.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+        Magnetic Resonance in Medicine on Dec 2013.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadgetCloud</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.005</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>20.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as the following: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
new file mode 100644
index 0000000..52e6269
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_DualLayer_Gateway_SPIRIT.xml
@@ -0,0 +1,808 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear SPIRIT
+        The GtPlus cloud computing by default is turned on in this configuration file
+        The dual-layer cloud topology is used here, therefore every incoming SLICE is sent
+        to one first layer GtPlus cloud node. This first layer node can further split the job and
+        process the SLICE with one or more second layer nodes.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+        Magnetic Resonance in Medicine on Dec 2013.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadgetCloud</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.005</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as the following: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml
new file mode 100644
index 0000000..aaf305a
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml
@@ -0,0 +1,269 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear or non-linear SPIRIT
+        The dual-layer cloud topology is used here, therefore every incoming SLICE is sent
+        to one first layer GtPlus cloud node. This first layer node can further split the job and
+        process the SLICE with one or more second layer nodes.
+
+        This configuration file is for the first layer GtPlus cloud node.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+        Magnetic Resonance in Medicine on Dec 2013.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1014</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlus2DTGadgetCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1014</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlus2DTGadgetCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT cases
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob2DTGadgetCloud</classname>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2048</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as the following: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT_DualLayer_FirstLayer.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_L1SPIRIT.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_L1SPIRIT.xml
new file mode 100644
index 0000000..5f83c31
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_L1SPIRIT.xml
@@ -0,0 +1,799 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using L1 SPIRIT
+        The GtPlus cloud computing can be turned on in this configuration file
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.005</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>20.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>8192</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             Each cloud node should be defined as follows: CloudNodeX_IP, CloudNodeX_Port, CloudNodeX_XMLConfiguration, etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_SPIRIT.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_SPIRIT.xml
new file mode 100644
index 0000000..22dc832
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Cartesian_SPIRIT.xml
@@ -0,0 +1,799 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 2D cartesian reconstruction using linear SPIRIT
+        The GtPlus cloud computing can be turned on in this configuration file
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_NONE</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.005</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>90</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0015</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.0001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_2DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>8192</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as the following: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_FatWater.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_FatWater.xml
new file mode 100644
index 0000000..b084345
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_FatWater.xml
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction on fat water multi-contrast application
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Contrast</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_LGE.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_LGE.xml
new file mode 100644
index 0000000..5009962
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_LGE.xml
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction on cardiac LGE imaging
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_MOLLI.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_MOLLI.xml
new file mode 100644
index 0000000..300705e
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_MOLLI.xml
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction on cardiac MOLLI T1 mapping
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>5</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Set</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Perfusion.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Perfusion.xml
new file mode 100644
index 0000000..2606ef8
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_Perfusion.xml
@@ -0,0 +1,655 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction on cardiac perfusion mapping
+        Support for AIF acquisition is implemented.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>1</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_RealTimeCine.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_RealTimeCine.xml
new file mode 100644
index 0000000..107775e
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_RealTimeCine.xml
@@ -0,0 +1,741 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction on real-time cine imaging
+        GtPlus supports on-the-fly reconstruction, so the reconstruction starts
+        as soon as sufficient data is received. The reconstructed images are sent out once the
+        computation is finished.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>5.0</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>20480</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_RealTimeFlow.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_RealTimeFlow.xml
new file mode 100644
index 0000000..684b9e7
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_RealTimeFlow.xml
@@ -0,0 +1,689 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT Cartesian reconstruction of real-time flow imaging.
+        GtPlus supports on-the-fly reconstruction, so the reconstruction starts
+        as soon as sufficient data has been received. The reconstructed images are sent out once the
+        computation is finished.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Phase</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!--
+    Recon computation for 2DT cases
+
+    kspace_: [RO E1 CHA N S], for 2D recon, N can be 1
+    ref_: [RO E1 CHA M S], M can be equal to N, to 1, or to another value
+    fullkspace_: [RO E1 CHA N S]
+    complexIm_: [RO E1 N S], after coil combination
+    coilMap_: [RO E1 CHA 1 or N S]
+    gfactor_: [RO E1 CHA 1 or N S]
+
+    the 4th and 5th dimensions (N and S) need to be specified. For example,
+    for real-time cine, N = DIM_Phase and S = DIM_Slice
+
+    default behavior
+    a) the coil compression coefficients are computed once across all S
+    b) the kernel or coil sensitivity is estimated for every S
+
+    embedded mode
+    a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA]
+    b) coil combination uses different coil maps for every S
+    c) if the kspace recon is performed, the coil combination map is re-estimated on the fullkspace for every 2D image
+    d) the ref lines are filled back to fullkspace_
+
+    separate mode
+    a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA] if M==N
+    b) if M==1, the kernel is only estimated once for every S
+    c) coil combination uses different coil maps for every S
+    d) if the kspace recon is performed, the coil combination map is re-estimated on the fullkspace for every 2D image
+
+    interleaved mode
+    a) the average-all ref is used
+    b) kernel/coil sensitivity is estimated once for every S
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Phase</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_T2W.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_T2W.xml
new file mode 100644
index 0000000..2c30202
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_2DT_T2W.xml
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 2DT cartesian reconstruction on T2 weighted cardiac imaging
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>5</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 2DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon2DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_4th</name>
+            <value>DIM_Repetition</value>
+        </property>
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_Set</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>interleaved_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap_useHighestSignal</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>embedded_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_ref_numOfModes</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allS</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>separate_whichS_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allS</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>5</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>1e-5</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_HOMODYNE, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_FENGHUANG</value>
+        </property>
+
+        <!-- parameters for partial fourier homodyne algorithm -->
+        <property>
+            <name>partialFourier_homodyne_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_homodyne_densityComp</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian.xml
new file mode 100644
index 0000000..7fbab73
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian.xml
@@ -0,0 +1,787 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T cartesian reconstruction
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>4</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.01</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>8</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL_ITER</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_GRAPPA</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.005</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>70</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>13000</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation 
+             The cloud should be defined as the following: CloudNodeX_IP/Port/XMLConfiguration etc.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml
new file mode 100644
index 0000000..8c4b2b0
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for 3D cartesian reconstruction on GtPlus Cloud
+        This configuration file configures one gadget to perform the reconstruction for
+        3DT job packages
+
+        Depending on the incoming algorithm parameters, both linear and non-linear reconstruction
+        can be performed
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageReaderCPFL</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1013</slot>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusCloudJobMessageWriterCPFL</classname>
+    </writer>
+
+    <!--
+    Recon computation for 2DT/3DT cases, process one job
+    a gtPlusReconJob2DT job consists of kspace, kernel and parameters
+    kspace: [RO E1 CHA E2/PHS]
+    -->
+
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusReconJob3DTGadget</classname>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_L1SPIRIT.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_L1SPIRIT.xml
new file mode 100644
index 0000000..2f21ed7
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_L1SPIRIT.xml
@@ -0,0 +1,816 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T cartesian reconstruction using L1 SPIRIT
+        The GtPlus cloud computing can be turned on in this configuration file
+        The single-layer cloud topology is used here.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+        Magnetic Resonance in Medicine on Dec 2013.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.01</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_L1SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.002</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>32</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2499</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>1</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node X is defined by the properties CloudNodeX_IP, CloudNodeX_Port, CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>2</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 1 -->
+        <property>
+            <name>CloudNode1_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_Port</name>
+            <value>9004</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode1_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_SPIRIT.xml b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_SPIRIT.xml
new file mode 100644
index 0000000..b1c8bbe
--- /dev/null
+++ b/toolboxes/gtplus/config/GadgetronProgram_gtPlus_3DT_Cartesian_SPIRIT.xml
@@ -0,0 +1,795 @@
+<?xml version="1.0" encoding="utf-8"?>
+<gadgetronStreamConfiguration xsi:schemaLocation="http://gadgetron.sf.net/gadgetron gadgetron.xsd"
+        xmlns="http://gadgetron.sf.net/gadgetron"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+    <!-- 
+        GT Plus configuration file for general 3D or 3D+T cartesian reconstruction using linear SPIRIT
+        The GtPlus cloud computing can be turned on in this configuration file
+        The single-layer cloud topology is used here.
+
+        Author: Hui Xue
+        Magnetic Resonance Technology Program
+        National Heart, Lung and Blood Institute
+        National Institutes of Health
+        10 Center Drive, Bethesda
+        MD 20814
+        USA
+        Email: hui.xue at nih.gov
+
+        Ref to: 
+
+        Hui Xue, Souheil Inati, Thomas Sangild Sorensen, Peter Kellman, Michael S. Hansen. 
+        Distributed MRI Reconstruction using Gadgetron based Cloud Computing. Submitted to
+        Magnetic Resonance in Medicine on Dec 2013.
+    -->
+
+    <!-- reader -->
+    <reader>
+        <slot>1008</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>GadgetIsmrmrdAcquisitionMessageReader</classname>
+    </reader>
+
+    <!-- writer -->
+    <writer>
+        <slot>1004</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterCPLX</classname>
+    </writer>
+    <writer>
+        <slot>1005</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterFLOAT</classname>
+    </writer>
+    <writer>
+        <slot>1006</slot>
+        <dll>gadgetron_mricore</dll>
+        <classname>MRIImageWriterUSHORT</classname>
+    </writer>
+
+    <!-- RO asymmetric echo handling -->
+    <gadget>
+        <name>PartialFourierAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>PartialFourierAdjustROGadget</classname>
+    </gadget>
+
+    <!-- Noise prewhitening -->
+    <gadget>
+        <name>NoiseAdjust</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>NoiseAdjustGadget</classname>
+    </gadget>
+
+    <!-- RO oversampling removal -->
+    <gadget>
+        <name>RemoveROOversampling</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>RemoveROOversamplingGadget</classname>
+    </gadget>
+
+    <!-- Data accumulation and trigger gadget -->
+    <gadget>
+        <name>Acc</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusAccumulatorWorkOrderTriggerGadget</classname>
+
+        <!-- debug and info mode -->
+        <property>
+            <name>verboseMode</name>
+            <value>false</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>noacceleration_triggerDim1</name>
+            <value>DIM_Encoding2</value>
+        </property>
+
+        <property>
+            <name>noacceleration_triggerDim2</name>
+            <value>DIM_Slice</value>
+        </property>
+
+        <property>
+            <name>noacceleration_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>interleaved_numOfKSpace_triggerDim1</name>
+            <value>8</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>embedded_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_triggerDim1</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_triggerDim2</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <property>
+            <name>separate_numOfKSpace_triggerDim1</name>
+            <value>1</value>
+        </property>
+
+        <!-- if 'other' kspace data is present, enforce that its dimension matches the image data -->
+        <property>
+            <name>other_kspace_matching_Dim</name>
+            <value>DIM_Repetition</value>
+        </property>
+
+    </gadget>
+
+    <!-- Recon computation for 3DT cases -->
+    <gadget>
+        <name>Recon</name>
+        <dll>gadgetronPlus</dll>
+        <classname>GtPlusRecon3DTGadget</classname>
+
+        <!-- kspace data -->
+        <property>
+            <name>dim_5th</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- work flow -->
+        <property>
+            <name>workOrder_ShareDim</name>
+            <value>DIM_NONE</value>
+        </property>
+
+        <!-- No acceleration mode -->
+        <property>
+            <name>no_acceleration_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>no_acceleration_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>no_acceleration_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Interleaved mode -->
+        <property>
+            <name>interleaved_same_combinationcoeff_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>interleaved_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- Embedded mode -->
+        <property>
+            <name>embedded_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>embedded_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>embedded_ref_fillback</name>
+            <value>true</value>
+        </property>
+
+        <!-- Separate mode -->
+        <property>
+            <name>separate_averageall_ref</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_fullres_coilmap</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_same_combinationcoeff_allN</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>separate_whichN_combinationcoeff</name>
+            <value>0</value>
+        </property>
+
+        <!-- coil compression -->
+        <property>
+            <name>same_coil_compression_coeff_allN</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_thres</name>
+            <value>0.005</value>
+        </property>
+
+        <property>
+            <name>upstream_coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <property>
+            <name>downstream_coil_compression</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>coil_compression_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>coil_compression_num_modesKept</name>
+            <value>-1</value>
+        </property>
+
+        <!-- parameters for coil map estimation 
+            enum ISMRMRDCOILMAPALGO
+            {
+                ISMRMRD_SOUHEIL,
+                ISMRMRD_SOUHEIL_ITER
+            };
+        -->
+        <property>
+            <name>coil_map_algorithm</name>
+            <value>ISMRMRD_SOUHEIL</value>
+        </property>
+        <property>
+            <name>csm_kSize</name>
+            <value>7</value>
+        </property>
+
+        <property>
+            <name>csm_powermethod_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_true_3D</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>csm_iter_num</name>
+            <value>3</value>
+        </property>
+
+        <property>
+            <name>csm_iter_thres</name>
+            <value>0.001</value>
+        </property>
+
+        <property>
+            <name>csm_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- algorithm -->
+        <property>
+            <name>recon_algorithm</name>
+            <value>ISMRMRD_SPIRIT</value>
+        </property>
+
+        <property>
+            <name>recon_kspace_needed</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>recon_auto_parameters</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_GRAPPA -->
+        <property>
+            <name>grappa_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E1</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_kSize_E2</name>
+            <value>4</value>
+        </property>
+        <property>
+            <name>grappa_reg_lamda</name>
+            <value>0.0005</value>
+        </property>
+        <property>
+            <name>grappa_calib_over_determine_ratio</name>
+            <value>45</value>
+        </property>
+        <property>
+            <name>grappa_use_gpu</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_SPIRIT -->
+        <property>
+            <name>spirit_kSize_RO</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E1</name>
+            <value>7</value>
+        </property>
+        <property>
+            <name>spirit_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>spirit_reg_lamda</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>spirit_use_gpu</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_calib_over_determine_ratio</name>
+            <value>15</value>
+        </property>
+        <property>
+            <name>spirit_solve_symmetric</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_iter_max</name>
+            <value>100</value>
+        </property>
+        <property>
+            <name>spirit_iter_thres</name>
+            <value>0.0025</value>
+        </property>
+        <property>
+            <name>spirit_print_iter</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for ISMRMRD_L1SPIRIT -->
+        <property>
+            <name>spirit_perform_linear</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_perform_nonlinear</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_parallel_imaging_lamda</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_image_reg_lamda</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_data_fidelity_lamda</name>
+            <value>0</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_max</name>
+            <value>10</value>
+        </property>
+        <property>
+            <name>spirit_ncg_iter_thres</name>
+            <value>0.001</value>
+        </property>
+        <property>
+            <name>spirit_ncg_print_iter</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_coil_sen_map</name>
+            <value>true</value>
+        </property>
+        <property>
+            <name>spirit_use_moco_enhancement</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_recon_moco_images</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_temporal_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_2D_scale_per_chunk</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>spirit_RO_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E1_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_E2_enhancement_ratio</name>
+            <value>1.0</value>
+        </property>
+        <property>
+            <name>spirit_3D_scale_per_chunk</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for scaling and image sending -->
+        <property>
+            <name>min_intensity_value</name>
+            <value>64</value>
+        </property>
+
+        <property>
+            <name>max_intensity_value</name>
+            <value>4095</value>
+        </property>
+
+        <property>
+            <name>scalingFactor</name>
+            <value>-1.0</value>
+        </property>
+
+        <property>
+            <name>use_constant_scalingFactor</name>
+            <value>false</value>
+        </property>
+
+        <!-- parameters for kspace filter, image data -->
+        <property>
+            <name>filterRO</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterRO_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE1</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE1_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterE2</name>
+            <value>ISMRMRD_FILTER_GAUSSIAN</value>
+        </property>
+        <property>
+            <name>filterE2_sigma</name>
+            <value>0.5</value>
+        </property>
+        <property>
+            <name>filterE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, ref data -->
+        <property>
+            <name>filterRefRO</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefRO_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE1</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE1_width</name>
+            <value>0.15</value>
+        </property>
+
+        <property>
+            <name>filterRefE2</name>
+            <value>ISMRMRD_FILTER_HANNING</value>
+        </property>
+        <property>
+            <name>filterRefE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterRefE2_width</name>
+            <value>0.15</value>
+        </property>
+
+        <!-- parameters for kspace filter, partial fourier/asymmetric echo filter -->
+        <property>
+            <name>filterPartialFourierRO</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierRO_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE1</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE1_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>filterPartialFourierE2</name>
+            <value>ISMRMRD_FILTER_TAPERED_HANNING</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_sigma</name>
+            <value>1.5</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_width</name>
+            <value>0.15</value>
+        </property>
+        <property>
+            <name>filterPartialFourierE2_densityComp</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for partial fourier handling algorithm, ISMRMRD_PF_POCS, ISMRMRD_PF_FENGHUANG, ISMRMRD_PF_ZEROFILLING_FILTER, ISMRMRD_PF_ZEROFILLING -->
+        <property>
+            <name>partialFourier_algo</name>
+            <value>ISMRMRD_PF_POCS</value>
+        </property>
+
+        <!-- parameters for partial fourier POCS algorithm -->
+        <property>
+            <name>partialFourier_POCS_iters</name>
+            <value>6</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_thres</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_POCS_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for partial fourier FengHuang algorithm -->
+        <property>
+            <name>partialFourier_FengHuang_kSize_RO</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E1</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_kSize_E2</name>
+            <value>5</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_thresReg</name>
+            <value>0.01</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_sameKernel_allN</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand</name>
+            <value>24</value>
+        </property>
+        <property>
+            <name>partialFourier_FengHuang_transitBand_E2</name>
+            <value>24</value>
+        </property>
+
+        <!-- parameters for debug and timing -->
+        <property>
+            <name>debugFolder</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>debugFolder2</name>
+            <value></value>
+        </property>
+
+        <property>
+            <name>cloudNodeFile</name>
+            <value>myCloud_3DT.txt</value>
+        </property>
+
+        <property>
+            <name>performTiming</name>
+            <value>true</value>
+        </property>
+
+        <property>
+            <name>verboseMode</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for system acquisition -->
+        <property>
+            <name>timeStampResolution</name>
+            <value>0.0025</value>
+        </property>
+
+        <!-- parameters for recon job split -->
+        <property>
+            <name>job_split_by_S</name>
+            <value>false</value>
+        </property>
+        <property>
+            <name>job_num_of_N</name>
+            <value>48</value>
+        </property>
+        <property>
+            <name>job_max_Megabytes</name>
+            <value>2499</value>
+        </property>
+        <property>
+            <name>job_overlap</name>
+            <value>2</value>
+        </property>
+        <property>
+            <name>job_perform_on_control_node</name>
+            <value>true</value>
+        </property>
+
+        <!-- parameters for the cloud computation
+             Each cloud node X is defined by the properties CloudNodeX_IP, CloudNodeX_Port, CloudNodeX_XMLConfiguration and CloudNodeX_ComputingPowerIndex.
+        -->
+        <property>
+            <name>CloudComputing</name>
+            <value>false</value>
+        </property>
+
+        <property>
+            <name>CloudSize</name>
+            <value>1</value>
+        </property>
+
+        <!-- node 0 -->
+        <property>
+            <name>CloudNode0_IP</name>
+            <value>localhost</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_Port</name>
+            <value>9003</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_XMLConfiguration</name>
+            <value>GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml</value>
+        </property>
+
+        <property>
+            <name>CloudNode0_ComputingPowerIndex</name>
+            <value>1</value>
+        </property>
+
+    </gadget>
+
+    <!-- after recon processing -->
+    <gadget>
+        <name>FloatToShort</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>FloatToUShortGadget</classname>
+    </gadget>
+
+    <!--
+    <gadget>
+      <name>ImageFinishCPLX</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetCPLX</classname>
+    </gadget>
+    -->
+
+    <!--
+    <gadget>
+      <name>ImageFinishFLOAT</name>
+      <dll>gadgetron_mricore</dll>
+      <classname>ImageFinishGadgetFLOAT</classname>
+    </gadget>
+    -->
+
+    <gadget>
+        <name>ImageFinishUSHORT</name>
+        <dll>gadgetron_mricore</dll>
+        <classname>ImageFinishGadgetUSHORT</classname>
+    </gadget>
+
+</gadgetronStreamConfiguration>
diff --git a/toolboxes/gtplus/config/gtCloud/myCloud_2DT.txt b/toolboxes/gtplus/config/gtCloud/myCloud_2DT.txt
new file mode 100644
index 0000000..b11be4a
--- /dev/null
+++ b/toolboxes/gtplus/config/gtCloud/myCloud_2DT.txt
@@ -0,0 +1,8 @@
+localhost
+9002
+1
+localhost
+9003
+GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml
+1
+0
diff --git a/toolboxes/gtplus/config/gtCloud/myCloud_2DT_DualLayer.txt b/toolboxes/gtplus/config/gtCloud/myCloud_2DT_DualLayer.txt
new file mode 100644
index 0000000..69be64c
--- /dev/null
+++ b/toolboxes/gtplus/config/gtCloud/myCloud_2DT_DualLayer.txt
@@ -0,0 +1,8 @@
+localhost
+9002
+1
+localhost
+9003
+GadgetronProgram_gtPlus_2DT_Cartesian_FirstLayer_CloudNode.xml
+1
+0
diff --git a/toolboxes/gtplus/config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt b/toolboxes/gtplus/config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt
new file mode 100644
index 0000000..789fa76
--- /dev/null
+++ b/toolboxes/gtplus/config/gtCloud/myCloud_2DT_DualLayer_FirstLayer.txt
@@ -0,0 +1,8 @@
+localhost
+9003
+1
+localhost
+9004
+GadgetronProgram_gtPlus_2DT_Cartesian_CloudNode.xml
+1
+0
diff --git a/toolboxes/gtplus/config/gtCloud/myCloud_3DT.txt b/toolboxes/gtplus/config/gtCloud/myCloud_3DT.txt
new file mode 100644
index 0000000..2bfe3f1
--- /dev/null
+++ b/toolboxes/gtplus/config/gtCloud/myCloud_3DT.txt
@@ -0,0 +1,12 @@
+localhost
+9002
+2
+localhost
+9003
+GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml
+1
+localhost
+9004
+GadgetronProgram_gtPlus_3DT_Cartesian_CloudNode.xml
+1
+0
diff --git a/toolboxes/gtplus/matlab/CMakeLists.txt b/toolboxes/gtplus/matlab/CMakeLists.txt
new file mode 100644
index 0000000..1553c7f
--- /dev/null
+++ b/toolboxes/gtplus/matlab/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+set( FILEs  
+    FtkMatlabConverterBase.h
+    FtkMatlabConverter.h 
+    FtkMatlabConverterComplex.h 
+    FtkMatlabMacros.h 
+    FtkMatlabEngineBase.h 
+    FtkMatlabEngineBase.cpp 
+    FtkMexExport.h 
+    mexFtk.h )
diff --git a/toolboxes/gtplus/matlab/FtkMatlabConverterBase.h b/toolboxes/gtplus/matlab/FtkMatlabConverterBase.h
new file mode 100644
index 0000000..c03c09f
--- /dev/null
+++ b/toolboxes/gtplus/matlab/FtkMatlabConverterBase.h
@@ -0,0 +1,569 @@
+/**
+*  @file    FtkMatlabConverterBase.h
+*  @brief   Ftk and Matlab converter base
+*  @author  Hui Xue
+*  @date    July 18, 2011
+*  @Site    SCR, Princeton
+*
+*  Copyright (C) Siemens Corporate Research, Inc. 2011 All Rights Reserved
+**/
+
+#ifdef FTK_MATLAB_SUPPORT
+
+#ifndef FTK_FTKMATLABCONVERTERBASE_H
+#define FTK_FTKMATLABCONVERTERBASE_H
+
+#include <interface/matlab/FtkMexExport.h>
+
+#include <vector>
+#include <typeinfo>
+#include <core/basic/Clock.h>
+#include <core/basic/Common.h>
+#include <core/basic/Exception.h>
+#include <core/math/MathMacros.h>
+#include <core/basic/Allocate.h>
+#include <core/basic/Array1d.h> 
+#include <core/basic/RealMatrix.h> 
+#include <core/image/Image.h>
+#include "core/image/container/ImageContainerArray.h"
+#include "core/image/container/ImageContainerMatrix.h"
+#include "core/image/container/ImageContainerAllocation.h"
+
+BEGIN_NAMESPACE_1(ftk)
+
+struct ImageInfo
+{
+    int sizeX;
+    int sizeY;
+    int sizeZ;
+    int sizeT;
+    int sizeN;
+    int sizeM;
+
+    double spacingX;
+    double spacingY;
+    double spacingZ;
+    double spacingT;
+    double spacingN;
+    double spacingM;
+
+    double positionPatient[3];
+    double orientationPatient[3][3];
+
+    /// number of fields
+    FTK_STATIC_CONST( numOfFields, IndexType, 20 );
+
+    ImageInfo() 
+    {
+        initialize();
+    }
+
+    ImageInfo(const ImageBase<2>& aImage) 
+    {
+        initialize();
+
+        sizeX = aImage.getSize(0);
+        sizeY = aImage.getSize(1);
+
+        spacingX = aImage.getPixelSpacing(0);
+        spacingY = aImage.getPixelSpacing(1);
+
+        positionPatient[0] = aImage.getPosition(0);
+        positionPatient[1] = aImage.getPosition(1);
+        positionPatient[2] = aImage.getPosition(2);
+
+        orientationPatient[0][0] = aImage.getOrient3D(0, 0);
+        orientationPatient[0][1] = aImage.getOrient3D(0, 1);
+        orientationPatient[0][2] = aImage.getOrient3D(0, 2);
+
+        orientationPatient[1][0] = aImage.getOrient3D(1, 0);
+        orientationPatient[1][1] = aImage.getOrient3D(1, 1);
+        orientationPatient[1][2] = aImage.getOrient3D(1, 2);
+
+        orientationPatient[2][0] = aImage.getOrient3D(2, 0);
+        orientationPatient[2][1] = aImage.getOrient3D(2, 1);
+        orientationPatient[2][2] = aImage.getOrient3D(2, 2);
+    }
+
+    ImageInfo(const ImageBase<3>& aImage) 
+    {
+        initialize();
+
+        sizeX = aImage.getSize(0);
+        sizeY = aImage.getSize(1);
+        sizeZ = aImage.getSize(2);
+
+        spacingX = aImage.getPixelSpacing(0);
+        spacingY = aImage.getPixelSpacing(1);
+        spacingZ = aImage.getPixelSpacing(2);
+
+        positionPatient[0] = aImage.getPosition(0);
+        positionPatient[1] = aImage.getPosition(1);
+        positionPatient[2] = aImage.getPosition(2);
+
+        orientationPatient[0][0] = aImage.getOrient3D(0, 0);
+        orientationPatient[0][1] = aImage.getOrient3D(0, 1);
+        orientationPatient[0][2] = aImage.getOrient3D(0, 2);
+
+        orientationPatient[1][0] = aImage.getOrient3D(1, 0);
+        orientationPatient[1][1] = aImage.getOrient3D(1, 1);
+        orientationPatient[1][2] = aImage.getOrient3D(1, 2);
+
+        orientationPatient[2][0] = aImage.getOrient3D(2, 0);
+        orientationPatient[2][1] = aImage.getOrient3D(2, 1);
+        orientationPatient[2][2] = aImage.getOrient3D(2, 2);
+    }
+
+    ImageInfo(const ImageBase<4>& aImage) 
+    {
+        initialize();
+
+        sizeX = aImage.getSize(0);
+        sizeY = aImage.getSize(1);
+        sizeZ = aImage.getSize(2);
+        sizeT = aImage.getSize(3);
+
+        spacingX = aImage.getPixelSpacing(0);
+        spacingY = aImage.getPixelSpacing(1);
+        spacingZ = aImage.getPixelSpacing(2);
+        spacingT = aImage.getPixelSpacing(3);
+
+        positionPatient[0] = aImage.getPosition(0);
+        positionPatient[1] = aImage.getPosition(1);
+        positionPatient[2] = aImage.getPosition(2);
+
+        orientationPatient[0][0] = aImage.getOrient3D(0, 0);
+        orientationPatient[0][1] = aImage.getOrient3D(0, 1);
+        orientationPatient[0][2] = aImage.getOrient3D(0, 2);
+
+        orientationPatient[1][0] = aImage.getOrient3D(1, 0);
+        orientationPatient[1][1] = aImage.getOrient3D(1, 1);
+        orientationPatient[1][2] = aImage.getOrient3D(1, 2);
+
+        orientationPatient[2][0] = aImage.getOrient3D(2, 0);
+        orientationPatient[2][1] = aImage.getOrient3D(2, 1);
+        orientationPatient[2][2] = aImage.getOrient3D(2, 2);
+    }
+
+    ImageInfo(const ImageBase<5>& aImage) 
+    {
+        initialize();
+
+        sizeX = aImage.getSize(0);
+        sizeY = aImage.getSize(1);
+        sizeZ = aImage.getSize(2);
+        sizeT = aImage.getSize(3);
+        sizeN = aImage.getSize(4);
+
+        spacingX = aImage.getPixelSpacing(0);
+        spacingY = aImage.getPixelSpacing(1);
+        spacingZ = aImage.getPixelSpacing(2);
+        spacingT = aImage.getPixelSpacing(3);
+        spacingN = aImage.getPixelSpacing(4);
+
+        positionPatient[0] = aImage.getPosition(0);
+        positionPatient[1] = aImage.getPosition(1);
+        positionPatient[2] = aImage.getPosition(2);
+
+        orientationPatient[0][0] = aImage.getOrient3D(0, 0);
+        orientationPatient[0][1] = aImage.getOrient3D(0, 1);
+        orientationPatient[0][2] = aImage.getOrient3D(0, 2);
+
+        orientationPatient[1][0] = aImage.getOrient3D(1, 0);
+        orientationPatient[1][1] = aImage.getOrient3D(1, 1);
+        orientationPatient[1][2] = aImage.getOrient3D(1, 2);
+
+        orientationPatient[2][0] = aImage.getOrient3D(2, 0);
+        orientationPatient[2][1] = aImage.getOrient3D(2, 1);
+        orientationPatient[2][2] = aImage.getOrient3D(2, 2);
+    }
+
+    ImageInfo(const ImageBase<6>& aImage) 
+    {
+        initialize();
+
+        sizeX = aImage.getSize(0);
+        sizeY = aImage.getSize(1);
+        sizeZ = aImage.getSize(2);
+        sizeT = aImage.getSize(3);
+        sizeN = aImage.getSize(4);
+        sizeM = aImage.getSize(5);
+
+        spacingX = aImage.getPixelSpacing(0);
+        spacingY = aImage.getPixelSpacing(1);
+        spacingZ = aImage.getPixelSpacing(2);
+        spacingT = aImage.getPixelSpacing(3);
+        spacingN = aImage.getPixelSpacing(4);
+        spacingM = aImage.getPixelSpacing(5);
+
+        positionPatient[0] = aImage.getPosition(0);
+        positionPatient[1] = aImage.getPosition(1);
+        positionPatient[2] = aImage.getPosition(2);
+
+        orientationPatient[0][0] = aImage.getOrient3D(0, 0);
+        orientationPatient[0][1] = aImage.getOrient3D(0, 1);
+        orientationPatient[0][2] = aImage.getOrient3D(0, 2);
+
+        orientationPatient[1][0] = aImage.getOrient3D(1, 0);
+        orientationPatient[1][1] = aImage.getOrient3D(1, 1);
+        orientationPatient[1][2] = aImage.getOrient3D(1, 2);
+
+        orientationPatient[2][0] = aImage.getOrient3D(2, 0);
+        orientationPatient[2][1] = aImage.getOrient3D(2, 1);
+        orientationPatient[2][2] = aImage.getOrient3D(2, 2);
+    }
+
+    ~ImageInfo() {}
+
+    void initialize()
+    {
+        sizeX = 1;
+        sizeY = 1;
+        sizeZ = 1;
+        sizeT = 1;
+        sizeN = 1;
+        sizeM = 1;
+
+        spacingX = 1.0;
+        spacingY = 1.0;
+        spacingZ = 1.0;
+        spacingT = 1.0;
+        spacingN = 1.0;
+        spacingM = 1.0;
+
+        positionPatient[0] = 0.0;
+        positionPatient[1] = 0.0;
+        positionPatient[2] = 0.0;
+
+        orientationPatient[0][0] = 1.0;
+        orientationPatient[0][1] = 0.0;
+        orientationPatient[0][2] = 0.0;
+
+        orientationPatient[1][0] = 0.0;
+        orientationPatient[1][1] = 1.0;
+        orientationPatient[1][2] = 0.0;
+
+        orientationPatient[2][0] = 0.0;
+        orientationPatient[2][1] = 0.0;
+        orientationPatient[2][2] = 1.0;
+
+        int ind = 0;
+        fieldnames.resize(numOfFields);
+        fieldnames[ind++] = "sizeX";
+        fieldnames[ind++] = "sizeY";
+        fieldnames[ind++] = "sizeZ";
+        fieldnames[ind++] = "sizeT";
+        fieldnames[ind++] = "sizeN";
+        fieldnames[ind++] = "sizeM";
+        fieldnames[ind++] = "spacingX";
+        fieldnames[ind++] = "spacingY";
+        fieldnames[ind++] = "spacingZ";
+        fieldnames[ind++] = "spacingT";
+        fieldnames[ind++] = "spacingN";
+        fieldnames[ind++] = "spacingM";
+        fieldnames[ind++] = "positionPatient";
+        fieldnames[ind++] = "orientationPatient";
+        fieldnames[ind++] = "xsize";
+        fieldnames[ind++] = "ysize";
+        fieldnames[ind++] = "zsize";
+        fieldnames[ind++] = "xvoxelsize";
+        fieldnames[ind++] = "yvoxelsize";
+        fieldnames[ind++] = "zvoxelsize";
+    }
+
+    /// Pack this header into a newly created 1x1 MATLAB struct with numOfFields fields.
+    /// Sizes and spacings become double scalars, positionPatient a 1x3 matrix and
+    /// orientationPatient a 3x3 matrix; the trailing xsize/ysize/zsize and
+    /// xvoxelsize/yvoxelsize/zvoxelsize fields repeat the X/Y/Z sizes and spacings.
+    /// Fields are filled in the order their names are stored in fieldnames.
+    /// The catch-all handler calls mexErrMsgTxt and then rethrows.
+    mxArray* convertToMatlab() const 
+    {
+        try
+        {
+            mwSize structDims[2] = {1, 1};
+            mxArray* header = mxCreateStructArray(2, structDims, numOfFields, const_cast<const char**>(&fieldnames[0]));
+
+            // scalar entries: six sizes followed by six spacings
+            const double extents[6] = { static_cast<double>(sizeX), static_cast<double>(sizeY), static_cast<double>(sizeZ),
+                                        static_cast<double>(sizeT), static_cast<double>(sizeN), static_cast<double>(sizeM) };
+            const double steps[6] = { spacingX, spacingY, spacingZ, spacingT, spacingN, spacingM };
+
+            int field = 0;
+            for ( int k=0; k<6; k++ )
+            {
+                mxSetField(header, 0, fieldnames[field++], mxCreateDoubleScalar(extents[k]));
+            }
+            for ( int k=0; k<6; k++ )
+            {
+                mxSetField(header, 0, fieldnames[field++], mxCreateDoubleScalar(steps[k]));
+            }
+
+            // positionPatient as a 1x3 matrix
+            mxArray* mxPosition = mxCreateDoubleMatrix(1, 3, mxREAL);
+            double* positionData = mxGetPr(mxPosition);
+            for ( int k=0; k<3; k++ )
+                positionData[k] = positionPatient[k];
+            mxSetField(header, 0, fieldnames[field++], mxPosition);
+
+            // orientationPatient as a 3x3 matrix; entry [r][c] is written to offset c*3 + r
+            mxArray* mxOrientation = mxCreateDoubleMatrix(3, 3, mxREAL);
+            double* orientationData = mxGetPr(mxOrientation);
+            for ( int c=0; c<3; c++ )
+            {
+                for ( int r=0; r<3; r++ )
+                    orientationData[c*3 + r] = orientationPatient[r][c];
+            }
+            mxSetField(header, 0, fieldnames[field++], mxOrientation);
+
+            // xsize/ysize/zsize, then xvoxelsize/yvoxelsize/zvoxelsize
+            for ( int k=0; k<3; k++ )
+                mxSetField(header, 0, fieldnames[field++], mxCreateDoubleScalar(extents[k]));
+            for ( int k=0; k<3; k++ )
+                mxSetField(header, 0, fieldnames[field++], mxCreateDoubleScalar(steps[k]));
+
+            return header;
+        }
+        catch(...)
+        {
+            mexErrMsgTxt("Exceptions happened in ImageInfo::convertToMatlab() ... ");
+            throw;
+        }
+
+        return NULL;
+    }
+
+    /// Fill this header from a MATLAB struct; only the first 14 names in fieldnames are read
+    /// (six sizes, six spacings, positionPatient, orientationPatient). Returns false, header unchanged,
+    /// if info is not a struct, a field is missing/empty, or a matrix is not double or too short.
+    bool convertFromMatlab(const mxArray* info)
+    {
+        try
+        {
+            if ( info==NULL || !mxIsStruct(info) ) return false;
+            // validate and fetch everything first so a bad header cannot leave a half-updated object
+            double scalars[12];
+            for ( int k=0; k<12; k++ )
+            {
+                const mxArray* entry = mxGetField(info, 0, fieldnames[k]);
+                if ( entry==NULL || mxIsEmpty(entry) ) return false;
+                scalars[k] = mxGetScalar(entry);
+            }
+            const mxArray* mxPosition = mxGetField(info, 0, fieldnames[12]);
+            const mxArray* mxOrientation = mxGetField(info, 0, fieldnames[13]);
+            if ( mxPosition==NULL || !mxIsDouble(mxPosition) || mxGetNumberOfElements(mxPosition)<3 ) return false;
+            if ( mxOrientation==NULL || !mxIsDouble(mxOrientation) || mxGetNumberOfElements(mxOrientation)<9 ) return false;
+
+            // commit; the orientation data holds entry [r][c] at offset c*3 + r
+            int* sizes[6] = { &sizeX, &sizeY, &sizeZ, &sizeT, &sizeN, &sizeM };
+            double* spacings[6] = { &spacingX, &spacingY, &spacingZ, &spacingT, &spacingN, &spacingM };
+            for ( int k=0; k<6; k++ )
+            {
+                *sizes[k] = static_cast<int>(scalars[k]);
+                *spacings[k] = scalars[6+k];
+            }
+            const double* positionData = mxGetPr(mxPosition);
+            const double* orientationData = mxGetPr(mxOrientation);
+            for ( int c=0; c<3; c++ )
+            {
+                positionPatient[c] = positionData[c];
+                for ( int r=0; r<3; r++ ) orientationPatient[r][c] = orientationData[c*3 + r];
+            }
+        }
+        catch(...)
+        {
+            mexErrMsgTxt("Exceptions happened in ImageInfo::convertFromMatlab() ... ");
+            return false;
+        }
+
+        return true;
+    }
+
+    Size<IndexType, 2> getSize2D() const { return Size<IndexType, 2>(sizeX, sizeY); }
+    Spacing<2> getSpacing2D() const { return Spacing<2>(spacingX, spacingY); }
+
+    Size<IndexType, 3> getSize3D() const { return Size<IndexType, 3>(sizeX, sizeY, sizeZ); }
+    Spacing<3> getSpacing3D() const { return Spacing<3>(spacingX, spacingY, spacingZ); }
+
+    Size<IndexType, 4> getSize4D() const 
+    { 
+        Size<IndexType, 4> aSize; 
+        aSize[0] = sizeX;
+        aSize[1] = sizeY;
+        aSize[2] = sizeZ;
+        aSize[3] = sizeT;
+        return aSize;
+    }
+
+    Size<IndexType, 5> getSize5D() const 
+    { 
+        Size<IndexType, 5> aSize; 
+        aSize[0] = sizeX;
+        aSize[1] = sizeY;
+        aSize[2] = sizeZ;
+        aSize[3] = sizeT;
+        aSize[4] = sizeN;
+        return aSize;
+    }
+
+    Size<IndexType, 6> getSize6D() const 
+    { 
+        Size<IndexType, 6> aSize; 
+        aSize[0] = sizeX;
+        aSize[1] = sizeY;
+        aSize[2] = sizeZ;
+        aSize[3] = sizeT;
+        aSize[4] = sizeN;
+        aSize[5] = sizeM;
+        return aSize;
+    }
+
+    Spacing<4> getSpacing4D() const 
+    { 
+        Spacing<4> spacing;
+        spacing[0] = spacingX;
+        spacing[1] = spacingY;
+        spacing[2] = spacingZ;
+        spacing[3] = spacingT;
+
+        return spacing; 
+    }
+
+    Spacing<5> getSpacing5D() const 
+    { 
+        Spacing<5> spacing;
+        spacing[0] = spacingX;
+        spacing[1] = spacingY;
+        spacing[2] = spacingZ;
+        spacing[3] = spacingT;
+        spacing[4] = spacingN;
+
+        return spacing; 
+    }
+
+    Spacing<6> getSpacing6D() const 
+    { 
+        Spacing<6> spacing;
+        spacing[0] = spacingX;
+        spacing[1] = spacingY;
+        spacing[2] = spacingZ;
+        spacing[3] = spacingT;
+        spacing[4] = spacingN;
+        spacing[5] = spacingM;
+
+        return spacing; 
+    }
+
+    Point3d<double> getPosition() const { return Point3d<double>(positionPatient[0], positionPatient[1], positionPatient[2]); }
+    Point3d<double> getOrient3D(int i) const { return Point3d<double>(orientationPatient[i][0], orientationPatient[i][1], orientationPatient[i][2]); }
+
+protected: 
+    /// MATLAB struct field names; the entries point at string literals, hence const char*.
+    std::vector<const char*> fieldnames;
+};
+
+template <typename ValueType> 
+class FtkMatlabConverterBase : public Object
+{
+public:
+    
+    /** @name Typedefs */
+    //@{
+    /// 2D Image 
+    typedef Image<ValueType, 2> Image2DType;
+    /// 3D Image 
+    typedef Image<ValueType, 3> Image3DType;
+    /// 4D Image 
+    typedef Image<ValueType, 4> Image4DType;
+    /// 5D Image 
+    typedef Image<ValueType, 5> Image5DType;
+    /// 6D Image 
+    typedef Image<ValueType, 6> Image6DType;
+    /// Image array 
+    typedef ImageContainerArray<Image2DType> ImageContainerArrayType;
+    /// Image matrix
+    typedef ImageContainerMatrix<ImageContainerArrayType> ImageContainerMatrixType;
+    /// std vector type
+    typedef std::vector<ValueType> StdVectorType;
+    /// ftk vector type
+    typedef Array1d<ValueType> FtkVectorType;
+    /// ftk matrix type
+    typedef RealMatrix<ValueType> FtkMatrixType;
+    //@}
+
+    /** @name Constructors and destructor */
+    //@{
+    FtkMatlabConverterBase() {}
+    virtual ~FtkMatlabConverterBase() {}
+    //@}
+
+    /** @name functions to convert ftk to/from Matlab */
+    //@{
+    // 2D image
+    virtual bool convertToMatlab(const Image2DType& aImage, mxArray*& aMxImage, mxArray*& aHeader) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxImage, const mxArray* aHeader, Image2DType& aImage) = 0;
+    // 3D image
+    virtual bool convertToMatlab(const Image3DType& aImage, mxArray*& aMxImage, mxArray*& aHeader) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxImage, const mxArray* aHeader, Image3DType& aImage) = 0;
+    // image array
+    virtual bool convertToMatlab(const ImageContainerArrayType& aImageArray, mxArray*& aMxImage, mxArray*& aHeader) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxImage, const mxArray* aHeader, ImageContainerArrayType& aImage) = 0;
+    // image matrix
+    virtual bool convertToMatlab(const ImageContainerMatrixType& aImageMatrix, mxArray*& aMxImage, mxArray*& aHeader) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxImage, const mxArray* aHeader, ImageContainerMatrixType& aImageMatrix) = 0;
+    // ftk vector
+    virtual bool convertToMatlab(const FtkVectorType& vec, mxArray*& aMxArray) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxArray, FtkVectorType& vec) = 0;
+    // ftk matrix
+    virtual bool convertToMatlab(const FtkMatrixType& vec, mxArray*& aMxArray) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxArray, FtkMatrixType& vec) = 0;
+
+    // std vector
+    virtual bool convertToMatlab(const StdVectorType& vec, mxArray*& aMxArray) = 0;
+    virtual bool convertFromMatlab(const mxArray* aMxArray, StdVectorType& vec) = 0;
+    // std string
+    virtual bool convertToMatlab(const std::string& str, mxArray*& aMxStr);
+    virtual bool convertFromMatlab(const mxArray* aMxStr, std::string& str);
+    //@}
+
+    virtual void print(std::ostream& os) const = 0;
+
+protected:
+    
+};
+
+// -------------------------------------------------------
+// std string
+// -------------------------------------------------------
+
+/// Create a MATLAB string from str in aMxStr; true when the array was created.
+template <typename ValueType> 
+bool FtkMatlabConverterBase<ValueType>::convertToMatlab(const std::string& str, mxArray*& aMxStr)
+{
+    aMxStr = mxCreateString(str.c_str());
+    return NULL != aMxStr;
+}
+
+/// Copy the text of the MATLAB string aMxStr into str.
+/// A NULL aMxStr is handed to FTK_CHECK_RETURN_FALSE; a failing mxGetString yields false.
+template <typename ValueType> 
+bool FtkMatlabConverterBase<ValueType>::convertFromMatlab(const mxArray* aMxStr, std::string& str)
+{
+    FTK_CHECK_RETURN_FALSE(aMxStr!=NULL);
+
+    // one slot per element plus one extra, all pre-filled with '\0'
+    int capacity = mxGetNumberOfElements(aMxStr) + 1;
+    std::vector<char> chars(capacity, '\0');
+
+    const bool copied = (mxGetString(aMxStr, &chars[0], capacity) == 0);
+    if ( copied )
+        str = std::string(&chars[0]);
+
+    return copied;
+}
+
+END_NAMESPACE_1(ftk)
+
+#endif // FTK_FTKMATLABCONVERTERBASE_H 
+
+#endif // FTK_MATLAB_SUPPORT 
diff --git a/toolboxes/gtplus/matlab/gtMatlab.h b/toolboxes/gtplus/matlab/gtMatlab.h
new file mode 100644
index 0000000..fa1b65f
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlab.h
@@ -0,0 +1,37 @@
+/********************************************************************
+    created:    2013/10/03
+    created:    3:10:2013   16:15
+    author:     Hui Xue
+
+    purpose:    Header to enable matlab print out info
+*********************************************************************/
+
+#pragma once 
+
+#include <strstream>
+
+#ifdef GADGET_MSG
+    #undef GADGET_MSG
+#endif // GADGET_MSG
+
+#ifdef GADGET_ERROR_MSG
+    #undef GADGET_ERROR_MSG
+#endif // GADGET_ERROR_MSG
+
+#ifdef GADGET_WARN_MSG
+    #undef GADGET_WARN_MSG
+#endif // GADGET_WARN_MSG
+
+// Route Gadgetron log messages to the MATLAB console (debug builds prefix file and line).
+// ostrstream::str() freezes the stream's buffer, so each macro calls freeze(false)
+// afterwards; without it the destructor does not release the buffer and every message leaks.
+#ifdef _DEBUG
+    #define GADGET_MSG(message) { std::ostrstream outs; outs << " (" << __FILE__ << ", " << __LINE__ << "): " << message << std::endl << '\0'; mexPrintf("%s", outs.str()); outs.freeze(false); }
+    #define GADGET_WARN_MSG(message) { std::ostrstream outs; outs << " (" << __FILE__ << ", " << __LINE__ << "): " << message << std::endl << '\0'; mexWarnMsgTxt(outs.str()); outs.freeze(false); }
+#else
+    #define GADGET_MSG(message) { std::ostrstream outs; outs << message << std::endl << '\0'; mexPrintf("%s", outs.str()); outs.freeze(false); }
+    #define GADGET_WARN_MSG(message) { std::ostrstream outs; outs << message << std::endl << '\0'; mexWarnMsgTxt(outs.str()); outs.freeze(false); }
+#endif // _DEBUG
+
+// Errors are printed the same way as ordinary messages.
+#define GADGET_ERROR_MSG(message) GADGET_MSG(message) 
diff --git a/toolboxes/gtplus/matlab/gtMatlabConverter.h b/toolboxes/gtplus/matlab/gtMatlabConverter.h
new file mode 100644
index 0000000..3006369
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlabConverter.h
@@ -0,0 +1,235 @@
+/********************************************************************
+    created:    2013/10/03
+    created:    3:10:2013   14:06
+    author:     Hui Xue
+
+    purpose:    Gadgetron data structure to matlab conversion
+*********************************************************************/
+
+#pragma once
+
+#include <matrix.h>
+#include <mat.h>
+#include <mexGT.h>
+#include <cmath>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <strstream>
+
+#include "hoNDArray.h"
+#include "gtMatlab.h"
+
+namespace Gadgetron
+{
+
+template <typename T> 
+class gtMatlabConverter
+{
+public:
+    // Converts real-valued hoNDArray, std::vector and std::string data to and from MATLAB mxArray objects.
+    typedef gtMatlabConverter<T> Self;
+
+    gtMatlabConverter() {}
+    virtual ~gtMatlabConverter() {}
+    // hoNDArray <-> MATLAB numeric array; the MATLAB array is single precision only when T is float, double otherwise
+    virtual bool hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx);
+    virtual bool Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a);
+    // std::vector <-> MATLAB array; Vec2Matlab always writes a vec.size() x 1 double matrix
+    virtual bool Vec2Matlab(const std::vector<T>& vec, mxArray*& aMx);
+    virtual bool Matlab2Vec(const mxArray* aMx, std::vector<T>& vec);
+    // std::string <-> MATLAB string
+    virtual bool Str2Matlab(const std::string& str, mxArray*& aMx);
+    virtual bool Matlab2Str(const mxArray* aMx, std::string& str);
+    // writes a short banner identifying the converter to os
+    virtual void printInfo(std::ostream& os) const;
+};
+
+/// Copy a real-valued hoNDArray into a newly created MATLAB numeric array of the same dimensions.
+/// The MATLAB array is single precision when T is float and double precision for any other T.
+/// Returns false if the MATLAB array could not be created.
+template <typename T> 
+bool gtMatlabConverter<T>::
+hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = a.get_dimensions();
+
+        // std::vector instead of new[]: the old raw buffer was never deleted. One slot is
+        // always allocated so that &dims[0] stays valid for an array without dimensions.
+        const size_t ndim = dim->size();
+        std::vector<mwSize> dims(ndim>0 ? ndim : 1, 1);
+        for ( size_t ii=0; ii<ndim; ii++ )
+            dims[ii] = static_cast<mwSize>( (*dim)[ii] );
+
+        const mxClassID classId = ( typeid(T) == typeid(float) ) ? mxSINGLE_CLASS : mxDOUBLE_CLASS;
+        aMx = mxCreateNumericArray(static_cast<mwSize>(ndim), &dims[0], classId, mxREAL);
+        if ( aMx == NULL ) return false;
+
+        const size_t N = a.get_number_of_elements();
+        const T* pA = a.begin();
+        if ( classId == mxSINGLE_CLASS )
+        {
+            float* ptr = static_cast<float*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+            {
+                ptr[ii] = pA[ii];
+            }
+        }
+        else
+        {
+            double* ptr = static_cast<double*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+            {
+                ptr[ii] = pA[ii];
+            }
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Copy a MATLAB single or double precision array into a, which is re-created with the
+/// dimensions of aMx. Returns false for a NULL array or any other MATLAB class; such input
+/// used to be read as if it held double data.
+template <typename T> 
+bool gtMatlabConverter<T>::
+Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a)
+{
+    try
+    {
+        if ( aMx==NULL ) return false;
+        const bool isSingle = mxIsSingle(aMx);
+        if ( !isSingle && !mxIsDouble(aMx) ) return false;
+
+        const mwSize ndim = mxGetNumberOfDimensions(aMx);
+        const mwSize* dims = mxGetDimensions(aMx);
+        std::vector<size_t> dim(dims, dims+ndim);
+
+        a.create(&dim);
+        const size_t N = a.get_number_of_elements();
+        T* pA = a.begin();
+
+        if ( isSingle )
+        {
+            const float* ptr = static_cast<const float*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+            {
+                pA[ii] = static_cast<T>(ptr[ii]);
+            }
+        }
+        else
+        {
+            const double* ptr = static_cast<const double*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+            {
+                pA[ii] = static_cast<T>(ptr[ii]);
+            }
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Store vec in a newly created vec.size() x 1 double precision MATLAB matrix.
+template <typename T> 
+bool gtMatlabConverter<T>::Vec2Matlab(const std::vector<T>& vec, mxArray*& aMx)
+{
+    try
+    {
+        const size_t count = vec.size();
+        aMx = mxCreateNumericMatrix(count, 1, mxDOUBLE_CLASS, mxREAL);
+        // each element is converted to double as it is written
+        double* out = static_cast<double*>(mxGetData(aMx));
+        for ( typename std::vector<T>::const_iterator it=vec.begin(); it!=vec.end(); ++it )
+            *out++ = static_cast<double>(*it);
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::Vec2Matlab(const std::vector<T>& vec, mxArray*& aMx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Copy every element of a MATLAB single or double precision array into vec (resized to fit).
+/// Returns false for a NULL array or any other MATLAB class instead of reading it as double data.
+template <typename T> 
+bool gtMatlabConverter<T>::Matlab2Vec(const mxArray* aMx, std::vector<T>& vec)
+{
+    try
+    {
+        if ( aMx==NULL ) return false;
+        const bool isSingle = mxIsSingle(aMx);
+        if ( !isSingle && !mxIsDouble(aMx) ) return false;
+
+        const size_t N = mxGetNumberOfElements(aMx);
+        vec.resize(N);
+        if ( isSingle )
+        {
+            const float* ptr = static_cast<const float*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+                vec[ii] = static_cast<T>(ptr[ii]);
+        }
+        else
+        {
+            const double* ptr = static_cast<const double*>(mxGetData(aMx));
+            for ( size_t ii=0; ii<N; ii++ )
+                vec[ii] = static_cast<T>(ptr[ii]);
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverter::Matlab2Vec(const mxArray* aMx, std::vector<T>& vec) ... ");
+        return false;
+    }
+
+    return true; 
+}
+
+/// Create a MATLAB string from str in aMx; true when the array was created.
+template <typename T> 
+bool gtMatlabConverter<T>::Str2Matlab(const std::string& str, mxArray*& aMx)
+{
+    aMx = mxCreateString(str.c_str());
+    return NULL != aMx;
+}
+
+/// Copy the text of the MATLAB string aMx into str.
+/// Returns false when aMx is NULL or mxGetString reports a failure.
+template <typename T> 
+bool gtMatlabConverter<T>::Matlab2Str(const mxArray* aMx, std::string& str)
+{
+    // a NULL array used to be passed straight to mxGetNumberOfElements
+    if ( aMx == NULL ) return false;
+
+    // one slot per element plus one extra; mwSize avoids narrowing the element count to int
+    const mwSize N = static_cast<mwSize>(mxGetNumberOfElements(aMx)) + 1;
+    std::vector<char> buf(N, '\0');
+    if ( mxGetString(aMx, &buf[0], N) != 0 ) return false;
+    str = std::string(&buf[0]);
+    return true;
+}
+
+/// Write a three-line banner identifying this converter to os.
+template <typename T> 
+void gtMatlabConverter<T>::printInfo(std::ostream& os) const
+{
+    os << "--------------------------------------------------" << std::endl;
+    os << "Gadgetron matlab Converter ..." << std::endl;
+    os << "--------------------------------------------------" << std::endl;
+}
+
+}
diff --git a/toolboxes/gtplus/matlab/gtMatlabConverterComplex.h b/toolboxes/gtplus/matlab/gtMatlabConverterComplex.h
new file mode 100644
index 0000000..e9c404b
--- /dev/null
+++ b/toolboxes/gtplus/matlab/gtMatlabConverterComplex.h
@@ -0,0 +1,155 @@
+/********************************************************************
+    created:    2013/10/03
+    created:    3:10:2013   14:06
+    author:     Hui Xue
+
+    purpose:    Gadgetron complex data structure to matlab conversion
+*********************************************************************/
+
+#pragma once
+
+#include <matrix.h>
+#include <mat.h>
+#include <mexGT.h>
+#include <cmath>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <strstream>
+
+#include "hoNDArray.h"
+
+namespace Gadgetron
+{
+
+template <typename T> 
+class gtMatlabConverterComplex
+{
+public:
+    // Converter between complex hoNDArray<T> objects and MATLAB complex mxArray objects.
+    gtMatlabConverterComplex() {}
+    virtual ~gtMatlabConverterComplex() {}
+    // Each conversion returns true when it completes and false from its error handler.
+    virtual bool hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx);
+    virtual bool Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a);
+    // Writes a short banner describing the converter to os.
+    virtual void printInfo(std::ostream& os) const;
+
+protected:
+};
+
+// Copy a complex hoNDArray into a newly created MATLAB complex array, writing the
+// real and imaginary parts to the buffers returned by mxGetData/mxGetImagData.
+// Returns false for any T other than std::complex<float>/std::complex<double>.
+template <typename T> 
+bool gtMatlabConverterComplex<T>::
+hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dim = a.get_dimensions();
+        const size_t ndim = dim->size();
+
+        // A vector owns the dimension list so it is freed on every exit path (the
+        // former new[] buffer leaked); it is never empty so &dims[0] stays valid.
+        std::vector<mwSize> dims(ndim>0 ? ndim : 1, 1);
+        size_t ii;
+        for ( ii=0; ii<ndim; ii++ ) dims[ii] = static_cast<mwSize>( (*dim)[ii] );
+
+        const size_t N = a.get_number_of_elements();
+        const T* pA = a.begin();
+
+        if ( typeid(T) == typeid(std::complex<float>) )
+        {
+            aMx = mxCreateNumericArray(static_cast<mwSize>(ndim), &dims[0], mxSINGLE_CLASS, mxCOMPLEX);
+            float* pr = static_cast<float*>(mxGetData(aMx));
+            float* pi = static_cast<float*>(mxGetImagData(aMx));
+            for ( ii=0; ii<N; ii++ )
+            {
+                pr[ii] = static_cast<float>(pA[ii].real());
+                pi[ii] = static_cast<float>(pA[ii].imag());
+            }
+        }
+        else if ( typeid(T) == typeid(std::complex<double>) )
+        {
+            aMx = mxCreateNumericArray(static_cast<mwSize>(ndim), &dims[0], mxDOUBLE_CLASS, mxCOMPLEX);
+            double* pr = static_cast<double*>(mxGetData(aMx));
+            double* pi = static_cast<double*>(mxGetImagData(aMx));
+            for ( ii=0; ii<N; ii++ )
+            {
+                pr[ii] = static_cast<double>(pA[ii].real());
+                pi[ii] = static_cast<double>(pA[ii].imag());
+            }
+        }
+        else return false; // unsupported T: previously reported success with aMx untouched
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverterComplex::hoNDArray2Matlab(const hoNDArray<T>& a, mxArray*& aMx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Fill a from a MATLAB single/double array with the same dimensions. Real input
+// (mxIsComplex false) is read with a zero imaginary part; other classes return
+// false instead of reporting success with a left unfilled.
+template <typename T> 
+bool gtMatlabConverterComplex<T>::
+Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a)
+{
+    try
+    {
+        const bool isDouble = mxIsDouble(aMx);
+        if ( !isDouble && !mxIsSingle(aMx) ) return false;
+        const bool hasImag = mxIsComplex(aMx);
+        mwSize ndim = mxGetNumberOfDimensions(aMx);
+        const mwSize* dims = mxGetDimensions(aMx);
+
+        std::vector<size_t> dim(ndim);
+        size_t ii;
+        for ( ii=0; ii<ndim; ii++ ) dim[ii] = static_cast<size_t>(dims[ii]);
+
+        a.create(&dim);
+        size_t N = a.get_number_of_elements();
+        T* pA = a.begin();
+
+        if ( isDouble )
+        {
+            const double* pr = static_cast<const double*>(mxGetData(aMx));
+            const double* pi = hasImag ? static_cast<const double*>(mxGetImagData(aMx)) : NULL;
+            for ( ii=0; ii<N; ii++ )
+            {
+                pA[ii] = T(pr[ii], pi ? pi[ii] : 0.0);
+            }
+        }
+        else
+        {
+            const float* pr = static_cast<const float*>(mxGetData(aMx));
+            const float* pi = hasImag ? static_cast<const float*>(mxGetImagData(aMx)) : NULL;
+            for ( ii=0; ii<N; ii++ )
+            {
+                pA[ii] = T(pr[ii], pi ? pi[ii] : 0.0f);
+            }
+        }
+    }
+    catch(...)
+    {
+        mexErrMsgTxt("Errors happened in gtMatlabConverterComplex::Matlab2hoNDArray(const mxArray* aMx, hoNDArray<T>& a) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Write the three-line banner identifying the complex-array converter to os.
+template <typename T> 
+void gtMatlabConverterComplex<T>::printInfo(std::ostream& os) const
+{
+    const char* const rule = "--------------------------------------------------";
+    os << rule << std::endl << "Gadgetron matlab Converter for complex arrays ..." << std::endl;
+    os << rule << std::endl;
+}
+
+}
diff --git a/toolboxes/gtplus/solver/gtPlusLSQRSolver.h b/toolboxes/gtplus/solver/gtPlusLSQRSolver.h
new file mode 100644
index 0000000..56b1fd8
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusLSQRSolver.h
@@ -0,0 +1,294 @@
+/** \file       gtPlusLSQRSolver.h
+    \brief      Implement the LSQR linear solver for Ax=b
+    \author     Hui Xue
+
+    Ref to:
+    http://www.stanford.edu/group/SOL/software/lsqr.html
+    C. C. Paige and M. A. Saunders, LSQR: An algorithm for sparse linear equations and sparse least squares, TOMS 8(1), 43-71 (1982). 
+    C. C. Paige and M. A. Saunders, Algorithm 583; LSQR: Sparse linear equations and least-squares problems, TOMS 8(2), 195-209 (1982).
+*/
+
+#pragma once
+
+#include "gtPlusLinearSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusLSQRSolver : public gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>
+{
+public:
+    // LSQR solver for Ax=b built on gtPlusLinearSolver; see the references in the file header.
+    typedef gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type> BaseClass;
+
+    typedef typename BaseClass::ValueType ValueType;
+    // scalar type used for norms and step lengths in solve(); presumably the real type behind ValueType
+    typedef typename realType<ValueType>::Type value_type;
+
+    gtPlusLSQRSolver();
+    virtual ~gtPlusLSQRSolver();
+    // Runs the LSQR iteration starting from *x0_ and writes the result to x; false on failure.
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x);
+
+    virtual void printInfo(std::ostream& os) const;
+    // base-class members named here so they can be used unqualified inside this template
+    using BaseClass::iterMax_;
+    using BaseClass::thres_;
+    using BaseClass::x0_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    using BaseClass::callback_;
+    using BaseClass::oper_;
+};
+
+// ===================================================================================== //
+//                           Implementation of template function                         //
+// ===================================================================================== //
+
+// Default LSQR settings: at most 70 iterations and a convergence threshold of 1e-4.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::gtPlusLSQRSolver() : BaseClass()
+{
+    thres_ = 1e-4;
+    iterMax_ = 70;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusLSQRSolver() 
+{
+    // empty: this class adds no cleanup of its own
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+solve(const Array_Type_I& b, Array_Type_O& x)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(oper_!=NULL);
+        // LSQR iteration for Ax=b, started from *x0_ (dereferenced unchecked); oper_ supplies A and its transpose
+        x = *x0_;
+
+        // Set up for the method
+        value_type n2b;
+        Gadgetron::norm2(b, n2b);
+        // flag stays 1 until a convergence test passes (0) or stagnation is detected (3)
+        int flag = 1;
+
+        value_type tolb = thres_ * n2b;
+        Array_Type_I u(b);
+
+        // u = u - A(x, varargin{:}, 'notransp');
+        // u = b - A*x0
+        GADGET_CHECK_RETURN_FALSE(oper_->forwardOperator(x, u));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(b, u, u));
+
+        value_type beta;
+        Gadgetron::norm2(u, beta);
+
+        value_type normr(beta);
+        if (std::abs(beta)>0)
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/beta, u));
+        }
+        // c, s and phibar are scalar recurrences updated once per iteration below
+        double c = 1;
+        double s = 0;
+        value_type phibar = beta;
+
+        // v = A(u, varargin{:},'transp');
+        Array_Type_I v(b);
+        GADGET_CHECK_RETURN_FALSE(oper_->adjointOperator(u, v));
+
+        value_type alpha;
+        Gadgetron::norm2(v, alpha);
+        if (std::abs(alpha)>0)
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/alpha, v));
+        }
+
+        Array_Type_I d(b);
+        Gadgetron::clear(d);
+
+        value_type normar;
+        normar = alpha * beta;
+
+        // Check for all zero solution
+        if ( std::abs(normar) < DBL_EPSILON )
+        {
+            Gadgetron::clear(x);
+            return true;
+        }
+
+        value_type norma(0);
+        value_type sumnormd2 = 0;
+        size_t stag = 0;
+        size_t iter = iterMax_; // NOTE(review): iter is never read below
+        size_t  maxstagsteps = 3;
+
+        // loop over maxit iterations (unless convergence or failure)
+
+        Array_Type_I z(v), dtmp(d), ztmp(d), vt(v);
+        Array_Type_I normaVec(3);
+        // normaVec carries {norma, alpha, beta}; its 2-norm becomes the updated norma
+        value_type thet, rhot, rho, phi, tmp, tmp2;
+
+        size_t ii;
+        for ( ii=0; ii<iterMax_; ii++ )
+        {
+            // z = v;
+            memcpy(z.begin(), v.begin(), v.get_number_of_bytes());
+
+            // u = A(z, varargin{:},'notransp') - alpha*u;
+            GADGET_CHECK_RETURN_FALSE(oper_->forwardOperator(z, dtmp));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( alpha, u));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract( dtmp, u, u));
+            // NOTE(review): unlike the set-up above, beta is not tested for zero before the division
+            Gadgetron::norm2(u, beta);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/beta, u));
+
+            normaVec(0) = norma;
+            normaVec(1) = alpha;
+            normaVec(2) = beta;
+            Gadgetron::norm2(normaVec, norma);
+
+            thet = - s * alpha;
+            rhot = c * alpha;
+            rho = (value_type)( std::sqrt( (double)(rhot*rhot + beta*beta) ));
+            c = rhot / rho;
+            s = - beta / rho;
+            phi = c * phibar;
+            if ( std::abs(phi)< DBL_EPSILON )
+            {
+                stag = 1;
+            }
+
+            phibar = s * phibar;
+
+            // d = (z - thet * d) / rho;
+            //dtmp = d;
+            memcpy(dtmp.begin(), d.begin(), d.get_number_of_bytes());
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( thet, dtmp));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract( z, dtmp, ztmp));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/rho, ztmp));
+            //d = ztmp;
+            memcpy(d.begin(), ztmp.begin(), d.get_number_of_bytes());
+
+            // sumnormd2 = sumnormd2 + (norm(d(:)))^2;
+            Gadgetron::norm2(d, tmp);
+            sumnormd2 += (tmp*tmp);
+
+            // Check for stagnation of the method
+            Gadgetron::norm2(x, tmp2);
+
+            if ( std::abs(phi)*std::abs(tmp) < DBL_EPSILON*std::abs(tmp2) )
+            {
+                stag++;
+            }
+            else
+            {
+                stag = 0;
+            }
+
+            // check for convergence in min{|b-A*x|}
+            if ( std::abs(normar/(norma*normr)) <= thres_ )
+            {
+                flag = 0;
+                break;
+            }
+
+            // check for convergence in A*x=b
+            if (std::abs(normr) <= std::abs(tolb) )
+            {
+                flag = 0;
+                break;
+            }
+
+            if (stag >= maxstagsteps)
+            {
+                flag = 3;
+                break;
+            }
+
+            //if (printIter_)
+            //{
+            //    GADGET_MSG("Iteration " << ii << " - normar/(norma*normr) = " << std::abs(normar/(norma*normr)) << " - normr = " << std::abs(normr) );
+            //}
+
+            // x = x + phi * d;
+            //dtmp = d;
+            memcpy(dtmp.begin(), d.begin(), d.get_number_of_bytes());
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( phi, dtmp));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::add( x, dtmp, x));
+
+            normr = std::abs( (double)s) * normr;
+
+            // vt = A(u, varargin{:},'transp');
+            GADGET_CHECK_RETURN_FALSE(oper_->adjointOperator(u, vt));
+
+            // v = vt - beta * v;
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( beta, v));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract( vt, v, v));
+
+            Gadgetron::norm2(v, alpha);
+            // NOTE(review): alpha is not tested for zero before this division either
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( value_type(1.0)/alpha, v));
+
+            normar = alpha * std::abs( (value_type)s * phi);
+        }
+
+        if (printIter_)
+        {
+            GADGET_MSG("Total iteration number is  " << ii << " - relative norm is " << std::abs(normar/(norma*normr)) << " ... ");
+        }
+        // flag is still 1 when the loop ended without a break: repeat both convergence tests once more
+        if (flag == 1)
+        {
+            if ( normar/(norma*normr) <= thres_ )
+            {
+                flag = 0;
+            }
+
+            if (std::abs(normr) <= std::abs(tolb) )
+            {
+                flag = 0;
+            }
+        }
+
+        //if (printIter_)
+        //{
+        //    value_type relres = normr/n2b;
+        //    GADGET_MSG("Flag = " << flag << " - relres = " << std::abs(relres) );
+        //}
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::solve(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Write the three-line LSQR solver banner to os.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusLSQRSolver<Array_Type_I, Array_Type_O, Oper_Type>::printInfo(std::ostream& os) const
+{
+    os << "-------------- GTPlus ISMRMRD linear LSQR solver -------------" << std::endl
+       << "The linear solver solves Ax=b problem" << std::endl
+       << "------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusLinearSolver.h b/toolboxes/gtplus/solver/gtPlusLinearSolver.h
new file mode 100644
index 0000000..00cb5c3
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusLinearSolver.h
@@ -0,0 +1,93 @@
+/** \file       gtPlusLinearSolver.h
+    \brief      Define the base class for linear solver
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusLinearSolver : public gtPlusSolver<Array_Type_I, Array_Type_O>
+{
+public:
+    // Base class of the linear solvers: stores the operator pointer plus the iteration settings.
+    typedef gtPlusSolver<Array_Type_I, Array_Type_O> BaseClass;
+
+    typedef typename BaseClass::ValueType ValueType;
+
+    gtPlusLinearSolver();
+    virtual ~gtPlusLinearSolver();
+    // get()/set() read and replace the operator pointer oper_; set() stores the address of op
+    Oper_Type* get();
+    void set(Oper_Type& op);
+    // solve for right-hand side b and write the result to x; implemented by the derived solver
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x) = 0;
+
+    virtual void printInfo(std::ostream& os) const;
+
+    // number of max iterations
+    size_t iterMax_;
+
+    // threshold for delta change of residual
+    double thres_;
+
+    // initial guess for the solver
+    Array_Type_O* x0_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    using BaseClass::callback_;
+    Oper_Type* oper_;
+};
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+gtPlusLinearSolver() : BaseClass(), iterMax_(0), thres_(0), x0_(NULL), oper_(NULL)
+{
+    // iterMax_, thres_ and x0_ used to be left indeterminate; derived solvers (e.g. LSQR) set their own limits
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusLinearSolver() 
+{
+    // empty: neither oper_ nor x0_ is deleted here
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+Oper_Type* gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::get()
+{
+    return oper_; // NULL until set() has stored an operator
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::set(Oper_Type& op)
+{
+    oper_ = &op; // only the address is kept, so op has to stay alive while the solver uses it
+}
+
+// Write the three-line banner of the linear solver base class to os.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::printInfo(std::ostream& os) const
+{
+    using std::endl;
+    os << "-------------- GTPlus ISMRMRD linear solver --------------------" << endl;
+    os << "Linear solver for GtPlus ISMRMRD package ... " << endl;
+    os << "----------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusNCGSolver.h b/toolboxes/gtplus/solver/gtPlusNCGSolver.h
new file mode 100644
index 0000000..4055bf6
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusNCGSolver.h
@@ -0,0 +1,380 @@
+/** \file       gtPlusNCGSolver.h
+    \brief      Implement the non-linear conjugate gradient solver for scalar function minimization problem
+                The function to be optmized is required to supply the gradient computation
+
+                The Secant line search is used with the non-linear CG solver.
+
+    \author     Hui Xue
+
+    Ref to:
+    http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
+    http://en.wikipedia.org/wiki/Secant_method
+*/
+
+#pragma once
+
+#include "gtPlusNonLinearSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusNCGSolver : public gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>
+{
+public:
+    // Non-linear conjugate gradient solver with a secant line search over the weighted operator list.
+    typedef gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type> BaseClass;
+    typedef typename BaseClass::ValueType ValueType;
+    typedef typename realType<ValueType>::Type value_type;
+    typedef typename BaseClass::Oper_Elem_Type Oper_Elem_Type;
+    typedef typename BaseClass::Oper_List_Type Oper_List_Type;
+
+    gtPlusNCGSolver();
+    virtual ~gtPlusNCGSolver();
+    // minimises starting from *x0_ and writes the result to x; the first argument is not used
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x);
+    // weighted sum over all operators of their gradient / objective value at x
+    virtual bool grad(const Array_Type_I& x, Array_Type_I& grad);
+    virtual bool obj(const Array_Type_I& x, ValueType& obj);
+
+    virtual void printInfo(std::ostream& os) const;
+
+    /// number of max iterations
+    size_t iterMax_;
+
+    /// threshold for delta change of gradient
+    double gradThres_;
+
+    /// threshold for delta change of objective function
+    double objThres_;
+
+    /// scale factor of initial step size of line search
+    double beta_;
+
+    /// initial step size of line search
+    double t0_;
+
+    /// number of max line search iterations (secant line search)
+    size_t secantIterMax_;
+
+    /// gradient threshold for secant line search
+    double secantThres_;
+
+    /// sometimes the secantThres can increase during line search
+    /// the maximal allowed secantThres increments compared to previous secant iteration
+    double secantRatio_;
+
+    /// initial guess for the solver; solve() dereferences it, so it must be set before solve() is called
+    Array_Type_O* x0_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    using BaseClass::callback_;
+    using BaseClass::operList_;
+    // scratch array that grad() passes to every operator after the first
+    Array_Type_I gradBuf_;
+};
+
+// Set the default NCG parameters; x0_ now starts as NULL (it was left indeterminate) and must be set before solve().
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::gtPlusNCGSolver() : BaseClass(), x0_(NULL)
+{
+    iterMax_ = 10;
+    gradThres_ = 1e-4;
+    objThres_ = 0.1;
+    beta_ = 0.5;
+    t0_ = 2.0;
+    secantIterMax_ = 10;
+    secantThres_ = 1e-3;
+    secantRatio_ = 2;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusNCGSolver() 
+{
+    // empty: x0_ is not deleted here
+}
+
+// Write to g the weighted sum of the gradients of all registered operators at x.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::grad(const Array_Type_I& x, Array_Type_I& g)
+{
+    try
+    {
+        g.create(x.get_dimensions());
+        const size_t numOps = operList_.size();
+        if ( numOps > 0 )
+        {
+            GADGET_CHECK_RETURN_FALSE(operList_[0].first->grad(x, g));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(operList_[0].second, g));
+        }
+        // the remaining operators go through gradBuf_ and are accumulated into g
+        for ( size_t k=1; k<numOps; ++k )
+        {
+            GADGET_CHECK_RETURN_FALSE(operList_[k].first->grad(x, gradBuf_));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(operList_[k].second, gradBuf_));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::add(gradBuf_, g, g));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::grad(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Write to ob the weighted sum of the objective values of all registered operators at x (0 when none).
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::obj(const Array_Type_I& x, ValueType& ob)
+{
+    try
+    {
+        const size_t numOps = operList_.size();
+        if ( numOps == 0 )
+        {
+            ob = 0;
+            return true;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(operList_[0].first->obj(x, ob));
+        ob *= operList_[0].second;
+
+        ValueType term = 0;
+        for ( size_t k=1; k<numOps; ++k )
+        {
+            GADGET_CHECK_RETURN_FALSE(operList_[k].first->obj(x, term));
+            ob += operList_[k].second * term;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::obj(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+solve(const Array_Type_I& /*b*/, Array_Type_O& x)
+{
+    try
+    {
+        if ( operList_.empty() ) return true;
+        // NOTE(review): v is only referenced by the commented-out debug lines below
+        value_type v;
+
+        // initial gradient
+        Array_Type_I g0(*x0_);
+        GADGET_CHECK_RETURN_FALSE(this->grad(*x0_, g0));
+
+        //Gadgetron::norm2(*x0_, v); GADGET_MSG(v);
+        //Gadgetron::norm2(g0, v); GADGET_MSG(v);
+
+        // dx = -g0;
+        Array_Type_I dx(g0);
+        GADGET_CHECK_RETURN_FALSE( Gadgetron::scal( (value_type)(-1), dx ) );
+
+        //Gadgetron::norm2(dx, v); GADGET_MSG(v);
+
+        // initialize x
+        x = *x0_;
+        // t0: initial step of each line search; alpha: secant step; phi/phiPrev: dot products of a gradient with dx
+        // secant parameters
+        value_type bk, dxNorm, t0(t0_);
+        ValueType oriF, prevF, currF, deltaD, thresValue, prevThresValue, phiPrev(0), v1, v2, v3, phi, alpha(0);
+        Array_Type_I g1(g0), gTmp(g0), sx(*x0_), xTmp(*x0_), dxTmp(dx), prevX(*x0_);
+        size_t nIter(0);
+        // probe the objective at x+t0*dx; t0 is divided by beta_ (at most 3 times) while the relative change is below 5%
+        // guess the t0_
+        this->obj(x, oriF);
+
+        dxTmp = dx;
+        Gadgetron::scal(t0, dxTmp);
+        Gadgetron::add(x, dxTmp, xTmp);
+
+        this->obj(xTmp, currF);
+
+        if (printIter_)
+        {
+            GADGET_MSG("To determine t0, --- ori and curr obj: " << oriF << " - " << currF << " ... ");
+        }
+
+        unsigned int numOfTries = 0;
+        while ( (std::abs(currF.real() - oriF.real())/currF.real() < 0.05) && (numOfTries < 3) )
+        {
+            numOfTries++;
+
+            t0 /= beta_;
+
+            dxTmp = dx;
+            Gadgetron::scal(t0, dxTmp);
+            Gadgetron::add(x, dxTmp, xTmp);
+
+            this->obj(xTmp, currF);
+
+            GADGET_MSG("t0 is " << t0 << " ... ");
+            GADGET_MSG("To determine t0, --- ori and curr obj: " << oriF << " - " << currF << " ... ");
+        }
+        // main loop: secant line search along dx, then a Fletcher-Reeves update of the search direction
+        prevF = oriF;
+        while (1)
+        {
+            // secant line-search
+            // wGradient(x+t0*dx);
+            dxTmp = dx;
+            Gadgetron::scal(t0, dxTmp);
+            Gadgetron::add(x, dxTmp, xTmp);
+
+            //Gadgetron::norm2(xTmp, v); GADGET_MSG(v);
+
+            this->grad(xTmp, gTmp);
+
+            //Gadgetron::norm2(gTmp, v); GADGET_MSG(v);
+
+            // phiPrev = gTmp(:)'*dx(:);
+            Gadgetron::dotc(gTmp, dx, phiPrev);
+            alpha = -t0;
+            Gadgetron::dotc(dx, dx, deltaD);
+
+            thresValue = std::conj(alpha)*alpha*deltaD;
+            prevThresValue = thresValue;
+            // iterate while conj(alpha)*alpha*deltaD stays above secantThres_ and within secantRatio_ times its previous value
+            size_t lsiter = 0;
+            sx = x;
+
+            while ( (lsiter<secantIterMax_) 
+                && (thresValue.real()>secantThres_) 
+                && (thresValue.real()<=secantRatio_*prevThresValue.real()) )
+            {
+                if ( lsiter == 0 )
+                {
+                    gTmp = g0;
+                }
+                else
+                {
+                    this->grad(sx, gTmp);
+                }
+
+                Gadgetron::dotc(gTmp, dx, phi);
+                // alpha = alpha * (phi.real()/(phiPrev.real()-phi.real()));
+                alpha = alpha * phi/(phiPrev-phi);
+                phiPrev = phi;
+                lsiter = lsiter+1;
+                prevThresValue = std::abs(thresValue);
+                thresValue = std::conj(alpha)*alpha*deltaD;
+
+                if ( thresValue.real() <= secantRatio_*prevThresValue.real() )
+                {
+                    dxTmp = dx;
+                    Gadgetron::scal(alpha, dxTmp);
+                    Gadgetron::add(sx, dxTmp, sx);
+                }
+            }
+
+            // control the number of line searches by adapting the initial step search
+            if (lsiter>2)
+            {
+                t0 *= beta_;
+            }
+
+            if (lsiter<1)
+            {
+                t0 /= beta_;
+            }
+
+            prevX = x;
+            x = sx;
+
+            this->obj(x, currF);
+
+            // conjugate gradient calculation
+            this->grad(x, g1);
+
+            // Fletcher - Reeves updates
+            Gadgetron::dotc(g1, g1, v1);
+            Gadgetron::dotc(g0, g0, v2);
+            bk = v1.real()/(v2.real()+DBL_EPSILON);
+
+            g0 = g1;
+
+            // dx =  - g1 + bk.* dx;
+            dxTmp = dx;
+            Gadgetron::scal(bk, dxTmp);
+            Gadgetron::subtract(dxTmp, g1, dx);
+
+            if (printIter_)
+            {
+                GADGET_MSG("Iteration " << nIter << " --- prev and curr obj: " << prevF << " - " << currF << " - line search: " << lsiter);
+            }
+            // stop on iteration count, small direction norm or insufficient decrease; restore prevX if the objective went up
+            // perform call back
+            if ( callback_ != NULL )
+            {
+                Gadgetron::norm2(dx, dxNorm);
+                if ( (nIter>iterMax_) || (dxNorm<gradThres_) || (callback_->exit() && (prevF.real()-currF.real()<objThres_)) )
+                {
+                    if ( prevF.real() < currF.real() )
+                    {
+                        x = prevX;
+                    }
+                    break;
+                }
+
+                GADGET_CHECK_RETURN_FALSE(callback_->callBack(nIter, x));
+                GADGET_MSG("exit is " << callback_->exit());
+
+                nIter = nIter + 1;
+            }
+            else
+            {
+                nIter = nIter + 1;
+
+                Gadgetron::norm2(dx, dxNorm);
+                if ( (nIter>iterMax_) || (dxNorm<gradThres_) || (prevF.real()-currF.real()<objThres_) )
+                {
+                    if ( prevF.real() < currF.real() )
+                    {
+                        x = prevX;
+                    }
+                    break;
+                }
+            }
+
+            prevF = currF;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::solve(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Write the three-line banner of the non-linear CG solver to os.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNCGSolver<Array_Type_I, Array_Type_O, Oper_Type>::printInfo(std::ostream& os) const
+{
+    using std::endl;
+    os << "-------------- GTPlus ISMRMRD ncg solver -------------" << endl;
+    os << "The non-linear cg solver " << endl;
+    os << "------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusNonLinearSolver.h b/toolboxes/gtplus/solver/gtPlusNonLinearSolver.h
new file mode 100644
index 0000000..ba31d33
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusNonLinearSolver.h
@@ -0,0 +1,122 @@
+/** \file       gtPlusNonLinearSolver.h
+    \brief      Define the base class for GtPlus non-linear solvers
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusSolver.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+class gtPlusNonLinearSolver : public gtPlusSolver<Array_Type_I, Array_Type_O>
+{
+public:
+    // Base class of the non-linear solvers: keeps a list of (operator pointer, weight) pairs.
+    typedef gtPlusSolver<Array_Type_I, Array_Type_O> BaseClass;
+
+    typedef typename BaseClass::ValueType ValueType;
+
+    // one operator is related to a weight
+    typedef std::pair<Oper_Type*, ValueType> Oper_Elem_Type;
+    // multiple operator can be added to a solver
+    typedef std::vector< Oper_Elem_Type > Oper_List_Type;
+
+    gtPlusNonLinearSolver();
+    virtual ~gtPlusNonLinearSolver();
+    // Both accessors were declared without any definition in this header; they are now defined inline.
+    Oper_List_Type getOperList() { return operList_; }
+    void setOperList(Oper_List_Type& opero) { operList_ = opero; }
+    // add() appends a pair; remove() pops the last pair into its arguments (NULL and 0 when the list is empty)
+    void add(Oper_Type& op, ValueType a);
+    void remove(Oper_Type*& op, ValueType& a);
+    // overwrite the pair stored at index ind
+    bool set(size_t ind, Oper_Type& op, ValueType a);
+
+    // main function to perform the solver
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x) = 0;
+
+    virtual void printInfo(std::ostream& os) const;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::printIter_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_util_complex_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    using BaseClass::callback_;
+    Oper_List_Type operList_;
+};
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+gtPlusNonLinearSolver() : BaseClass()
+{
+    // operList_ is default-constructed, i.e. the solver starts with no operators
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+~gtPlusNonLinearSolver() 
+{
+    operList_.clear(); // drops the stored pointers only; the operators themselves are not deleted here
+}
+
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::
+add(Oper_Type& op, ValueType a)
+{
+    operList_.push_back(Oper_Elem_Type(&op, a)); // appends (address of op, weight a) at the end of the list
+}
+
+// Pop the last operator/weight pair into op and a; op becomes NULL and a becomes 0 when the list is empty.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::remove(Oper_Type*& op, ValueType& a)
+{
+    if ( !operList_.empty() )
+    {
+        const Oper_Elem_Type& last = operList_.back();
+        op = last.first;
+        a = last.second;
+        operList_.pop_back();
+        return;
+    }
+
+    op = NULL;
+    a = 0;
+}
+
+// Overwrite the operator/weight pair at index ind. An out-of-range ind now returns false
+// with the list untouched; it used to warn and then write past the end of the vector.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+bool gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::set(size_t ind, Oper_Type& op, ValueType a)
+{
+    if ( ind >= operList_.size() )
+    {
+        GADGET_WARN_MSG("ind >= operList_.size()");
+        return false;
+    }
+    operList_[ind].first = &op;
+    operList_[ind].second = a;
+    return true;
+}
+
+// Write the three-line banner of the non-linear solver base class to os.
+template <typename Array_Type_I, typename Array_Type_O, typename Oper_Type>
+void gtPlusNonLinearSolver<Array_Type_I, Array_Type_O, Oper_Type>::printInfo(std::ostream& os) const
+{
+    using std::endl;
+    os << "-------------- GTPlus ISMRMRD nonlinear solver -----------------" << endl;
+    os << "ISMRMRD nonlinear solver ... " << endl;
+    os << "----------------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/solver/gtPlusSolver.h b/toolboxes/gtplus/solver/gtPlusSolver.h
new file mode 100644
index 0000000..d4d9e79
--- /dev/null
+++ b/toolboxes/gtplus/solver/gtPlusSolver.h
@@ -0,0 +1,150 @@
+/** \file       gtPlusSolver.h
+    \brief      Define the base class for GtPlus solvers
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusMemoryManager.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename Array_Type_I, typename Array_Type_O>
+class gtPlusSolverCallBack
+{
+public:
+    // Callback object that a gtPlusSolver can hold through its CBType pointer; all hooks are virtual.
+    gtPlusSolverCallBack();
+    virtual ~gtPlusSolverCallBack();
+
+    // if true, current solver will exit; the base implementation returns exit_
+    virtual bool exit();
+    // hook taking an iteration index and x; the base implementation ignores both and returns true
+    virtual bool callBack(size_t iter, Array_Type_O& x);
+    // writes a short banner describing the callback to os
+    virtual void printInfo(std::ostream& os) const;
+    // value reported by the base exit(); the constructor initializes it to true
+    bool exit_;
+};
+
+template <typename Array_Type_I, typename Array_Type_O>
+class gtPlusSolver
+{
+public:
+    // Abstract base for GtPlus solvers; derived classes implement solve().
+    typedef gtPlusSolver<Array_Type_I, Array_Type_O> Self;
+    // element type of the input array
+    typedef typename Array_Type_I::value_type ValueType;
+    // type of the optional callback object
+    typedef gtPlusSolverCallBack<Array_Type_I, Array_Type_O> CBType;
+
+    gtPlusSolver();
+    virtual ~gtPlusSolver();
+    // Callback accessors, defined inline. The pointer is stored as given; this class never deletes it.
+    CBType* getCallBack() { return callback_; }
+    void setCallBack(CBType* pCB) { callback_ = pCB; }
+    // Pure virtual entry point taking a const input b and a mutable x.
+    virtual bool solve(const Array_Type_I& b, Array_Type_O& x) = 0;
+    // prints a short banner describing the solver
+    virtual void printInfo(std::ostream& os) const;
+
+    // clock for timing (the constructor disables timing-in-destruction on all three)
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+    // initialized to false by the constructor; presumably enables use of the timers above
+    bool performTiming_;
+    // initialized to false by the constructor; presumably enables per-iteration output
+    bool printIter_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<ValueType> gtPlus_util_;
+    gtPlusISMRMRDReconUtilComplex<ValueType> gtPlus_util_complex_;
+
+    // memory manager
+    boost::shared_ptr<gtPlusMemoryManager> gtPlus_mem_manager_;
+
+protected:
+    // attached callback; NULL until setCallBack() stores a pointer
+    CBType* callback_;
+};
+
+/// Construct the callback with exit_ set to true.
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::gtPlusSolverCallBack()
+    : exit_(true)
+{
+}
+
+/// Destructor; the only data member is a bool, so nothing needs releasing.
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::~gtPlusSolverCallBack()
+{
+    // nothing to do
+}
+
+/// Report the current value of exit_ (per the class comment: if true, the current solver will exit).
+template <typename Array_Type_I, typename Array_Type_O>
+bool gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::exit()
+{
+    return exit_;
+}
+
+/// Default hook: ignores the iteration index and the array, and always returns true.
+template <typename Array_Type_I, typename Array_Type_O>
+bool gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::callBack(size_t /*iter*/, Array_Type_O& /*x*/)
+{
+    return true;
+}
+
+/// Print a three-line banner describing the solver callback.
+/// \param os  stream that receives the banner; every line is terminated with std::endl
+template <typename Array_Type_I, typename Array_Type_O>
+void gtPlusSolverCallBack<Array_Type_I, Array_Type_O>::printInfo(std::ostream& os) const
+{
+    os << "-------------- GTPlus ISMRMRD solver callback ------------------" << std::endl;
+    os << "A callback scheme for ISMRMRD solvers ... " << std::endl;
+    os << "----------------------------------------------------------------" << std::endl;
+}
+
+/// Default state: timing and iteration printing off, no callback; initializers follow member declaration order.
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolver<Array_Type_I, Array_Type_O>::gtPlusSolver() : performTiming_(false), printIter_(false), callback_(NULL)
+{
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+/// Destructor; performs no explicit cleanup (the callback pointer is not deleted).
+template <typename Array_Type_I, typename Array_Type_O>
+gtPlusSolver<Array_Type_I, Array_Type_O>::~gtPlusSolver()
+{
+    // nothing to do
+}
+
+/// Print a three-line banner identifying the solver base class.
+/// \param os  stream that receives the banner; every line is terminated with std::endl
+template <typename Array_Type_I, typename Array_Type_O>
+void gtPlusSolver<Array_Type_I, Array_Type_O>::printInfo(std::ostream& os) const
+{
+    os << "-------------- GTPlus ISMRMRD solver ------------------" << std::endl;
+    os << "GtPlus ISMRMRD solvers ... " << std::endl;
+    os << "-------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/ut/CMakeLists.txt b/toolboxes/gtplus/ut/CMakeLists.txt
new file mode 100644
index 0000000..44319d1
--- /dev/null
+++ b/toolboxes/gtplus/ut/CMakeLists.txt
@@ -0,0 +1,74 @@
+ENABLE_TESTING()
+# On Windows, add the Boost library directory to the linker search path.
+if(WIN32)
+    link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+# Header search paths for the gtPlus unit tests (several entries below are listed twice).
+include_directories( ${GTEST_INCLUDE_DIRS} 
+                     ${CMAKE_SOURCE_DIR}/gadgets/core 
+                     ${ACE_INCLUDE_DIR} 
+                     ${Boost_INCLUDE_DIR}
+                     ${FFTW3_INCLUDE_DIR}
+                     ${ISMRMRD_INCLUDE_DIR}
+                     ${ISMRMRD_XSD_INCLUDE_DIR}
+                     ${XSD_INCLUDE_DIR}
+                     ${XERCESC_INCLUDE_DIR}
+                     ${CMAKE_SOURCE_DIR}/dependencies/tinyxml
+                     ${CMAKE_SOURCE_DIR}/toolboxes/mri/pmri/gpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+                     ${CMAKE_SOURCE_DIR}/toolboxes/operators
+                     ${CMAKE_SOURCE_DIR}/toolboxes/operators/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+                     ${CMAKE_SOURCE_DIR}/toolboxes/solvers/cpu
+                     ${CMAKE_SOURCE_DIR}/gadgets/core
+                     ${CMAKE_SOURCE_DIR}/apps/gadgetron 
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+                     ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/hostutils
+                     ${CMAKE_SOURCE_DIR}/toolboxes/gadgettools )
+# Optional MKL support: define USE_MKL and add the MKL include and link directories.
+if (MKL_FOUND)
+    MESSAGE("MKL Found for gtPlus ... ")
+    ADD_DEFINITIONS(-DUSE_MKL)
+    list(APPEND EXTRA_MKL_LIBRARIES mkl_core) # NOTE(review): link_libraries() below uses MKL_LIBRARIES, not this list
+    if ( USE_OPENMP )
+        list(APPEND EXTRA_MKL_LIBRARIES mkl_intel_thread)
+    endif ( USE_OPENMP )
+
+    INCLUDE_DIRECTORIES( ${MKL_INCLUDE_DIR} )
+    LINK_DIRECTORIES( ${MKL_LIB_DIR} ${MKL_COMPILER_LIB_DIR} )
+endif (MKL_FOUND)
+# Libraries linked into the executables declared further down in this file.
+link_libraries(optimized ${ACE_LIBRARIES} debug ${ACE_DEBUG_LIBRARY} 
+                ${GTEST_LIBRARIES} 
+                ${Boost_LIBRARIES} 
+                ${ISMRMRD_LIBRARIES} 
+                ${MKL_LIBRARIES} 
+                cpucore 
+                cpucore_math 
+                gtplus 
+                gadgettools 
+                gadgetronPlus)
+# Additional GPU libraries when CUDA was found (gtplus is repeated here).
+if (CUDA_FOUND)
+    link_libraries(gtplus gpuparallelmri gpucore)
+endif (CUDA_FOUND)
+# One executable per test source; each is built together with gtplus_ut.cpp.
+add_executable(gtplus_ut_util 
+    gtplus_ut.cpp 
+    util_test.cpp )
+
+add_executable(gtplus_ut_grappa 
+    gtplus_ut.cpp 
+    grappa_test.cpp )
+
+add_executable(gtplus_ut_spirit 
+    gtplus_ut.cpp 
+    spirit_test.cpp )
+# The add_test() registration is commented out, so no test is registered with CTest from this file.
+#add_test(gtplus_ut gtplus_ut_util)
diff --git a/toolboxes/gtplus/ut/grappa_test.cpp b/toolboxes/gtplus/ut/grappa_test.cpp
new file mode 100644
index 0000000..65a61a1
--- /dev/null
+++ b/toolboxes/gtplus/ut/grappa_test.cpp
@@ -0,0 +1,613 @@
+
+#ifdef USE_OMP
+#include "omp.h"
+#endif // USE_OMP
+
+#include "Gadget.h"
+#include "ismrmrd.h"
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+
+#include "hoNDArray_utils.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+// #include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusMemoryManager.h"
+#include "hoNDArrayMemoryManaged.h"
+#include "gtPlusSPIRIT2DOperator.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRIT3DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace3DOperator.h"
+#include "gtPlusNCGSolver.h"
+
+#include "GadgetronTimer.h"
+
+#include <boost/thread/mutex.hpp>
+
+#ifdef max
+#undef max
+#endif // max
+
+using namespace Gadgetron;
+using namespace Gadgetron::gtPlus;
+using testing::Types;
+
+// Fixture for the GRAPPA unit tests; data and result folders are derived from GTPLUS_UNITTEST_DIRECTORY.
+template <typename T> class gtPlus_grappa_Test : public ::testing::Test
+{
+protected:
+    virtual void SetUp()
+    {
+        GADGET_MSG("=============================================================================================");
+        const char* utFolder = ::getenv("GTPLUS_UNITTEST_DIRECTORY");
+        ASSERT_TRUE(utFolder != NULL) << "GTPLUS_UNITTEST_DIRECTORY is not set"; // std::string(NULL) would be undefined behavior
+        gtPluse_ut_folder_ = std::string(utFolder);
+        GADGET_MSG("=============================================================================================");
+        GADGET_MSG("Unit Test for GtPlus");
+        gtPluse_ut_data_folder_ = gtPluse_ut_folder_ + "/data/";
+        gtPluse_ut_res_folder_ = gtPluse_ut_folder_ + "/result/";
+        GADGET_MSG("gtPluse_ut_data_folder_ is " << gtPluse_ut_data_folder_);
+        GADGET_MSG("gtPluse_ut_res_folder_ is " << gtPluse_ut_res_folder_);
+        timer_.set_timing_in_destruction(false);
+
+#ifdef WIN32
+    #ifdef USE_OMP
+        /// lock the threads: each OpenMP thread gets an affinity mask with only bit `tid` set
+        #pragma omp parallel default(shared)
+        {
+            const int tid = omp_get_thread_num();
+            const DWORD_PTR mask = (static_cast<DWORD_PTR>(1) << tid); // widen before shifting
+            SetThreadAffinityMask( GetCurrentThread(), mask );
+        }
+    #endif // USE_OMP
+#endif // WIN32
+    }
+
+    std::string gtPluse_ut_folder_;
+    std::string gtPluse_ut_data_folder_;
+    std::string gtPluse_ut_res_folder_;
+    gtPlusIOAnalyze gt_io_;
+    gtPlusISMRMRDReconUtil<T> util_;
+    gtPlusISMRMRDReconUtilComplex<T> utilCplx_;
+    GadgetronTimer timer_;
+};
+
+typedef Types<float, double> realImplementations;
+// the only type list handed to TYPED_TEST_CASE in this file
+typedef Types< std::complex<float> > cpfloatImplementations;
+// the remaining lists are declared but not referenced elsewhere in this file
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+typedef Types<std::complex<float>, std::complex<double> > stdCplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+// instantiate the gtPlus_grappa_Test typed tests for std::complex<float> only
+TYPED_TEST_CASE(gtPlus_grappa_Test, cpfloatImplementations);
+
+TYPED_TEST(gtPlus_grappa_Test, reconWorker2DTGRAPPA_SNRUnit)
+{
+    typedef GT_Complex8 T;
+    // Runs the 2DT GRAPPA workflow (separate calibration, E1 acceleration 2) on the "StdandardDataR2" arrays.
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data (imported from a real-part and an imaginary-part file name)
+    hoNDArray<GT_Complex8> data;
+    gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "StdandardDataR2_Kspace_real", 
+        this->gtPluse_ut_data_folder_ + "StdandardDataR2_Kspace_imag");
+    data.print(std::cout);
+
+    unsigned long long RO = data.get_size(0);
+    unsigned long long E1 = data.get_size(1);
+    unsigned long long CHA = data.get_size(2);
+    unsigned long long PHS = data.get_size(3);
+
+    unsigned long long reconE1 = 144;
+
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    unsigned long long SLC = 1;
+    unsigned long long E2 = 1;
+    unsigned long long CON = 1;
+    unsigned long long REP = 1;
+    unsigned long long SET = 1;
+    unsigned long long SEG = 1;
+    // NOTE(review): REP, SET and SEG are declared but not used later in this test.
+    hoNDArray<GT_Complex8> kspace(RO, E1, CHA, SLC, E2, CON, PHS);
+    memcpy(kspace.begin(), data.begin(), data.get_number_of_bytes()); // assumes data holds RO*E1*CHA*PHS elements -- TODO confirm
+
+    Gadgetron::norm2(kspace, v);
+    GADGET_MSG("kspace = " << v);
+
+    // ref
+    hoNDArray<T> refTmp;
+    gt_io.importArrayComplex(refTmp, this->gtPluse_ut_data_folder_ + "StdandardDataR2_Ref_real", 
+        this->gtPluse_ut_data_folder_ + "StdandardDataR2_Ref_imag");
+
+    hoNDArray<T> ref(refTmp.get_size(0), refTmp.get_size(1), refTmp.get_size(2), SLC, E2, CON, PHS);
+    memcpy(ref.begin(), refTmp.begin(), refTmp.get_number_of_bytes());
+    ref.print(std::cout);
+
+    // noise
+    hoNDArray<T> noise;
+    gt_io.importArrayComplex(noise, this->gtPluse_ut_data_folder_ + "StdandardDataR2_Noise_real", 
+        this->gtPluse_ut_data_folder_ + "StdandardDataR2_Noise_imag");
+    noise.print(std::cout);
+
+    // call the recon
+    typedef std::complex<float> ValueType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef std::pair<Gadgetron::gtPlus::ISMRMRDDIM, unsigned long long> DimensionRecordType;
+    // NOTE(review): no delete for workOrder appears in this test; confirm whether workflow_ takes ownership.
+    WorkOrderType* workOrder = new WorkOrderType;
+
+    boost::shared_ptr< std::vector<size_t> > dims = kspace.get_dimensions();
+
+    GADGET_MSG("[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << 1 << " " << 1 << " " << 1 << "]");
+
+    std::vector<size_t> dimensions_ = *dims;
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+
+    // parameters (local copies that are assigned to workflow_ / workOrder further down)
+    Gadgetron::gtPlus::ISMRMRDDIM dim_4th_ = DIM_Phase;
+    Gadgetron::gtPlus::ISMRMRDDIM dim_5th_ = DIM_Slice;
+    Gadgetron::gtPlus::ISMRMRDDIM workOrder_ShareDim_ = DIM_NONE;
+
+    bool interleaved_same_combinationcoeff_allS_ = false;
+    int interleaved_whichS_combinationcoeff_ = 0;
+
+    bool embedded_averageall_ref_ = false;
+    bool embedded_fullres_coilmap_ = true;
+    bool embedded_same_combinationcoeff_allS_ = false;
+    int embedded_whichS_combinationcoeff_ = 0;
+    bool embedded_ref_fillback_ = true;
+
+    bool separate_averageall_ref_ = true;
+    bool separate_fullres_coilmap_ = false;
+    bool separate_same_combinationcoeff_allS_ = false;
+    int separate_whichS_combinationcoeff_ = 0;
+
+    bool same_coil_compression_coeff_allS_ = true;
+    bool downstream_coil_compression_ = true;
+    double coil_compression_thres_ = 1e-3;
+    int coil_compression_num_modesKept_ = -1;
+
+    unsigned long long csm_kSize_ = 7;
+    unsigned long long csm_powermethod_num_ = 3;
+
+    Gadgetron::gtPlus::ISMRMRDALGO recon_algorithm_ = ISMRMRD_GRAPPA;
+    bool recon_kspace_needed_ = true;
+
+    unsigned long long grappa_kSize_RO_ = 5;
+    unsigned long long grappa_kSize_E1_ = 4;
+    unsigned long long grappa_kSize_E2_ = 4;
+    double grappa_reg_lamda_ = 1e-4;
+
+    // recon: hand the k-space, reference and noise arrays to the workflow
+    workflow_.setDataArray(kspace);
+    workflow_.setRefArray(ref);
+    workflow_.noise_ = &noise;
+
+    workflow_.noiseBW_ = 130;
+    workflow_.overSamplingRatioRO_ = 2.0;
+    workflow_.ADCSamplingTimeinSecond_ = 7800/1e9;
+
+    // for this ut data, the oversampling removal and noise prewhitening on ref are not needed
+    workflow_.ref_remove_oversampling_RO_ = false;
+    workflow_.ref_apply_noisePreWhitening_ = false;
+
+    workflow_.reconSizeRO_ = RO/2;
+    workflow_.reconSizeE1_ = reconE1;
+    workflow_.reconSizeE2_ = 1;
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = dim_4th_;
+    workflow_.dim5th_ = dim_5th_;
+
+    workOrder->CalibMode_ = ISMRMRD_separate;
+    workOrder->acceFactorE1_ = 2;
+    workOrder->acceFactorE2_ = 1;
+
+    workOrder->downstream_coil_compression_ = downstream_coil_compression_;
+    workOrder->coil_compression_thres_ = coil_compression_thres_;
+    workOrder->coil_compression_num_modesKept_ = coil_compression_num_modesKept_;
+    workOrder->csm_kSize_ = csm_kSize_;
+    workOrder->csm_powermethod_num_ = csm_powermethod_num_;
+    workOrder->grappa_kSize_RO_ = grappa_kSize_RO_;
+    workOrder->grappa_kSize_E1_ = grappa_kSize_E1_;
+    workOrder->grappa_kSize_E2_ = grappa_kSize_E2_;
+    workOrder->grappa_reg_lamda_ = grappa_reg_lamda_;
+    workOrder->recon_kspace_needed_ = recon_kspace_needed_;
+    // coil compression is switched on when either the threshold or the number of kept modes is positive
+    if ( coil_compression_thres_>0 || coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+    workOrder->embedded_averageall_ref_ = embedded_averageall_ref_;
+    workOrder->embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    workOrder->embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = embedded_ref_fillback_;
+    workOrder->separate_averageall_ref_ = separate_averageall_ref_;
+    workOrder->separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+    workOrder->interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+
+    worker_grappa_.performTiming_ = true;
+    worker_grappa_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.worker_ = &worker_grappa_;
+    workflow_.workOrder_ = workOrder;
+    // run the three workflow stages in order
+    workflow_.preProcessing();
+    workflow_.recon();
+    workflow_.postProcessing();
+    // export the result as-is, then squeezed, then the output of stdOver3rdDimension on it
+    gt_io.exportArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"StdandardDataR2_res");
+
+    workflow_.res_.squeeze();
+    gt_io.export3DArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"StdandardDataR2_res_squeezed");
+
+    hoNDArray<T> std;
+    bool NMinusOne = true;
+    stdOver3rdDimension(workflow_.res_, std, NMinusOne);
+    gt_io.export2DArrayComplex(std, this->gtPluse_ut_res_folder_+"StdandardDataR2_res_squeezed_std");
+}
+
+TYPED_TEST(gtPlus_grappa_Test, reconWorker2DTGRAPPA)
+{
+    typedef GT_Complex8 T;
+    // Runs the 2DT GRAPPA workflow (separate calibration, E1 acceleration 4) on the "underSampledKSpace" arrays.
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data (real and imaginary parts are imported separately and then combined)
+    hoNDArray<float> real_data;
+    std::string filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_real";
+    gt_io.importArray(real_data, filename);
+    real_data.print(std::cout);
+
+    hoNDArray<float> imag_data;
+    filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_imag";
+    gt_io.importArray(imag_data, filename);
+    imag_data.print(std::cout);
+
+    boost::shared_ptr< hoNDArray<GT_Complex8> > tmp = real_imag_to_complex<GT_Complex8>(&real_data, &imag_data);
+
+    unsigned long long RO = tmp->get_size(0);
+    unsigned long long E1 = tmp->get_size(1);
+    unsigned long long CHA = tmp->get_size(2);
+    unsigned long long PHS = tmp->get_size(3);
+
+    unsigned long long reconE1 = 120;
+
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    unsigned long long SLC = 1;
+    unsigned long long E2 = 1;
+    unsigned long long CON = 1;
+    unsigned long long REP = 1;
+    unsigned long long SET = 1;
+    unsigned long long SEG = 1;
+    // NOTE(review): REP, SET and SEG are declared but not used later in this test.
+    hoNDArray<GT_Complex8> kspace(RO, E1, CHA, SLC, E2, CON, PHS, tmp->begin()); // NOTE(review): whether this copies or wraps tmp's buffer depends on the hoNDArray constructor
+
+    Gadgetron::norm2(kspace, v);
+    GADGET_MSG("kspace = " << v);
+
+    // ref
+    hoNDArray<float> real_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_real";
+    gt_io.importArray(real_ref, filename);
+    real_ref.print(std::cout);
+
+    hoNDArray<float> imag_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_imag";
+    gt_io.importArray(imag_ref, filename);
+    imag_ref.print(std::cout);
+
+    hoNDArray<T> ref;
+    real_imag_to_complex<GT_Complex8>(real_ref, imag_ref, ref);
+
+    Gadgetron::norm2(ref, v);
+    GADGET_MSG("ref = " << v);
+
+    // call the recon
+    typedef std::complex<float> ValueType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+    typedef std::pair<Gadgetron::gtPlus::ISMRMRDDIM, unsigned long long> DimensionRecordType;
+    // NOTE(review): no delete for workOrder appears in this test; confirm whether workflow_ takes ownership.
+    WorkOrderType* workOrder = new WorkOrderType;
+
+    workOrder->data_ = kspace;
+    workOrder->ref_ = ref;
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    GADGET_MSG("[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = [" 
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4] 
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << 1 << " " << 1 << " " << 1 << "]");
+
+    std::vector<size_t> dimensions_ = *dims;
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTGRAPPA<ValueType> worker_grappa_;
+
+    // parameters (local copies that are assigned to workflow_ / workOrder further down)
+    Gadgetron::gtPlus::ISMRMRDDIM dim_4th_ = DIM_Phase;
+    Gadgetron::gtPlus::ISMRMRDDIM dim_5th_ = DIM_Slice;
+    Gadgetron::gtPlus::ISMRMRDDIM workOrder_ShareDim_ = DIM_NONE;
+
+    bool interleaved_same_combinationcoeff_allS_ = false;
+    int interleaved_whichS_combinationcoeff_ = 0;
+
+    bool embedded_averageall_ref_ = false;
+    bool embedded_fullres_coilmap_ = true;
+    bool embedded_same_combinationcoeff_allS_ = false;
+    int embedded_whichS_combinationcoeff_ = 0;
+    bool embedded_ref_fillback_ = true;
+
+    bool separate_averageall_ref_ = false;
+    bool separate_fullres_coilmap_ = true;
+    bool separate_same_combinationcoeff_allS_ = false;
+    int separate_whichS_combinationcoeff_ = 0;
+
+    bool same_coil_compression_coeff_allS_ = true;
+    bool downstream_coil_compression_ = true;
+    double coil_compression_thres_ = 1e-3;
+    int coil_compression_num_modesKept_ = -1;
+
+    unsigned long long csm_kSize_ = 7;
+    unsigned long long csm_powermethod_num_ = 3;
+
+    Gadgetron::gtPlus::ISMRMRDALGO recon_algorithm_ = ISMRMRD_GRAPPA;
+    bool recon_kspace_needed_ = true;
+
+    unsigned long long grappa_kSize_RO_ = 5;
+    unsigned long long grappa_kSize_E1_ = 4;
+    unsigned long long grappa_kSize_E2_ = 4;
+    double grappa_reg_lamda_ = 1e-4;
+
+    // recon: hand the k-space and reference arrays to the workflow
+    workflow_.setDataArray(kspace);
+    workflow_.setRefArray(ref);
+
+    Gadgetron::norm2(workOrder->data_, v); GADGET_MSG("workOrder->data_ = " << v);
+    Gadgetron::norm2(workOrder->ref_, v); GADGET_MSG("workOrder->ref_ = " << v);
+
+    workflow_.reconSizeRO_ = RO;
+    workflow_.reconSizeE1_ = reconE1;
+    workflow_.reconSizeE2_ = 1;
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = dim_4th_;
+    workflow_.dim5th_ = dim_5th_;
+
+    workOrder->CalibMode_ = ISMRMRD_separate;
+    workOrder->start_RO_ = 34;
+    workOrder->end_RO_ = RO-1;
+    workOrder->acceFactorE1_ = 4;
+    workOrder->acceFactorE2_ = 1;
+
+    workOrder->downstream_coil_compression_ = downstream_coil_compression_;
+    workOrder->coil_compression_thres_ = coil_compression_thres_;
+    workOrder->coil_compression_num_modesKept_ = coil_compression_num_modesKept_;
+    workOrder->csm_kSize_ = csm_kSize_;
+    workOrder->csm_powermethod_num_ = csm_powermethod_num_;
+    workOrder->grappa_kSize_RO_ = grappa_kSize_RO_;
+    workOrder->grappa_kSize_E1_ = grappa_kSize_E1_;
+    workOrder->grappa_kSize_E2_ = grappa_kSize_E2_;
+    workOrder->grappa_reg_lamda_ = grappa_reg_lamda_;
+    workOrder->recon_kspace_needed_ = recon_kspace_needed_;
+    // coil compression is switched on when either the threshold or the number of kept modes is positive
+    if ( coil_compression_thres_>0 || coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+    workOrder->embedded_averageall_ref_ = embedded_averageall_ref_;
+    workOrder->embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    workOrder->embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = embedded_ref_fillback_;
+    workOrder->separate_averageall_ref_ = separate_averageall_ref_;
+    workOrder->separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+    workOrder->interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+
+    worker_grappa_.performTiming_ = true;
+    worker_grappa_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.debugFolder_ = this->gtPluse_ut_res_folder_;
+    workflow_.worker_ = &worker_grappa_;
+    workflow_.workOrder_ = workOrder;
+    // export the reference held by the work order before running the workflow
+    gt_io.exportArrayComplex(workflow_.workOrder_->ref_, this->gtPluse_ut_res_folder_+"ref");
+    // run the three workflow stages in order
+    workflow_.preProcessing();
+    workflow_.recon();
+    workflow_.postProcessing();
+
+    gt_io.exportArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"grappa2D_gtPlus_res");
+}
+
+TYPED_TEST(gtPlus_grappa_Test, grappa2D)
+{
+    typedef GT_Complex8 T;
+    // Step-by-step GRAPPA: sum-of-squares image, coil map, kernel calibration, image-domain kernel, unmixing coefficients.
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data (real and imaginary parts are imported separately and then combined)
+    hoNDArray<float> real_data;
+    std::string filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_real";
+    gt_io.importArray(real_data, filename);
+    real_data.print(std::cout);
+
+    hoNDArray<float> imag_data;
+    filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_imag";
+    gt_io.importArray(imag_data, filename);
+    imag_data.print(std::cout);
+
+    boost::shared_ptr< hoNDArray<GT_Complex8> > tmp = real_imag_to_complex<GT_Complex8>(&real_data, &imag_data);
+
+    unsigned long long RO = tmp->get_size(0);
+    unsigned long long E1 = tmp->get_size(1);
+    unsigned long long CHA = tmp->get_size(2);
+    unsigned long long PHS = tmp->get_size(3);
+
+    hoNDArray<GT_Complex8> kspace(RO, E1, CHA, PHS, tmp->begin());
+
+    // ref
+    hoNDArray<float> real_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_real";
+    gt_io.importArray(real_ref, filename);
+    real_ref.print(std::cout);
+
+    hoNDArray<float> imag_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_imag";
+    gt_io.importArray(imag_ref, filename);
+    imag_ref.print(std::cout);
+
+    hoNDArray<T> ref;
+    real_imag_to_complex<GT_Complex8>(real_ref, imag_ref, ref);
+
+    Gadgetron::norm2(ref, v);
+    GADGET_MSG("ref = " << v);
+
+    // recon
+    gtPlusISMRMRDReconUtil<GT_Complex8> util; // NOTE(review): not used below
+    gtPlusISMRMRDReconUtilComplex<GT_Complex8> utilCplx;
+
+    // sum of square
+    hoNDArray<GT_Complex8> complexIm, sosIm;
+
+    GadgetronTimer timer(false);
+    timer.start("ifft2c");
+    hoNDFFT<float>::instance()->ifft2c(kspace, complexIm);
+    timer.stop();
+
+    timer.start("sumOfSquare");
+    utilCplx.sumOfSquare(complexIm, sosIm);
+    timer.stop();
+
+    hoNDArray<float> magSoS;
+    timer.start("absolute");
+    Gadgetron::absolute(sosIm, magSoS);
+    timer.stop();
+
+    filename = this->gtPluse_ut_res_folder_ + "SoS";
+    gt_io.exportArray(magSoS, filename);
+
+    // coil map estimation (complexIm is overwritten with the inverse FFT of ref)
+    hoNDFFT<float>::instance()->ifft2c(ref, complexIm);
+
+    filename = this->gtPluse_ut_res_folder_ + "complexIm";
+    gt_io.export3DArrayComplex(complexIm, filename);
+
+    hoNDArray<GT_Complex8> coilMap;
+    timer.start("coilMap2DNIH");
+    utilCplx.coilMap2DNIH(complexIm, coilMap, ISMRMRD_SOUHEIL, 7, 3, 3, true);
+    timer.stop();
+
+    filename = this->gtPluse_ut_res_folder_ + "coilMap";
+    gt_io.export3DArrayComplex(coilMap, filename);
+
+    // grappa kernel estimation
+    gtPlusReconWorker2DTGRAPPA<T> grappa;
+
+    unsigned long long kRO = 5;
+    unsigned long long kNE1 = 4;
+    unsigned long long srcCHA = CHA;
+    unsigned long long dstCHA = CHA;
+
+    double grappa_reg_lamda_ = 1e-4;
+    // NOTE(review): both ACS arrays use ref's buffer with the k-space RO/E1/CHA sizes; confirm ref holds at least RO*E1*CHA elements.
+    ho3DArray<T> acsSrc(RO, E1, CHA, const_cast<T*>(ref.begin()));
+    ho3DArray<T> acsDst(RO, E1, CHA, const_cast<T*>(ref.begin()));
+
+    Gadgetron::norm2(acsSrc, v);
+    GADGET_MSG("acsSrc = " << v);
+
+    std::vector<int> kE1, oE1;
+    int accelFactor = 4;
+    bool fitItself = true;
+
+    grappa.grappa_.kerPattern(kE1, oE1, accelFactor, kNE1, fitItself);
+
+    ho5DArray<T> ker(kRO, kNE1, srcCHA, dstCHA, oE1.size());
+    timer.start("grappa.calib");
+    grappa.grappa_.calib(acsSrc, acsDst, grappa_reg_lamda_, kRO, kE1, oE1, ker);
+    timer.stop();
+
+    Gadgetron::norm2(ker, v);
+    GADGET_MSG("ker = " << v);
+    gt_io.exportArrayComplex(ker, this->gtPluse_ut_res_folder_ + "ker");
+
+    ho4DArray<T> kIm(RO, E1, srcCHA, dstCHA);
+    timer.start("grappa.imageDomainKernel");
+    grappa.grappa_.imageDomainKernel(ker, kRO, kE1, oE1, RO, E1, kIm);
+    timer.stop();
+    gt_io.exportArrayComplex(kIm, this->gtPluse_ut_res_folder_ + "kIm");
+
+    Gadgetron::norm2(kIm, v);
+    GADGET_MSG("kIm = " << v);
+
+    ho3DArray<T> unmixC(RO, E1, srcCHA);
+    ho2DArray<T> gFactor(RO, E1);
+
+    ho3DArray<T> coilMap2(RO, E1, dstCHA, coilMap.begin());
+
+    Gadgetron::norm2(coilMap2, v);
+    GADGET_MSG("coilMap2 = " << v);
+
+    grappa.unmixCoeff(kIm, coilMap2, unmixC, gFactor);
+
+    Gadgetron::norm2(unmixC, v);
+    GADGET_MSG("unmixC = " << v);
+
+    gt_io.export3DArrayComplex(unmixC, this->gtPluse_ut_res_folder_ + "unmixC");
+    gt_io.export2DArrayComplex(gFactor, this->gtPluse_ut_res_folder_ + "gFactor");
+
+    // unwrapping: apply the image-domain kernel, then the unmixing coefficients, to kspace
+    hoNDArray<T> res;
+    grappa.applyImageDomainKernel(kspace, kIm, res);
+    gt_io.export3DArrayComplex(res, this->gtPluse_ut_res_folder_ + "grappa2D_res");
+
+    grappa.applyUnmixCoeff(kspace, unmixC, res);
+    gt_io.export2DArrayComplex(res, this->gtPluse_ut_res_folder_ + "res_unmixC");
+}
diff --git a/toolboxes/gtplus/ut/gtplus_ut.cpp b/toolboxes/gtplus/ut/gtplus_ut.cpp
new file mode 100644
index 0000000..f338482
--- /dev/null
+++ b/toolboxes/gtplus/ut/gtplus_ut.cpp
@@ -0,0 +1,16 @@
+/*
+ * gtplus_ut.cpp
+ *
+ *  Created on: Feb 28, 2013
+ *      Author: Dae
+ */
+
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv)
+{
+    //::testing::GTEST_FLAG(filter) = "*grappa*:*spirit*";
+    ::testing::InitGoogleTest(&argc, argv); // let gtest parse its own command-line flags first
+    const int exitCode = RUN_ALL_TESTS();
+    return exitCode;
+}
diff --git a/toolboxes/gtplus/ut/spirit_test.cpp b/toolboxes/gtplus/ut/spirit_test.cpp
new file mode 100644
index 0000000..554285f
--- /dev/null
+++ b/toolboxes/gtplus/ut/spirit_test.cpp
@@ -0,0 +1,425 @@
+
+#ifdef USE_OMP
+#include "omp.h"
+#endif // USE_OMP
+
+#include "Gadget.h"
+#include "ismrmrd.h"
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+
+#include "hoNDArray_utils.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+// #include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusMemoryManager.h"
+#include "hoNDArrayMemoryManaged.h"
+#include "gtPlusSPIRIT2DOperator.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRIT3DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace3DOperator.h"
+#include "gtPlusLSQRSolver.h"
+#include "gtPlusNCGSolver.h"
+#include "gtPlusWavelet2DOperator.h"
+#include "gtPlusWavelet3DOperator.h"
+#include "gtPlusWaveletNoNullSpace2DOperator.h"
+#include "gtPlusWaveletNoNullSpace3DOperator.h"
+#include "gtPlusDataFidelityOperator.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusISMRMRDReconWorker3DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker3DTNoAcceleration.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h"
+#include "gtPlusMemoryManager.h"
+
+#include "GadgetronTimer.h"
+
+#include <boost/thread/mutex.hpp>
+
+#ifdef max
+#undef max
+#endif // max
+
+using namespace Gadgetron;
+using namespace Gadgetron::gtPlus;
+using testing::Types;
+
+/// Typed gtest fixture for the GtPlus SPIRIT tests; SetUp() derives the data/result folders from GTPLUS_UNITTEST_DIRECTORY.
+template <typename T> class gtPlus_spirit_Test : public ::testing::Test
+{
+protected:
+    virtual void SetUp()
+    {
+        GADGET_MSG("=============================================================================================");
+        const char* ut_dir = ::getenv("GTPLUS_UNITTEST_DIRECTORY"); // NULL when the variable is unset
+        ASSERT_TRUE(ut_dir != NULL) << "GTPLUS_UNITTEST_DIRECTORY is not set"; // std::string(NULL) is undefined behaviour
+        gtPluse_ut_folder_ = std::string(ut_dir);
+        GADGET_MSG("=============================================================================================");
+        GADGET_MSG("Unit Test for GtPlus");
+        gtPluse_ut_data_folder_ = gtPluse_ut_folder_ + "/data/";
+        gtPluse_ut_res_folder_ = gtPluse_ut_folder_ + "/result/";
+        GADGET_MSG("gtPluse_ut_data_folder_ is " << gtPluse_ut_data_folder_);
+        GADGET_MSG("gtPluse_ut_res_folder_ is " << gtPluse_ut_res_folder_);
+        timer_.set_timing_in_destruction(false);
+
+#ifdef WIN32
+    #ifdef USE_OMP
+        /// pin each OpenMP thread to the logical processor whose index equals its thread id
+        #pragma omp parallel default(shared)
+        {
+            int tid = omp_get_thread_num();
+            DWORD_PTR mask = (static_cast<DWORD_PTR>(1) << tid); // widen before shifting: an int shift overflows for tid >= 31
+            SetThreadAffinityMask( GetCurrentThread(), mask );
+        }
+    #endif // USE_OMP
+#endif // WIN32
+    }
+
+    std::string gtPluse_ut_folder_;       // value of GTPLUS_UNITTEST_DIRECTORY
+    std::string gtPluse_ut_data_folder_;  // gtPluse_ut_folder_ + "/data/"
+    std::string gtPluse_ut_res_folder_;   // gtPluse_ut_folder_ + "/result/"
+    gtPlusIOAnalyze gt_io_;
+    gtPlusISMRMRDReconUtil<T> util_;
+    gtPlusISMRMRDReconUtilComplex<T> utilCplx_;
+    GadgetronTimer timer_;
+};
+
+typedef Types<float, double> realImplementations;
+// Type list containing only std::complex<float>; it is the one passed to TYPED_TEST_CASE below.
+typedef Types< std::complex<float> > cpfloatImplementations;
+// Type lists for both complex families together, and for each family alone.
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+typedef Types<std::complex<float>, std::complex<double> > stdCplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+// Instantiate the gtPlus_spirit_Test typed tests with the types in cpfloatImplementations.
+TYPED_TEST_CASE(gtPlus_spirit_Test, cpfloatImplementations);
+
+// End-to-end run of the 2DT SPIRIT worker through the Cartesian 2DT workflow: loads undersampled k-space
+// and reference data from the unit-test data folder, runs the workflow and exports the result.
+// The test makes no assertions; its outputs are written to the result folder.
+TYPED_TEST(gtPlus_spirit_Test, reconWorker2DTSPIRIT)
+{
+    typedef GT_Complex8 T;
+
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data
+    hoNDArray<float> real_data;
+    std::string filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_real";
+    gt_io.importArray(real_data, filename);
+    real_data.print(std::cout);
+
+    hoNDArray<float> imag_data;
+    filename = this->gtPluse_ut_data_folder_ + "underSampledKSpace_imag";
+    gt_io.importArray(imag_data, filename);
+    imag_data.print(std::cout);
+
+    boost::shared_ptr< hoNDArray<GT_Complex8> > tmp = real_imag_to_complex<GT_Complex8>(&real_data, &imag_data);
+
+    unsigned long long RO = tmp->get_size(0);
+    unsigned long long E1 = tmp->get_size(1);
+    unsigned long long CHA = tmp->get_size(2);
+    unsigned long long PHS = tmp->get_size(3);
+
+    unsigned long long reconE1 = 120;
+
+    // [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]; only the first seven dimensions are passed to kspace
+    unsigned long long SLC = 1;
+    unsigned long long E2 = 1;
+    unsigned long long CON = 1;
+
+    hoNDArray<GT_Complex8> kspace(RO, E1, CHA, SLC, E2, CON, PHS, tmp->begin());
+
+    Gadgetron::norm2(kspace, v);
+    GADGET_MSG("kspace = " << v);
+
+    // ref
+    hoNDArray<float> real_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_real";
+    gt_io.importArray(real_ref, filename);
+    real_ref.print(std::cout);
+
+    hoNDArray<float> imag_ref;
+    filename = this->gtPluse_ut_data_folder_ + "ref_imag";
+    gt_io.importArray(imag_ref, filename);
+    imag_ref.print(std::cout);
+
+    hoNDArray<T> ref;
+    real_imag_to_complex<GT_Complex8>(real_ref, imag_ref, ref);
+
+    Gadgetron::norm2(ref, v);
+    GADGET_MSG("ref = " << v);
+
+    // call the recon
+    typedef std::complex<float> ValueType;
+    typedef Gadgetron::gtPlus::gtPlusReconWorkOrder2DT<ValueType> WorkOrderType;
+
+    // NOTE(review): workOrder is allocated with new and never deleted in this test; confirm whether
+    // the workflow takes ownership of workOrder_ before adding a delete here.
+    WorkOrderType* workOrder = new WorkOrderType;
+
+    workOrder->data_ = kspace;
+    workOrder->ref_ = ref;
+
+    boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+    GADGET_MSG("[Ro E1 Cha Slice E2 Con Phase Rep Set Seg] = ["
+        << (*dims)[0] << " " << (*dims)[1] << " " << (*dims)[2] << " " << (*dims)[3] << " " << (*dims)[4]
+        << " " << (*dims)[5] << " " << (*dims)[6] << " " << 1 << " " << 1 << " " << 1 << "]");
+
+    // work flow
+    Gadgetron::gtPlus::gtPlusISMRMRDReconWorkFlowCartesian2DT<ValueType> workflow_;
+
+    // worker
+    Gadgetron::gtPlus::gtPlusReconWorker2DTSPIRIT<ValueType> worker_spirit_;
+
+    // parameters
+    Gadgetron::gtPlus::ISMRMRDDIM dim_4th_ = DIM_Phase;
+    Gadgetron::gtPlus::ISMRMRDDIM dim_5th_ = DIM_Slice;
+
+    bool interleaved_same_combinationcoeff_allS_ = false;
+    int interleaved_whichS_combinationcoeff_ = 0;
+
+    bool embedded_averageall_ref_ = false;
+    bool embedded_fullres_coilmap_ = true;
+    bool embedded_same_combinationcoeff_allS_ = false;
+    int embedded_whichS_combinationcoeff_ = 0;
+    bool embedded_ref_fillback_ = true;
+
+    bool separate_averageall_ref_ = false;
+    bool separate_fullres_coilmap_ = true;
+    bool separate_same_combinationcoeff_allS_ = false;
+    int separate_whichS_combinationcoeff_ = 0;
+
+    bool same_coil_compression_coeff_allS_ = true;
+    bool downstream_coil_compression_ = true;
+    double coil_compression_thres_ = 1e-3;
+    int coil_compression_num_modesKept_ = -1;
+
+    unsigned long long csm_kSize_ = 7;
+    unsigned long long csm_powermethod_num_ = 3;
+
+    Gadgetron::gtPlus::ISMRMRDALGO recon_algorithm_ = ISMRMRD_SPIRIT;
+    bool recon_kspace_needed_ = true;
+
+    unsigned long long spirit_kSize_RO_ = 5;
+    unsigned long long spirit_kSize_E1_ = 5;
+    unsigned long long spirit_kSize_E2_ = 5;
+
+    double spirit_reg_lamda_ = 0.005;
+    unsigned long long spirit_iter_max_ = 100;
+    double spirit_iter_thres_ = 1e-5;
+
+    // recon
+    workflow_.setDataArray(kspace);
+    workflow_.setRefArray(ref);
+
+    Gadgetron::norm2(workOrder->data_, v); GADGET_MSG("workOrder->data_ = " << v);
+    Gadgetron::norm2(workOrder->ref_, v); GADGET_MSG("workOrder->ref_ = " << v);
+
+    workflow_.reconSizeRO_ = RO;
+    workflow_.reconSizeE1_ = reconE1;
+    workflow_.reconSizeE2_ = 1;
+    // workflow_.dataDimStartingIndexes_ = workOrder->dataDimStartingIndexes_;
+    workflow_.dim4th_ = dim_4th_;
+    workflow_.dim5th_ = dim_5th_;
+
+    workOrder->CalibMode_ = ISMRMRD_separate;
+    workOrder->start_RO_ = 34;
+    workOrder->end_RO_ = RO-1;
+    workOrder->acceFactorE1_ = 4;
+    workOrder->acceFactorE2_ = 1;
+
+    workOrder->downstream_coil_compression_ = downstream_coil_compression_;
+    workOrder->coil_compression_thres_ = coil_compression_thres_;
+    workOrder->coil_compression_num_modesKept_ = coil_compression_num_modesKept_;
+    workOrder->csm_kSize_ = csm_kSize_;
+    workOrder->csm_powermethod_num_ = csm_powermethod_num_;
+
+    workOrder->recon_algorithm_ = recon_algorithm_;
+
+    workOrder->spirit_kSize_RO_ = spirit_kSize_RO_;
+    workOrder->spirit_kSize_E1_ = spirit_kSize_E1_;
+    workOrder->spirit_kSize_E2_ = spirit_kSize_E2_;
+    workOrder->spirit_reg_lamda_ = spirit_reg_lamda_;
+    workOrder->spirit_iter_max_ = spirit_iter_max_;
+    workOrder->spirit_iter_thres_ = spirit_iter_thres_;
+
+    workOrder->recon_kspace_needed_ = recon_kspace_needed_;
+
+    // coil compression is switched on when either a threshold or a kept-mode count is positive
+    if ( coil_compression_thres_>0 || coil_compression_num_modesKept_>0 )
+    {
+        workOrder->coil_compression_ = true;
+    }
+    else
+    {
+        workOrder->coil_compression_ = false;
+    }
+
+    workOrder->same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+    workOrder->embedded_averageall_ref_ = embedded_averageall_ref_;
+    workOrder->embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    workOrder->embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    workOrder->embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    workOrder->embedded_ref_fillback_ = embedded_ref_fillback_;
+    workOrder->separate_averageall_ref_ = separate_averageall_ref_;
+    workOrder->separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    workOrder->separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    workOrder->separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+    workOrder->interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_;
+    workOrder->interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+
+    worker_spirit_.performTiming_ = true;
+    worker_spirit_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    workflow_.debugFolder_ = this->gtPluse_ut_res_folder_;
+    workflow_.worker_ = &worker_spirit_;
+    workflow_.workOrder_ = workOrder;
+
+    gt_io.exportArrayComplex(workflow_.workOrder_->ref_, this->gtPluse_ut_res_folder_+"ref");
+
+    boost::shared_ptr<Gadgetron::gtPlus::gtPlusMemoryManager> mem_manager_(new Gadgetron::gtPlus::gtPlusMemoryManager(4, 640*1024*1024));
+    worker_spirit_.gtPlus_mem_manager_ = mem_manager_;
+
+    // run the three workflow stages in order, then export the reconstruction
+    workflow_.preProcessing();
+    workflow_.recon();
+    workflow_.postProcessing();
+
+    gt_io.exportArrayComplex(workflow_.res_, this->gtPluse_ut_res_folder_+"spirit2D_gtPlus_res");
+}
+
+// Runs gtPlusReconWorker3DTL1SPIRITNCG directly on a stored 2DT job (k-space and kernel loaded from the
+// unit-test data folder) and exports the reconstruction; no assertions are made on the result.
+TYPED_TEST(gtPlus_spirit_Test, testNCGSolver2DTSPIRIT_neuro_3by3)
+{
+    gtPlusIOAnalyze gt_io;
+
+    // image data
+    hoNDArray<float> real_data;
+    std::string filename = this->gtPluse_ut_data_folder_ + "Job2DT_kspace_ID6_REAL";
+    gt_io.importArray(real_data, filename);
+    real_data.print(std::cout);
+
+    hoNDArray<float> imag_data;
+    filename = this->gtPluse_ut_data_folder_ + "Job2DT_kspace_ID6_IMAG";
+    gt_io.importArray(imag_data, filename);
+    imag_data.print(std::cout);
+
+    boost::shared_ptr< hoNDArray<GT_Complex8> > kspace = real_imag_to_complex<GT_Complex8>(&real_data, &imag_data);
+
+    hoNDArray<float> real_ker;
+    filename = this->gtPluse_ut_data_folder_ + "Job2DT_ker_ID6_REAL";
+    gt_io.importArray(real_ker, filename);
+    real_ker.print(std::cout);
+
+    hoNDArray<float> imag_ker;
+    filename = this->gtPluse_ut_data_folder_ + "Job2DT_ker_ID6_IMAG";
+    gt_io.importArray(imag_ker, filename);
+    imag_ker.print(std::cout);
+
+    boost::shared_ptr< hoNDArray<GT_Complex8> > ker = real_imag_to_complex<GT_Complex8>(&real_ker, &imag_ker);
+
+    hoNDArray<float> real_kspaceLinear;
+    filename = this->gtPluse_ut_data_folder_ + "Job2DT_kspaceLinear_ID6_REAL";
+    gt_io.importArray(real_kspaceLinear, filename);
+    real_kspaceLinear.print(std::cout);
+
+    hoNDArray<float> imag_kspaceLinear;
+    filename = this->gtPluse_ut_data_folder_ + "Job2DT_kspaceLinear_ID6_IMAG";
+    gt_io.importArray(imag_kspaceLinear, filename);
+    imag_kspaceLinear.print(std::cout);
+
+    boost::shared_ptr< hoNDArray<GT_Complex8> > kspaceLinear = real_imag_to_complex<GT_Complex8>(&real_kspaceLinear, &imag_kspaceLinear); // only consumed by the commented-out call below
+
+    Gadgetron::gtPlus::gtPlusReconWorker3DTL1SPIRITNCG<GT_Complex8> worker_spirit_L1_ncg_;
+    worker_spirit_L1_ncg_.performTiming_ = true;
+    worker_spirit_L1_ncg_.debugFolder_ = this->gtPluse_ut_res_folder_;
+
+    Gadgetron::gtPlus::gtPlusReconJob2DT< std::complex<float> > job;
+
+    job.kspace = *kspace;
+    job.ker = *ker;
+
+    // hand-filled work order: embedded calibration, 3x3 acceleration, L1-SPIRIT with linear and nonlinear steps enabled
+    job.workOrder2DT.CalibMode_ = ISMRMRD_embedded;
+    job.workOrder2DT.InterleaveDim_ = DIM_Phase;
+
+    job.workOrder2DT.acceFactorE1_ = 3;
+    job.workOrder2DT.acceFactorE2_ = 3;
+
+    job.workOrder2DT.kSpaceCenterRO_ = 128;
+    job.workOrder2DT.kSpaceCenterEncode1_ = 127;
+    job.workOrder2DT.kSpaceCenterEncode2_ = 96;
+
+    job.workOrder2DT.kSpaceMaxRO_ = 256;
+    job.workOrder2DT.kSpaceMaxEncode1_ = 255;
+    job.workOrder2DT.kSpaceMaxEncode2_ = 191;
+
+    job.workOrder2DT.recon_algorithm_ = ISMRMRD_L1SPIRIT;
+    job.workOrder2DT.recon_auto_parameters_ = false;
+
+    job.workOrder2DT.spirit_kSize_RO_ = 7;
+    job.workOrder2DT.spirit_kSize_E1_ = 7;
+    job.workOrder2DT.spirit_kSize_E2_ = 5;
+
+    job.workOrder2DT.spirit_reg_lamda_ = 0.01;
+    job.workOrder2DT.spirit_calib_over_determine_ratio_ = 15;
+
+    job.workOrder2DT.spirit_solve_symmetric_ = false;
+
+    job.workOrder2DT.spirit_iter_max_ = 100;
+    job.workOrder2DT.spirit_iter_thres_ = 0.005;
+    job.workOrder2DT.spirit_print_iter_ = true;
+
+    job.workOrder2DT.spirit_perform_linear_ = true;
+    job.workOrder2DT.spirit_perform_nonlinear_ = true;
+
+    job.workOrder2DT.spirit_parallel_imaging_lamda_ = 1;
+    job.workOrder2DT.spirit_image_reg_lamda_ = 0.0025;
+    job.workOrder2DT.spirit_data_fidelity_lamda_ = 0;
+
+    job.workOrder2DT.spirit_ncg_iter_max_ = 10;
+    job.workOrder2DT.spirit_ncg_iter_thres_ = 0.001;
+    job.workOrder2DT.spirit_ncg_print_iter_ = true;
+    job.workOrder2DT.spirit_ncg_scale_factor_ = 1;
+
+    job.workOrder2DT.spirit_use_coil_sen_map_ = false;
+    job.workOrder2DT.spirit_use_moco_enhancement_ = false;
+    job.workOrder2DT.spirit_recon_moco_images_ = false;
+
+    job.workOrder2DT.spirit_temporal_enhancement_ratio_ = 5;
+    job.workOrder2DT.spirit_2D_scale_per_chunk_ = false;
+
+    job.workOrder2DT.spirit_E2_enhancement_ratio_ = 1.0;
+    job.workOrder2DT.spirit_3D_scale_per_chunk_ = false;
+
+    GADGET_START_TIMING_CONDITION(this->timer_, "Recon 2DT job ... ", true);
+
+    job.res = job.kspace;
+
+    // NOTE(review): coilMap_ is dereferenced below but never assigned in this test; confirm that the
+    // work order initialises it to a valid array, otherwise this dereference is invalid.
+    worker_spirit_L1_ncg_.performUnwarppingImplROPermuted(&(job.workOrder2DT), job.kspace, job.ker, *job.workOrder2DT.coilMap_, job.res);
+    // worker_spirit_L1_ncg_.performUnwarppingImplROPermuted(&(job.workOrder2DT), job.kspace, job.ker, *job.workOrder2DT.coilMap_, *kspaceLinear, job.res);
+
+    // bool succeed = worker_spirit_L1_ncg_.performUnwarppingImpl(job);
+
+    GADGET_STOP_TIMING_CONDITION(this->timer_, true);
+
+    gt_io.exportArrayComplex(job.res, this->gtPluse_ut_res_folder_+"NCGSolver2DTSPIRIT_neuro_3by3_res");
+}
diff --git a/toolboxes/gtplus/ut/util_test.cpp b/toolboxes/gtplus/ut/util_test.cpp
new file mode 100644
index 0000000..c36ed82
--- /dev/null
+++ b/toolboxes/gtplus/ut/util_test.cpp
@@ -0,0 +1,1195 @@
+
+#ifdef USE_OMP
+#include "omp.h"
+#endif // USE_OMP
+
+#include "Gadget.h"
+#include "Gadgetron.h"
+#include "ismrmrd.h"
+#include "hoNDArray_elemwise.h"
+#include "complext.h"
+
+#include <gtest/gtest.h>
+
+#include "hoNDArray_utils.h"
+
+#include "gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+// #include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusISMRMRDReconWorker2DTGRAPPA.h"
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian2DT.h"
+#include "gtPlusISMRMRDReconWorkFlowCartesian3DT.h"
+#include "gtPlusMemoryManager.h"
+#include "hoNDArrayMemoryManaged.h"
+#include "gtPlusSPIRIT2DOperator.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRIT3DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace3DOperator.h"
+#include "gtPlusNCGSolver.h"
+
+#include "GadgetronTimer.h"
+
+#include <boost/thread/mutex.hpp>
+
+#ifdef max
+#undef max
+#endif // max
+
+using namespace Gadgetron;
+using namespace Gadgetron::gtPlus;
+using testing::Types;
+
+/// Typed gtest fixture for the GtPlus utility tests; SetUp() derives the data/result folders from GTPLUS_UNITTEST_DIRECTORY.
+template <typename T> class gtPlus_IO_Test : public ::testing::Test
+{
+protected:
+    virtual void SetUp()
+    {
+        GADGET_MSG("=============================================================================================");
+        const char* ut_dir = ::getenv("GTPLUS_UNITTEST_DIRECTORY"); // NULL when the variable is unset
+        ASSERT_TRUE(ut_dir != NULL) << "GTPLUS_UNITTEST_DIRECTORY is not set"; // std::string(NULL) is undefined behaviour
+        gtPluse_ut_folder_ = std::string(ut_dir);
+        GADGET_MSG("=============================================================================================");
+        GADGET_MSG("Unit Test for GtPlus");
+        gtPluse_ut_data_folder_ = gtPluse_ut_folder_ + "/data/";
+        gtPluse_ut_res_folder_ = gtPluse_ut_folder_ + "/result/";
+        GADGET_MSG("gtPluse_ut_data_folder_ is " << gtPluse_ut_data_folder_);
+        GADGET_MSG("gtPluse_ut_res_folder_ is " << gtPluse_ut_res_folder_);
+        timer_.set_timing_in_destruction(false);
+
+#ifdef WIN32
+    #ifdef USE_OMP
+        /// pin each OpenMP thread to the logical processor whose index equals its thread id
+        #pragma omp parallel default(shared)
+        {
+            int tid = omp_get_thread_num();
+            DWORD_PTR mask = (static_cast<DWORD_PTR>(1) << tid); // widen before shifting: an int shift overflows for tid >= 31
+            SetThreadAffinityMask( GetCurrentThread(), mask );
+        }
+    #endif // USE_OMP
+#endif // WIN32
+    }
+
+    std::string gtPluse_ut_folder_;       // value of GTPLUS_UNITTEST_DIRECTORY
+    std::string gtPluse_ut_data_folder_;  // gtPluse_ut_folder_ + "/data/"
+    std::string gtPluse_ut_res_folder_;   // gtPluse_ut_folder_ + "/result/"
+    gtPlusIOAnalyze gt_io_;
+    gtPlusISMRMRDReconUtil<T> util_;
+    gtPlusISMRMRDReconUtilComplex<T> utilCplx_;
+    GadgetronTimer timer_;
+};
+
+typedef Types<float, double> realImplementations;
+// Type list containing only std::complex<float>; it is the one passed to TYPED_TEST_CASE below.
+typedef Types< std::complex<float> > cpfloatImplementations;
+// Type lists for both complex families together, and for each family alone.
+typedef Types<std::complex<float>, std::complex<double>, float_complext, double_complext> cplxImplementations;
+typedef Types<std::complex<float>, std::complex<double> > stdCplxImplementations;
+typedef Types<float_complext, double_complext> cplxtImplementations;
+// Instantiate the gtPlus_IO_Test typed tests with the types in cpfloatImplementations.
+TYPED_TEST_CASE(gtPlus_IO_Test, cpfloatImplementations);
+
+TYPED_TEST(gtPlus_IO_Test, recon2DCoilMapGPU)
+{
+    typedef GT_Complex8 T;
+    // Purpose: compute 2D coil maps with coilMap2DNIHGPU, estimate_b1_map and coilMap2DNIH on the same data, export each, then coil-combine.
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data
+    hoNDArray<GT_Complex8> data;
+    // gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "fullkspace__REAL", this->gtPluse_ut_data_folder_ + "fullkspace__IMAG");
+    gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "aveComplexIm_REAL", this->gtPluse_ut_data_folder_ + "aveComplexIm_IMAG");
+    data.print(std::cout);
+
+    data.squeeze();
+
+    GadgetronTimer timer(false);
+
+    unsigned int RO = data.get_size(0);
+    unsigned int E1 = data.get_size(1);
+    unsigned int CHA = data.get_size(2);
+    unsigned int N = data.get_size(3); // not used below
+
+    Gadgetron::norm2(data, v);
+    GADGET_MSG("data = " << v);
+    // NOTE(review): this GPUTimer goes out of scope immediately; presumably a GPU warm-up before the timed steps — confirm.
+    {
+        GPUTimer t("all steps");
+    }
+    // data2D is built from data.begin() with dimensions RO x E1 x CHA (presumably the first 2D set of data — confirm).
+    hoNDArray<GT_Complex8> data2D(RO, E1, CHA, data.begin());
+
+    hoNDArray<T> CoilMap2D;
+    timer.start("coilMap2DNIHGPU 2D");
+    gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIHGPU(data2D, CoilMap2D, ISMRMRD_SOUHEIL, 7, 3, 3, 1e-3);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, CoilMap2D, "CoilMap2D_1");
+
+    {
+    // call the old coil map code
+    timer.start("coilMap2DNIHGPU 2D old");
+    hoNDArray<float_complext> host_data(RO, E1, CHA, reinterpret_cast<float_complext*>(data2D.begin()));
+    cuNDArray<float_complext> device_data(host_data);
+    boost::shared_ptr< cuNDArray<float_complext> > csm = Gadgetron::estimate_b1_map<float, 2>( &device_data, CHA);
+    boost::shared_ptr< hoNDArray<float_complext> > csm_host = csm->to_host();
+    memcpy(CoilMap2D.begin(), csm_host->begin(), csm_host->get_number_of_bytes()); // NOTE(review): assumes CoilMap2D holds at least as many bytes as csm_host — confirm
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, CoilMap2D, "CoilMap2D_1_old");
+    }
+    // Same GPU routine, this time applied to the whole of data rather than data2D.
+    hoNDArray<T> CoilMap;
+    timer.start("coilMap2DNIHGPU");
+    gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIHGPU(data, CoilMap, ISMRMRD_SOUHEIL, 7, 3, 3, 1e-3);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, CoilMap, "CoilMap2D");
+    // coilMap2DNIH (no GPU suffix) on data2D, exported separately as "CoilMap2D_2".
+    hoNDArray<T> CoilMap2;
+    timer.start("coilMap2DNIH");
+    gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(data2D, CoilMap2, ISMRMRD_SOUHEIL, 7, 3, 3, 1e-3, false);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, CoilMap2, "CoilMap2D_2");
+    // Coil combination of data using CoilMap (the map computed from the whole of data).
+    hoNDArray<T> combined;
+    timer.start("coil combine");
+    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(data, CoilMap, combined);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, combined, "combined2D");
+
+    cudaDeviceReset();
+}
+
+TYPED_TEST(gtPlus_IO_Test, recon3DCoilMapGPU)
+{
+    typedef GT_Complex8 T;
+
+    gtPlusIOAnalyze gt_io;
+
+    float v;
+
+    // image data
+    hoNDArray<GT_Complex8> data;
+    gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "fullkspace__REAL", this->gtPluse_ut_data_folder_ + "fullkspace__IMAG");
+    data.print(std::cout);
+
+    data.squeeze();
+
+    GadgetronTimer timer(false);
+
+    unsigned int RO = data.get_size(0);
+    unsigned int E1 = data.get_size(1);
+    unsigned int E2 = data.get_size(2);
+    unsigned int CHA = data.get_size(3);
+
+    Gadgetron::norm2(data, v);
+    GADGET_MSG("data = " << v);
+
+    {
+        GPUTimer t("all steps");
+    }
+
+    hoNDArray<GT_Complex8> Im2;
+    timer.start("ifft3c");
+    hoNDFFT<float>::instance()->ifft3c(data, Im2);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, Im2, "Im2");
+
+    hoNDArray<T> CoilMap;
+    timer.start("coilMap3DNIHGPU");
+    gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(Im2, CoilMap, ISMRMRD_SOUHEIL, 7, 3, true);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, CoilMap, "CoilMap");
+
+    omp_set_nested(1);
+
+    hoNDArray<T> CoilMap2;
+    timer.start("coilMap3DNIH");
+    gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(Im2, CoilMap2, ISMRMRD_SOUHEIL, 7, 3, true);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, CoilMap2, "CoilMap2");
+
+    hoNDArray<T> combined;
+    timer.start("coil combine");
+    gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(Im2, CoilMap, combined);
+    timer.stop();
+    GADGET_EXPORT_ARRAY_COMPLEX(this->gtPluse_ut_res_folder_, gt_io, combined, "combined");
+
+    cudaDeviceReset();
+}
+
+//TYPED_TEST(gtPlus_IO_Test, reconCoilCompression)
+//{
+//    typedef float T;
+//    typedef std::complex<T> TValueType;
+//
+//    gtPlusIOAnalyze gt_io;
+//    GadgetronTimer timer(false);
+//
+//    gtPlusISMRMRDReconUtil<TValueType> util;
+//    gtPlusISMRMRDReconUtilComplex<TValueType> utilCplx;
+//    std::string filename;
+//
+//    hoNDArray<GT_Complex8> data;
+//    gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "refRecon_REAL", this->gtPluse_ut_data_folder_ + "refRecon_IMAG");
+//    data.print(std::cout);
+//
+//    // export images
+//    hoNDArray<GT_Complex8> complexIm;
+//    Gadgetron::hoNDFFT<T>::instance()->ifft2c(data, complexIm);
+//
+//    hoNDArray<TValueType> sos;
+//    utilCplx.sumOfSquare(complexIm, sos);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "refRecon_SoS";
+//    gt_io.exportArrayComplex(sos, filename);
+//
+//    hoMatrix<GT_Complex8> coeff, eigenValues;
+//    utilCplx.computeKLCoilCompressionCoeff(data, 1e-3, coeff, eigenValues);
+//    eigenValues.print(std::cout);
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, MatrixComputation)
+//{
+//    MKL_INT n = 4, nrhs = 2, ldb = 2;
+//
+//    /* Local arrays */
+//    //MKL_Complex8 a[16] = 
+//    //{
+//    //    { 5.96f,  0.00f}, { 0.40f,  -1.19f}, { -0.83f, -0.48f}, { -0.57f, 0.40f},
+//    //    { 0.40f,  1.19f}, { 7.95f,  0.00f}, { 0.33f,  0.09f}, { 0.22f, 0.74f},
+//    //    {-0.83f,  0.48f}, { 0.33f, -0.09f}, { 4.43f,  0.00f}, { -1.09f, 0.32f},
+//    //    {-0.57f, -0.40f}, { 0.22f, -0.74f}, {-1.09f, -0.32f}, { 3.46f,  0.00f}
+//    //};
+//
+//    //MKL_Complex8 b[8] = 
+//    //{
+//    //    {-2.94f,  5.79f}, { 8.44f,  3.07f},
+//    //    { 8.12f, -9.12f}, { 1.00f, -4.62f},
+//    //    { 9.09f, -5.03f}, { 3.64f, -2.33f},
+//    //    { 7.36f,  6.77f}, { 8.04f,  2.87f}
+//    //};
+//
+//    MKL_Complex8 a[16] = 
+//    {
+//        { 5.96f,  0.00f},   { 0.40f,  1.19f},   { -0.83f, 0.48f},   { -0.57f, -0.40f},
+//        { 0.40f,  -1.19f},  { 7.95f,  0.00f},   { 0.33f,  -0.09f},  { 0.22f, -0.74f},
+//        {-0.83f,  -0.48f},  { 0.33f, 0.09f},    { 4.43f,  0.00f},   { -1.09f, -0.32f},
+//        {-0.57f,  0.40f},   { 0.22f, 0.74f},    {-1.09f, 0.32f},    { 3.46f,  0.00f}
+//    };
+//
+//    MKL_Complex8 b[8] = 
+//    {
+//        {-2.94f,  5.79f}, { 8.12f, -9.12f}, { 9.09f, -5.03f}, { 7.36f,  6.77f}, 
+//        { 8.44f,  3.07f}, { 1.00f, -4.62f}, { 3.64f, -2.33f}, { 8.04f,  2.87f}
+//    };
+//
+//    hoMatrix< std::complex<float> > A(n, n, reinterpret_cast<std::complex<float>*>(a));
+//    hoMatrix< std::complex<float> > B(n, ldb, reinterpret_cast<std::complex<float>*>(b));
+//
+//    hoMatrix< std::complex<float> > AB;
+//    GeneralMatrixProduct_gemm(AB, A, false, B, false);
+//    AB.print(std::cout);
+//
+//    GeneralMatrixProduct_gemm(AB, A, true, B, false);
+//    AB.print(std::cout);
+//
+//    //A*B
+//    //ans =
+//    //                   -41.9895 +               20.0944i                    35.3342 +                17.026i
+//    //                    56.5497 -               67.5926i                     8.7286 -               19.3177i
+//    //                    31.5997 -               37.2643i                    -2.1214 -               10.9889i
+//    //                    12.9773 +               15.8586i                    16.3236 +                4.4228i
+//
+//    hoMatrix< std::complex<float> > A2(A);
+//    hoMatrix< std::complex<float> > B2(B);
+//
+//    SymmetricHermitianPositiveDefiniteLinearSystem_posv(A2, B2);
+//
+//    A2.print(std::cout);
+//    B2.print(std::cout);
+//
+//    //    Solution
+//    //    (  0.80,  1.62) (  2.52,  0.61)
+//    //    (  1.26, -1.78) (  0.01, -1.38)
+//    //    (  3.38, -0.29) (  2.42, -0.52)
+//    //    (  3.46,  2.92) (  3.77,  1.37)
+//
+//    //    Details of Cholesky factorization
+//    //    (  2.44,  0.00) (  0.00,  0.00) (  0.00,  0.00) (  0.00,  0.00)
+//    //    (  0.16,  0.49) (  2.77,  0.00) (  0.00,  0.00) (  0.00,  0.00)
+//    //    ( -0.34,  0.20) (  0.10, -0.10) (  2.06,  0.00) (  0.00,  0.00)
+//    //    ( -0.23, -0.16) (  0.12, -0.30) ( -0.57, -0.20) (  1.71,  0.00)
+//
+//    A2 = A;
+//    CholeskyHermitianPositiveDefinite_potrf(A2, 'L');
+//    A2.print(std::cout);
+//
+//    A2 = A;
+//    A2.print(std::cout);
+//
+//    hoMatrix< std::complex<float> > eigenValue;
+//    EigenAnalysis_syev_heev2(A2, eigenValue);
+//    A2.print(std::cout);
+//    eigenValue.print(std::cout);
+//
+//    hoMatrix< std::complex<float> > C;
+//    GeneralMatrixProduct_gemm(C, A2, false, A2, true);
+//    C.print(std::cout);
+//
+//    A2 = A;
+//    B2 = B;
+//    hoMatrix< std::complex<float> > x;
+//    double lamda = 1e-4;
+//    SolveLinearSystem_Tikhonov(A2, B2, x, lamda);
+//    x.print(std::cout);
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, memoryManager)
+//{
+//    typedef GT_Complex8 T;
+//
+//    unsigned int RO = 256;
+//    unsigned int E1 = 256;
+//    unsigned int E2 = 256;
+//    unsigned int CHA = 32;
+//
+//    size_t num = (size_t)RO*E1*E2*CHA*sizeof(T);
+//    std::cout << "Allocate " << num/1024/1024 << " MegaBytes ..." << std::endl;
+//
+//    GadgetronTimer timer(false);
+//
+//    timer.start("Allocate 2D array...");
+//    hoNDArray<T> a2D(RO, E1);
+//    timer.stop();
+//
+//    timer.start("Allocate 3D array...");
+//    hoNDArray<T> a3D(RO, E1, E2);
+//    timer.stop();
+//
+//    timer.start("Allocate 3D array...");
+//    T* p3D = new T[RO*E1*E2];
+//    timer.stop();
+//    memset(p3D, 0, sizeof(T)*RO*E1*E2);
+//    delete [] p3D;
+//
+//    timer.start("Allocate 3D array...");
+//    p3D = (T*)mkl_malloc(sizeof(T)*RO*E1*E2, 4);
+//    timer.stop();
+//    p3D[12] = T(2.3);
+//    memset(p3D, 0, sizeof(T)*RO*E1*E2);
+//
+//    timer.start("Allocate 4D array...");
+//    hoNDArray<T> a4D(RO, E1, E2, CHA);
+//    timer.stop();
+//
+//    timer.start("Allocate 4D array...");
+//    T* p4D = new T[RO*E1*E2*CHA];
+//    timer.stop();
+//    memset(p4D, 0, sizeof(T)*RO*E1*E2*CHA);
+//    delete [] p4D;
+//
+//    timer.start("Allocate 4D array...");
+//    p4D = (T*)mkl_malloc(sizeof(T)*RO*E1*E2*CHA, 4);
+//    timer.stop();
+//    p4D[12560] = T(2.3);
+//    timer.start("Allocate 4D array...");
+//    memset(p4D, 0, sizeof(T)*RO*E1*E2*CHA);
+//    timer.stop();
+//
+//    timer.start("Allocate ...");
+//    boost::shared_ptr<gtPlusMemoryManager> memMagnager(new gtPlusMemoryManager(4, num));
+//    timer.stop();
+//
+//    timer.start("Allocate 3 pieces ...");
+//    void* ptr = memMagnager->allocate(num/2);
+//    ptr = memMagnager->allocate(num/4);
+//    ptr = memMagnager->allocate(num/8);
+//    timer.stop();
+//
+//    memMagnager->printInfo(std::cout);
+//
+//    boost::shared_ptr<gtPlusMemoryManager> memMagnager2;
+//
+//    if ( memMagnager2 )
+//    {
+//        std::cout << "Test " << std::endl;
+//    }
+//
+//    if ( memMagnager )
+//    {
+//        std::cout << "Test " << std::endl;
+//    }
+//
+//    boost::mutex mutex_;
+//
+//    timer.start("mutex cost ...");
+//    mutex_.lock();
+//    mutex_.unlock();
+//    timer.stop();
+//
+//    std::cout << memMagnager.use_count() << std::endl;
+//
+//    timer.start("Allocate hoNDArrayMemoryManaged ...");
+//    mutex_.lock();
+//    Gadgetron::hoNDArrayMemoryManaged<T> a(256, 256, 128, memMagnager);
+//    mutex_.unlock();
+//    timer.stop();
+//
+//    std::cout << memMagnager.use_count() << std::endl;
+//
+//    memMagnager->printInfo(std::cout);
+//
+//    a.clear();
+//
+//    memMagnager->printInfo(std::cout);
+//
+//    int ii;
+//    #pragma omp parallel
+//    {
+//        Gadgetron::hoNDArrayMemoryManaged<T> b(256, 256, memMagnager);
+//    }
+//
+//    //timer.start("Allocate hoNDArrayMemoryManaged 2...");
+//    //Gadgetron::hoNDArrayMemoryManaged<T> b(256, 256, 128, *memMagnager);
+//    //timer.stop();
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, kspaceFilter)
+//{
+//    typedef GT_Complex8 T;
+//
+//    gtPlusIOAnalyze gt_io;
+//
+//    gtPlusISMRMRDReconUtil<T> util;
+//
+//    hoNDArray<T> filter;
+//
+//    unsigned int len = 12;
+//    double sigma = 1.5;
+//    unsigned int width = len*0.15;
+//
+//    ISMRMRDKSPACEFILTER filterType = ISMRMRD_FILTER_NONE;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_GAUSSIAN;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_HANNING;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TUKEY;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    len = 13;
+//
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_GAUSSIAN;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_HANNING;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TUKEY;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateSymmetricFilter(len, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    len = 13;
+//    unsigned int start = 0;
+//    unsigned int end = 9;
+//
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateAsymmetricFilter(len, start, end, filter, filterType, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateAsymmetricFilter(len, start, end, filter, filterType, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    start = 4;
+//    end = 12;
+//
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateAsymmetricFilter(len, start, end, filter, filterType, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateAsymmetricFilter(len, start, end, filter, filterType, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    len = 12;
+//
+//    start = 0;
+//    end = 9;
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_GAUSSIAN;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TUKEY;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    start = 4;
+//    end = len-1;
+//
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_GAUSSIAN;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TUKEY;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    len = 13;
+//
+//    start = 0;
+//    end = 9;
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_GAUSSIAN;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TUKEY;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//
+//    start = 4;
+//    end = len-1;
+//
+//    filterType = ISMRMRD_FILTER_NONE;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_GAUSSIAN;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TUKEY;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+//    util.generateSymmetricFilterForRef(len, start, end, filter, filterType, sigma, width);
+//    filter.printContent(std::cout);
+//
+//    GADGET_MSG("------------------------------------------------");
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, FFT)
+//{
+//    {
+//        hoNDArray< std::complex<float> > A1D(7);
+//        for ( unsigned int ii=0; ii<7; ii++ )
+//        {
+//            A1D(ii) = ii;
+//        }
+//
+//        A1D.print(std::cout);
+//
+//        hoNDArray< std::complex<float> > A1Ds;
+//        hoNDFFT<float>::instance()->ifftshift1D(A1D, A1Ds);
+//        A1Ds.print(std::cout);
+//
+//        hoNDFFT<float>::instance()->fftshift1D(A1D, A1Ds);
+//        A1Ds.print(std::cout);
+//
+//        hoNDFFT<float>::instance()->ifftshift1D(A1Ds, A1D);
+//        A1D.print(std::cout);
+//
+//        hoNDArray< std::complex<float> > AR(A1D);
+//        hoNDFFT<float>::instance()->fft1(A1D, AR);
+//        AR.print(std::cout);
+//
+//        //0 = (7.937254,0.000000)
+//        //1 = (-1.322875,2.746980)
+//        //2 = (-1.322876,1.054958)
+//        //3 = (-1.322875,0.301938)
+//        //4 = (-1.322875,-0.301938)
+//        //5 = (-1.322876,-1.054958)
+//        //6 = (-1.322875,-2.746980)
+//
+//        hoNDFFT<float>::instance()->ifft1(A1D, AR);
+//        AR.print(std::cout);
+//
+//        //0 = (7.937254,0.000000)
+//        //1 = (-1.322875,-2.746980)
+//        //2 = (-1.322876,-1.054958)
+//        //3 = (-1.322875,-0.301938)
+//        //4 = (-1.322875,0.301938)
+//        //5 = (-1.322876,1.054958)
+//        //6 = (-1.322875,2.746980)
+//
+//        hoNDFFT<float>::instance()->fft1c(A1D, AR);
+//        AR.print(std::cout);
+//
+//        //0 = (0.000000,1.356896)
+//        //1 = (0.000000,-1.692022)
+//        //2 = (0.000000,3.048917)
+//        //3 = (7.937254,0.000000)
+//        //4 = (0.000000,-3.048917)
+//        //5 = (0.000000,1.692022)
+//        //6 = (0.000000,-1.356896)
+//
+//        hoNDFFT<float>::instance()->ifft1c(A1D, AR);
+//        AR.print(std::cout);
+//
+//        //0 = (0.000000,-1.356896)
+//        //1 = (0.000000,1.692022)
+//        //2 = (0.000000,-3.048917)
+//        //3 = (7.937254,0.000000)
+//        //4 = (0.000000,3.048917)
+//        //5 = (0.000000,-1.692022)
+//        //6 = (0.000000,1.356896)
+//    }
+//
+//    {
+//        int nx = 5, ny = 6, nz = 3;
+//
+//        ho3DArray< std::complex<float> > A(nx, ny, nz);
+//        A.fill(2.0);
+//        A(1, 4, 2) = std::complex<float>(12, -5.0);
+//        A.print(std::cout);
+//
+//        ho3DArray< std::complex<float> > AR(A);
+//        hoNDFFT<float>::instance()->fft2(AR);
+//        AR.print(std::cout);
+//
+//        //AR(:,:,1) =
+//        //    10.9545         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    AR(:,:,2) =
+//        //    10.9545         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    0         0         0         0         0
+//        //    AR(:,:,3) =
+//        //    12.7802 - 0.9129i  -0.3040 - 2.0185i  -2.0136 - 0.3346i  -0.9405 + 1.8117i   1.4324 + 1.4543i
+//        //    -0.1223 + 2.0376i   1.9001 + 0.7460i   1.2966 - 1.5765i  -1.0987 - 1.7203i  -1.9756 + 0.5133i
+//        //    -1.7034 - 1.1247i  -1.5960 + 1.2725i   0.7170 + 1.9112i   2.0392 - 0.0914i   0.5433 - 1.9676i
+//        //    1.8257 - 0.9129i  -0.3040 - 2.0185i  -2.0136 - 0.3346i  -0.9405 + 1.8117i   1.4324 + 1.4543i
+//        //    -0.1223 + 2.0376i   1.9001 + 0.7460i   1.2966 - 1.5765i  -1.0987 - 1.7203i  -1.9756 + 0.5133i
+//        //    -1.7034 - 1.1247i  -1.5960 + 1.2725i   0.7170 + 1.9112i   2.0392 - 0.0914i   0.5433 - 1.9676i
+//
+//        ho3DArray< std::complex<float> > AR_I(A);
+//        hoNDFFT<float>::instance()->ifft2(AR_I);
+//        AR_I.print(std::cout);
+//
+//        //AR_I(:,:,1) =
+//        //10.9545         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //AR_I(:,:,2) =
+//        //10.9545         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //AR_I(:,:,3) =
+//        //12.7802 - 0.9129i   1.4324 + 1.4543i  -0.9405 + 1.8117i  -2.0136 - 0.3346i  -0.3040 - 2.0185i
+//        //-1.7034 - 1.1247i   0.5433 - 1.9676i   2.0392 - 0.0914i   0.7170 + 1.9112i  -1.5960 + 1.2725i
+//        //-0.1223 + 2.0376i  -1.9756 + 0.5133i  -1.0987 - 1.7203i   1.2966 - 1.5765i   1.9001 + 0.7460i
+//        //1.8257 - 0.9129i   1.4324 + 1.4543i  -0.9405 + 1.8117i  -2.0136 - 0.3346i  -0.3040 - 2.0185i
+//        //-1.7034 - 1.1247i   0.5433 - 1.9676i   2.0392 - 0.0914i   0.7170 + 1.9112i  -1.5960 + 1.2725i
+//        //-0.1223 + 2.0376i  -1.9756 + 0.5133i  -1.0987 - 1.7203i   1.2966 - 1.5765i   1.9001 + 0.7460i
+//
+//        ho3DArray< std::complex<float> > ARc(A);
+//        hoNDFFT<float>::instance()->fft2c(ARc);
+//        ARc.print(std::cout);
+//
+//        //ARc(:,:,1) =
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0   10.9545         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //ARc(:,:,2) =
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0   10.9545         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //ARc(:,:,3) =
+//        //2.0136 + 0.3346i   0.3040 + 2.0185i  -1.8257 + 0.9129i  -1.4324 - 1.4543i   0.9405 - 1.8117i
+//        //1.2966 - 1.5765i   1.9001 + 0.7460i  -0.1223 + 2.0376i  -1.9756 + 0.5133i  -1.0987 - 1.7203i
+//        //-0.7170 - 1.9112i   1.5960 - 1.2725i   1.7034 + 1.1247i  -0.5433 + 1.9676i  -2.0392 + 0.0914i
+//        //-2.0136 - 0.3346i  -0.3040 - 2.0185i  12.7802 - 0.9129i   1.4324 + 1.4543i  -0.9405 + 1.8117i
+//        //-1.2966 + 1.5765i  -1.9001 - 0.7460i   0.1223 - 2.0376i   1.9756 - 0.5133i   1.0987 + 1.7203i
+//        //0.7170 + 1.9112i  -1.5960 + 1.2725i  -1.7034 - 1.1247i   0.5433 - 1.9676i   2.0392 - 0.0914i
+//
+//        ARc = A;
+//        hoNDFFT<float>::instance()->ifft2c(ARc);
+//        ARc.print(std::cout);
+//
+//        //ARc(:,:,1) =
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0   10.9545         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //ARc(:,:,2) =
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //0         0   10.9545         0         0
+//        //0         0         0         0         0
+//        //0         0         0         0         0
+//        //ARc(:,:,3) =
+//        //0.9405 - 1.8117i  -1.4324 - 1.4543i  -1.8257 + 0.9129i   0.3040 + 2.0185i   2.0136 + 0.3346i
+//        //2.0392 - 0.0914i   0.5433 - 1.9676i  -1.7034 - 1.1247i  -1.5960 + 1.2725i   0.7170 + 1.9112i
+//        //1.0987 + 1.7203i   1.9756 - 0.5133i   0.1223 - 2.0376i  -1.9001 - 0.7460i  -1.2966 + 1.5765i
+//        //-0.9405 + 1.8117i   1.4324 + 1.4543i  12.7802 - 0.9129i  -0.3040 - 2.0185i  -2.0136 - 0.3346i
+//        //-2.0392 + 0.0914i  -0.5433 + 1.9676i   1.7034 + 1.1247i   1.5960 - 1.2725i  -0.7170 - 1.9112i
+//        //-1.0987 - 1.7203i  -1.9756 + 0.5133i  -0.1223 + 2.0376i   1.9001 + 0.7460i   1.2966 - 1.5765i
+//    }
+//
+//    {
+//        int nx = 5, ny = 6, nz = 3;
+//
+//        ho3DArray< std::complex<float> > A(nx, ny, nz);
+//        A.fill(2.0);
+//        A(1, 4, 2) = std::complex<float>(12, -5.0);
+//        A.print(std::cout);
+//
+//        ho3DArray< std::complex<float> > AR(A);
+//        hoNDFFT<float>::instance()->fft3(AR);
+//        AR.print(std::cout);
+//
+//        hoNDFFT<float>::instance()->ifft3(AR);
+//        AR.print(std::cout);
+//
+//        ho3DArray< std::complex<float> > AR_I(A);
+//        hoNDFFT<float>::instance()->ifft3(AR_I);
+//        AR_I.print(std::cout);
+//
+//        ho3DArray< std::complex<float> > ARc(A);
+//        hoNDFFT<float>::instance()->fft3c(ARc);
+//        ARc.print(std::cout);
+//
+//        ARc = A;
+//        hoNDFFT<float>::instance()->ifft3c(ARc);
+//        ARc.print(std::cout);
+//    }
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, recon3D)
+//{
+//    typedef GT_Complex8 T;
+//
+//    gtPlusIOAnalyze gt_io;
+//
+//    float v;
+//
+//    std::string debugFolder;
+//
+//    // image data
+//    hoNDArray<GT_Complex8> data;
+//    gt_io.importArrayComplex(data, debugFolder + "data_dst__REAL", 
+//        debugFolder + "data_dst__IMAG");
+//    data.print(std::cout);
+//
+//    GadgetronTimer timer(false);
+//
+//    unsigned int RO = data.get_size(0);
+//    unsigned int E1 = data.get_size(1);
+//    unsigned int E2 = data.get_size(2);
+//    unsigned int CHA = data.get_size(3);
+//
+//    Gadgetron::norm2(data, v);
+//    GADGET_MSG("data = " << v);
+//
+//    hoNDArray<GT_Complex8> Im;
+//    hoNDFFT<float>::instance()->ifft3(data, Im);
+//
+//    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder, gt_io, Im, "Im");
+//
+//    hoNDArray<GT_Complex8> Im2;
+//    timer.start("ifft3c");
+//    hoNDFFT<float>::instance()->ifft3c(data, Im2);
+//    timer.stop();
+//    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder, gt_io, Im2, "Im2");
+//
+//    hoNDArray<GT_Complex8> Im3(RO, E1, 4, CHA);
+//    memcpy(Im3.begin(), Im2.begin(), Im3.get_number_of_bytes());
+//    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder, gt_io, Im3, "Im3");
+//
+//    hoNDArray<T> CoilMap;
+//    timer.start("coilMap3DNIH");
+//    gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(Im3, CoilMap, ISMRMRD_SOUHEIL, 7, 3, true);
+//    timer.stop();
+//    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder, gt_io, CoilMap, "CoilMap");
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, KLTransform)
+//{
+//    typedef GT_Complex8 T;
+//
+//    gtPlusIOAnalyze gt_io;
+//
+//    float v;
+//
+//    // image data
+//    hoNDArray<float> real_data;
+//    std::string filename = this->gtPluse_ut_data_folder_ + "fullkspace_REAL";
+//    gt_io.importArray(real_data, filename);
+//    real_data.print(std::cout);
+//
+//    hoNDArray<float> imag_data;
+//    filename = this->gtPluse_ut_data_folder_ + "fullkspace_IMAG";
+//    gt_io.importArray(imag_data, filename);
+//    imag_data.print(std::cout);
+//
+//    boost::shared_ptr< hoNDArray<GT_Complex8> > tmp = real_imag_to_complex<GT_Complex8>(&real_data, &imag_data);
+//
+//    unsigned int RO = tmp->get_size(0);
+//    unsigned int E1 = tmp->get_size(1);
+//    unsigned int CHA = tmp->get_size(2);
+//    unsigned int PHS = tmp->get_size(3);
+//
+//    hoNDArray<GT_Complex8> kspace(RO, E1, CHA, PHS, tmp->begin());
+//
+//    gtPlusISMRMRDReconUtil<GT_Complex8> util;
+//    gtPlusISMRMRDReconUtilComplex<GT_Complex8> utilCplx;
+//
+//    hoNDArray<GT_Complex8> complexIm;
+//    Gadgetron::hoNDFFT<float>::instance()->ifft2c(kspace, complexIm);
+//
+//    hoNDArray<GT_Complex8> complexImSoS;
+//    utilCplx.sumOfSquare(complexIm, complexImSoS);
+//
+//    gt_io.export3DArrayComplex(complexImSoS, this->gtPluse_ut_res_folder_+"complexImSoS");
+//
+//    unsigned int numOfModes = 10;
+//
+//    hoNDArray<GT_Complex8> complexImKLF;
+//    util.computeKLFilter(complexIm, numOfModes, complexImKLF);
+//
+//    utilCplx.sumOfSquare(complexImKLF, complexImSoS);
+//
+//    gt_io.export3DArrayComplex(complexImSoS, this->gtPluse_ut_res_folder_+"complexImKLFSoS");
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, reconRemoveROOversampling)
+//{
+//    typedef float T;
+//    typedef std::complex<T> TValueType;
+//
+//    gtPlusIOAnalyze gt_io;
+//    GadgetronTimer timer(false);
+//
+//    gtPlusISMRMRDReconUtil<TValueType> util;
+//    gtPlusISMRMRDReconUtilComplex<TValueType> utilCplx;
+//    std::string filename;
+//
+//    hoNDArray<GT_Complex8> data;
+//    gt_io.importArrayComplex(data, this->gtPluse_ut_data_folder_ + "kspace_DownSampleFE_real", this->gtPluse_ut_data_folder_ + "kspace_DownSampleFE_imag");
+//    // real_imag_to_complex<GT_Complex8>(real_data, imag_data, data);
+//    data.print(std::cout);
+//
+//    // export images
+//    hoNDArray<GT_Complex8> complexIm;
+//    Gadgetron::hoNDFFT<T>::instance()->ifft2c(data, complexIm);
+//
+//    hoNDArray<TValueType> sos;
+//    utilCplx.sumOfSquare(complexIm, sos);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "kspace_DownSampleFE_SoS";
+//    gt_io.exportArrayComplex(sos, filename);
+//
+//    // cut down RO oversampling
+//    hoNDArray<TValueType> dataCut;
+//    Gadgetron::hoNDFFT<T>::instance()->ifft1c(data);
+//    utilCplx.cutpad2D(data, data.get_size(0)/2, data.get_size(1), dataCut);
+//    Gadgetron::hoNDFFT<T>::instance()->fft1c(dataCut);
+//
+//    Gadgetron::hoNDFFT<T>::instance()->ifft2c(dataCut, complexIm);
+//    utilCplx.sumOfSquare(complexIm, sos);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "kspace_DownSampleFE_SoS_CutRO";
+//    gt_io.exportArrayComplex(sos, filename);
+//}
+//
+//TYPED_TEST(gtPlus_IO_Test, reconNoisePrewhitening)
+//{
+//    typedef float T;
+//    typedef std::complex<T> TValueType;
+//
+//    gtPlusIOAnalyze gt_io;
+//    GadgetronTimer timer(false);
+//
+//    hoNDArray<float> real_noise;
+//    std::string filename = this->gtPluse_ut_data_folder_ + "Noise_real";
+//    gt_io.importArray(real_noise, filename);
+//    real_noise.print(std::cout);
+//
+//    hoNDArray<float> imag_noise;
+//    filename = this->gtPluse_ut_data_folder_ + "Noise_imag";
+//    gt_io.importArray(imag_noise, filename);
+//    imag_noise.print(std::cout);
+//
+//    ho3DArray<GT_Complex8> noise;
+//    real_imag_to_complex<GT_Complex8>(real_noise, imag_noise, noise);
+//
+//    int COL = noise.get_size(0);
+//    int E1 = noise.get_size(1);
+//    int CHA = noise.get_size(2);
+//
+//    GADGET_MSG(noise(12, 0, 10));
+//
+//    // compute noise prewhitener
+//    double rxDwellTimeData = 2100;
+//    hoMatrix<TValueType> noisePrewhitener(CHA, CHA);
+//
+//    gtPlusISMRMRDReconUtilComplex<TValueType> utilCplx;
+//
+//    double noiseBandWidth = 130;
+//    double receiverBWRatio = 0.79;
+//    double ADCSamplingTimeinSecond = 2100/1e9;
+//
+//    hoMatrix<TValueType> prewhiteningMatrix;
+//
+//    GADGET_START_TIMING(timer, "computeNoisePrewhiteningMatrix");
+//    utilCplx.computeNoisePrewhiteningMatrix(noise, noiseBandWidth, receiverBWRatio, ADCSamplingTimeinSecond, prewhiteningMatrix);
+//    GADGET_STOP_TIMING(timer);
+//    // prewhiteningMatrix.print(std::cout);
+//
+//    EXPECT_NEAR(prewhiteningMatrix(0, 0).real(), 5.1331672e+004, 0.01);
+//    EXPECT_NEAR(prewhiteningMatrix(0, 0).imag(), 0.0, 0.01);
+//
+//    EXPECT_NEAR(prewhiteningMatrix(1, 0).real(), -5791.2319, 0.01);
+//    EXPECT_NEAR(prewhiteningMatrix(1, 0).imag(), -1603.6230, 0.01);
+//
+//    EXPECT_NEAR(prewhiteningMatrix(2, 1).real(), -9597.3955, 0.01);
+//    EXPECT_NEAR(prewhiteningMatrix(2, 1).imag(), 4500.7114, 0.01);
+//
+//    EXPECT_NEAR(prewhiteningMatrix(4, 3).real(), -7718.3286, 0.01);
+//    EXPECT_NEAR(prewhiteningMatrix(4, 3).imag(), -3565.7336, 0.01);
+//
+//    EXPECT_NEAR(prewhiteningMatrix(31, 31).real(), 60350.840, 0.01);
+//    EXPECT_NEAR(prewhiteningMatrix(31, 31).imag(), 0.0, 0.01);
+//
+//    /// load the data scan
+//    hoNDArray<float> real_data;
+//    filename = this->gtPluse_ut_data_folder_ + "noisePrewhitening_DataScan_real";
+//    gt_io.importArray(real_data, filename);
+//    real_data.print(std::cout);
+//
+//    hoNDArray<float> imag_data;
+//    filename = this->gtPluse_ut_data_folder_ + "noisePrewhitening_DataScan_imag";
+//    gt_io.importArray(imag_data, filename);
+//    imag_data.print(std::cout);
+//
+//    ho3DArray<GT_Complex8> data;
+//    real_imag_to_complex<GT_Complex8>(real_data, imag_data, data);
+//
+//    GADGET_MSG(data(42, 12, 10));
+//
+//    // apply the noise matrix
+//    GADGET_START_TIMING(timer, "performNoisePrewhitening");
+//    utilCplx.performNoisePrewhitening(data, prewhiteningMatrix);
+//    GADGET_STOP_TIMING(timer);
+//    GADGET_MSG(data(42, 12, 10));
+//    EXPECT_LE(std::abs(data(42, 12, 10)-TValueType(-0.068069, -0.185625)), 1e-6);
+//}
+//
+//
+//TYPED_TEST(gtPlus_IO_Test, IOTest)
+//{
+//    typedef GT_Complex8 T;
+//
+//    gtPlusIOAnalyze gt_io;
+//
+//    hoNDArray<float> real_Im;
+//    std::string filename = this->gtPluse_ut_data_folder_ + "KSpaceBinning_IncomingKSpace_real";
+//    gt_io.importArray(real_Im, filename);
+//    real_Im.print(std::cout);
+//
+//    hoNDArray<float> imag_Im;
+//    filename = this->gtPluse_ut_data_folder_ + "KSpaceBinning_IncomingKSpace_imag";
+//    gt_io.importArray(imag_Im, filename);
+//    imag_Im.print(std::cout);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_real2";
+//    gt_io.exportArray(real_Im, filename);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_imag2";
+//    gt_io.exportArray(imag_Im, filename);
+//
+//    boost::shared_ptr< hoNDArray<GT_Complex8> > tmp = real_imag_to_complex<GT_Complex8>(&real_Im, &imag_Im);
+//
+//    unsigned int RO = tmp->get_size(0);
+//    unsigned int E1 = tmp->get_size(1);
+//    unsigned int CHA = tmp->get_size(2);
+//    unsigned int PHS = tmp->get_size(3);
+//
+//    hoNDArray<GT_Complex8> kspace(RO, E1, CHA, PHS, tmp->begin());
+//
+//    float nrm2;
+//    Gadgetron::norm2(kspace, nrm2);
+//    GADGET_MSG("nrm2 = " << nrm2);
+//
+//    gtPlusISMRMRDReconUtil<GT_Complex8> util;
+//    gtPlusISMRMRDReconUtilComplex<GT_Complex8> utilCplx;
+//
+//    // sum of square
+//    hoNDArray<GT_Complex8> complexIm, sosIm;
+//
+//    GadgetronTimer timer(false);
+//    timer.start("ifft2c");
+//    hoNDFFT<float>::instance()->ifft2c(kspace, complexIm);
+//    timer.stop();
+//
+//    timer.start("sumOfSquare");
+//    utilCplx.sumOfSquare(complexIm, sosIm);
+//    timer.stop();
+//
+//    hoNDArray<float> magSoS;
+//    timer.start("absolute");
+//    Gadgetron::absolute(sosIm, magSoS);
+//    timer.stop();
+//
+//    filename = this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_SoS";
+//    gt_io.exportArray(magSoS, filename);
+//
+//    // coil map estimation
+//
+//    hoNDArray<GT_Complex8> meanKSpace;
+//    sumOverLastDimension(kspace, meanKSpace);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_mean";
+//    gt_io.export3DArrayComplex(meanKSpace, filename);
+//
+//    Gadgetron::norm2(meanKSpace, nrm2);
+//    GADGET_MSG("nrm2 = " << nrm2);
+//
+//    hoNDArray<GT_Complex8> meanIm;
+//    hoNDFFT<float>::instance()->ifft2c(meanKSpace, meanIm);
+//    Gadgetron::norm2(meanIm, nrm2);
+//    GADGET_MSG("nrm2 = " << nrm2);
+//
+//    filename = this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_meanIm";
+//    gt_io.export3DArrayComplex(meanIm, filename);
+//
+//    hoNDArray<GT_Complex8> coilMap;
+//    timer.start("coilMap2DNIH");
+//    utilCplx.coilMap2DNIH(meanIm, coilMap, ISMRMRD_SOUHEIL, 7, 3);
+//    timer.stop();
+//
+//    filename = this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_meanIm_coilMap";
+//    gt_io.export3DArrayComplex(coilMap, filename);
+//
+//    hoNDArray<GT_Complex8> combined;
+//    timer.start("coilCombine");
+//    utilCplx.coilCombine(meanIm, coilMap, combined);
+//    timer.stop();
+//
+//    gt_io.export3DArrayComplex(combined, this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_meanIm_coilMap_combined");
+//
+//    // KLT
+//    hoMatrix<T> coeff, eigenValues;
+//    timer.start("computeKLTCoeff");
+//    util.computeKLTCoeff(meanKSpace, coeff, eigenValues);
+//    timer.stop();
+//    eigenValues.print(std::cout);
+//
+//    double thres = 0.001;
+//    timer.start("computeKLCoilCompressionCoeff, thres");
+//    util.computeKLCoilCompressionCoeff(meanKSpace, thres, coeff, eigenValues);
+//    timer.stop();
+//    eigenValues.print(std::cout);
+//
+//    hoNDArray<T> dataEigen;
+//    int numOfModeKept = 20;
+//    util.computeKLCoilCompression(meanKSpace, numOfModeKept, coeff, eigenValues, dataEigen);
+//    Gadgetron::norm2(dataEigen, nrm2);
+//    GADGET_MSG("nrm2 = " << nrm2);
+//
+//    hoNDFFT<float>::instance()->ifft2c(dataEigen, meanIm);
+//    gt_io.export3DArrayComplex(meanIm, this->gtPluse_ut_res_folder_ + "KSpaceBinning_IncomingKSpace_meanIm_dataEigen");
+//}
diff --git a/toolboxes/gtplus/util/gtPlusIOAnalyze.cpp b/toolboxes/gtplus/util/gtPlusIOAnalyze.cpp
new file mode 100644
index 0000000..43571a8
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOAnalyze.cpp
@@ -0,0 +1,252 @@
+/** \file       gtPlusIOAnalyze.cpp
+    \brief      Implement the support for the Analyze75 medical image format
+    \author     Hui Xue
+
+    Ref to:
+    http://eeg.sourceforge.net/ANALYZE75.pdf
+*/
+
+#include <gtPlusIOAnalyze.h>
+
+// to support the ISMRMRD format
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+
+namespace Gadgetron { namespace gtPlus {
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py) // two-value form: pixelSize_ ends up as {px, py}
+{
+    pixelSize_.resize(2); // exactly one slot per argument
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz) // three-value form: pixelSize_ ends up as {px, py, pz}
+{
+    pixelSize_.resize(3); // exactly one slot per argument
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt) // four-value form: pixelSize_ ends up as {px, py, pz, pt}
+{
+    pixelSize_.resize(4); // exactly one slot per argument
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+    pixelSize_[3] = pt;
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr) // five-value form: pixelSize_ ends up as {px, py, pz, pt, pr}
+{
+    pixelSize_.resize(5); // exactly one slot per argument
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+    pixelSize_[3] = pt;
+    pixelSize_[4] = pr;
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps) // six-value form: pixelSize_ ends up as {px, py, pz, pt, pr, ps}
+{
+    pixelSize_.resize(6); // exactly one slot per argument
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+    pixelSize_[3] = pt;
+    pixelSize_[4] = pr;
+    pixelSize_[5] = ps;
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp) // seven-value form: pixelSize_ ends up as {px, py, pz, pt, pr, ps, pp}
+{
+    pixelSize_.resize(7); // exactly one slot per argument
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+    pixelSize_[3] = pt;
+    pixelSize_[4] = pr;
+    pixelSize_[5] = ps;
+    pixelSize_[6] = pp;
+}
+
+gtPlusIOAnalyze::gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp, float pq) // eight-value form: pixelSize_ ends up as {px, py, pz, pt, pr, ps, pp, pq}
+{
+    pixelSize_.resize(8); // exactly one slot per argument; same body as setPixelSize
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+    pixelSize_[3] = pt;
+    pixelSize_[4] = pr;
+    pixelSize_[5] = ps;
+    pixelSize_[6] = pp;
+    pixelSize_[7] = pq;
+}
+
+void gtPlusIOAnalyze::setPixelSize(float px, float py, float pz, float pt, float pr, float ps, float pp, float pq) // replaces pixelSize_ with these eight values, in argument order
+{
+    pixelSize_.resize(8); // always eight entries afterwards, whatever length a constructor left behind
+    pixelSize_[0] = px;
+    pixelSize_[1] = py;
+    pixelSize_[2] = pz;
+    pixelSize_[3] = pt;
+    pixelSize_[4] = pr;
+    pixelSize_[5] = ps;
+    pixelSize_[6] = pp;
+    pixelSize_[7] = pq;
+}
+
+void gtPlusIOAnalyze::printInfo(std::ostream& os) // writes a fixed two-line banner to os; prints no object state
+{
+    using namespace std; // function-local, so the unqualified endl below resolves to std::endl
+
+    os << "-------------- GTPlus Array input/output to Analyze75 format -------------" << endl; // endl also flushes os
+    os << "--------------------------------------------------------------------------" << endl;
+}
+
+std::string gtPlusIOAnalyze::getRTTIFromAnalyzeDataType(AnalyzeDataType aDT) // maps an Analyze type code to typeid(T).name() of the C++ type chosen for it
+{
+    std::string rttiID; // typeid(...).name() text is implementation-defined, so it is only meaningful when compared against names from the same toolchain
+
+    switch (aDT)
+    {
+    case DT_UNSIGNED_CHAR :
+        rttiID = typeid(unsigned char).name();
+        break;
+
+    case DT_SIGNED_SHORT :
+        rttiID = typeid(short).name();
+        break;
+
+    case DT_UNSIGNED_SHORT :
+        rttiID = typeid(unsigned short).name();
+        break;
+
+    case DT_SIGNED_INT :
+        rttiID = typeid(int).name();
+        break;
+
+    case DT_UNSIGNED_INT :
+        rttiID = typeid(size_t).name(); // NOTE(review): size_t rather than unsigned int; these are different types wherever size_t is wider than unsigned int -- confirm this is intended
+        break;
+
+    case DT_FLOAT :
+        rttiID = typeid(float).name();
+        break;
+
+    case DT_DOUBLE :
+        rttiID = typeid(double).name();
+        break;
+
+    case DT_COMPLEX :
+        rttiID = typeid(GT_Complex8).name();
+        break;
+
+    case DT_DOUBLECOMPLEX :
+        rttiID = typeid(GT_Complex16).name();
+        break;
+
+    default: // every code without a case above ends up here
+        rttiID = "UNKOWN TYPE"; // sentinel text; the spelling is part of the returned value, so callers comparing against it must match it exactly
+    }
+
+    return rttiID;
+}
+
+AnalyzeDataType gtPlusIOAnalyze::getAnalyzeDataTypeFromRTTI(const std::string& name) // inverse lookup: typeid(T).name() text -> Analyze type code
+{
+    AnalyzeDataType analyzeDT = DT_ANA_UNKNOWN; // returned unchanged when name matches none of the comparisons below
+
+    if ( name == typeid(unsigned char).name() ) // the tests are independent ifs, not an else-if chain: all nine comparisons always run
+    {
+        analyzeDT = DT_UNSIGNED_CHAR;
+    }
+
+    if ( name == typeid(short).name() )
+    {
+        analyzeDT = DT_SIGNED_SHORT;
+    }
+
+    if ( name == typeid(unsigned short).name() )
+    {
+        analyzeDT = DT_UNSIGNED_SHORT;
+    }
+
+    if ( name == typeid(int).name() )
+    {
+        analyzeDT = DT_SIGNED_INT;
+    }
+
+    if ( name == typeid(size_t).name() ) // NOTE(review): there is no test for typeid(unsigned int); where size_t is a different type, an unsigned int name falls through to DT_ANA_UNKNOWN -- confirm this is intended
+    {
+        analyzeDT = DT_UNSIGNED_INT;
+    }
+
+    if ( name == typeid(float).name() )
+    {
+        analyzeDT = DT_FLOAT;
+    }
+
+    if ( name == typeid(double).name() )
+    {
+        analyzeDT = DT_DOUBLE;
+    }
+
+    if ( name == typeid(GT_Complex8).name() )
+    {
+        analyzeDT = DT_COMPLEX;
+    }
+
+    if ( name == typeid(GT_Complex16).name() )
+    {
+        analyzeDT = DT_DOUBLECOMPLEX;
+    }
+
+    return analyzeDT;
+}
+
+bool gtPlusIOAnalyze::readAnalyzeHeader(const std::string& filename, dsr& header) // fills header from the file "<filename>.hdr"; true on success, false otherwise
+{
+    try
+    {
+        std::string filenameData = filename; // despite the name, this holds the header file path
+        filenameData.append(".hdr"); // caller passes the base name without extension
+
+        gtPlusIOWorker ioworker(filenameData, true); // second argument is true here and false in writeAnalyzeHeader -- presumably a read-mode flag; confirm against gtPlusIOWorker
+
+        GADGET_CHECK_RETURN_FALSE(ioworker.open()); // NOTE(review): macro presumably returns false from this function when its argument is false -- confirm
+        GADGET_CHECK_RETURN_FALSE(ioworker.read(reinterpret_cast<char*>(&header), sizeof(dsr))); // raw byte copy of sizeof(dsr) bytes into the struct; no byte-order handling or field validation happens here
+        GADGET_CHECK_RETURN_FALSE(ioworker.close()); // NOTE(review): if the macro returns early, a failed read never reaches this explicit close()
+    }
+    catch(...) // any exception from the worker is logged and reported as a plain failure
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::readAnalyzeHeader(const std::string& filename, dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+bool gtPlusIOAnalyze::writeAnalyzeHeader(const std::string& filename, const dsr& header) // writes header to the file "<filename>.hdr"; true on success, false otherwise
+{
+    try
+    {
+        std::string filenameData = filename; // despite the name, this holds the header file path
+        filenameData.append(".hdr"); // caller passes the base name without extension
+
+        gtPlusIOWorker ioworker(filenameData, false); // second argument is false here and true in readAnalyzeHeader -- presumably selects write mode; confirm against gtPlusIOWorker
+
+        GADGET_CHECK_RETURN_FALSE(ioworker.open()); // NOTE(review): macro presumably returns false from this function when its argument is false -- confirm
+        GADGET_CHECK_RETURN_FALSE(ioworker.write(reinterpret_cast<const char*>(&header), sizeof(dsr))); // raw byte dump of the struct: sizeof(dsr) bytes in in-memory layout, no byte-order conversion here
+        GADGET_CHECK_RETURN_FALSE(ioworker.close()); // NOTE(review): if the macro returns early, a failed write never reaches this explicit close()
+    }
+    catch(...) // any exception from the worker is logged and reported as a plain failure
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::writeAnalyzeHeader(const std::string& filename, const dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusIOAnalyze.h b/toolboxes/gtplus/util/gtPlusIOAnalyze.h
new file mode 100644
index 0000000..4cfe5f9
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOAnalyze.h
@@ -0,0 +1,652 @@
+/** \file       gtPlusIOAnalyze.h
+    \brief      Implement the support for the Analyze75 medical image format
+    \author     Hui Xue
+
+    The ISMRMRD dimensions are mapped to Analyze75 format.
+
+    Ref to:
+    http://eeg.sourceforge.net/ANALYZE75.pdf
+    http://ismrmrd.sourceforge.net/
+*/
+
+#pragma once
+
+#include "gtPlusIOBase.h"
+
+// the file input/output utility functions for the Analyze format
+
+// the following Analyze75 data structures are defined as in the online document eeg.sourceforge.net/ANALYZE75.pdf
+
+// Element type codes stored in image_dimension::datatype.
+// gtPlusIOAnalyze::getAnalyzeDataTypeFromRTTI maps, among others, size_t to
+// DT_UNSIGNED_INT, float to DT_FLOAT, double to DT_DOUBLE, GT_Complex8 to
+// DT_COMPLEX and GT_Complex16 to DT_DOUBLECOMPLEX.
+enum AnalyzeDataType
+{
+    DT_ANA_UNKNOWN=0,
+    DT_BINARY=1, 
+    DT_UNSIGNED_CHAR=2,
+    DT_SIGNED_SHORT=4,
+    DT_UNSIGNED_SHORT=5,
+    DT_SIGNED_INT=8,
+    DT_UNSIGNED_INT=9,
+    DT_FLOAT=16,
+    DT_COMPLEX=32,
+    DT_DOUBLE=64,
+    DT_DOUBLECOMPLEX=96, // this type is added to support complex double
+    DT_RGB=128,
+    DT_ALL=255
+};
+
+// the official definition of Analyze 7.5 file format
+// First section of the header (member 'hk' of dsr).
+// The structure is read and written as raw bytes, so the member order and
+// types must not change. The annotations describe what
+// gtPlusIOAnalyze::array2Analyze stores; fields without a note are zeroed.
+struct header_key
+{
+    int sizeof_hdr;             // written as 348
+    char data_type[10];
+    char db_name[18];
+    int extents;                // written as 16384
+    short int session_error;
+    char regular;               // written as 'r'
+    char hkey_un0;
+};
+
+// Second section of the header (member 'dime' of dsr): array geometry and
+// element type. The structure is read and written as raw bytes, so the
+// member order and types must not change. The annotations describe how
+// gtPlusIOAnalyze uses the fields; fields without a note are zeroed on write.
+struct image_dimension
+{
+    short int dim[8];           // dim[0]: number of dimensions; dim[1..7]: size of dimension 1..7
+    short int unused8;          // gtPlus stores the size of dimension 8 here
+    short int unused9;          // gtPlus stores the size of dimension 9 here
+    short int unused10;         // gtPlus stores the size of dimension 10 here
+    short int unused11;
+    short int unused12;
+    short int unused13;
+    short int unused14;
+    short int datatype;         // AnalyzeDataType code of the element type
+    short int bitpix;           // written as 8*sizeof(T)
+    short int dim_un0;
+    float pixdim[8];            // pixdim[0] is written as 0; pixdim[1..7]: pixel size of dimension 1..7
+    float vox_offset;
+    float funused1;
+    float funused2;
+    float funused3;
+    float cal_max;
+    float cal_min;
+    float compressed;
+    float verified;
+    int glmax,glmin;
+};
+
+// Third section of the header (member 'hist' of dsr).
+// gtPlusIOAnalyze::array2Analyze writes every field of this section as zero.
+// The structure is read and written as raw bytes, so the member order and
+// types must not change.
+struct data_history
+{
+    char descrip[80];
+    char aux_file[24];
+    char orient;
+    char originator[10];
+    char generated[10];
+    char scannum[10];
+    char patient_id[10];
+    char exp_date[10];
+    char exp_time[10];
+    char hist_un0[3];
+    int views;
+    int vols_added;
+    int start_field;
+    int field_skip;
+    int omax, omin;
+    int smax, smin;
+};
+
+// Analyze75 header has 348 bytes
+// The complete header: the three sections in file order. The header file is
+// read and written as sizeof(dsr) raw bytes of this structure
+// (see gtPlusIOAnalyze::readAnalyzeHeader / writeAnalyzeHeader).
+// NOTE(review): this relies on the compiler adding no padding so that
+// sizeof(dsr) is 348 - confirm for every supported platform.
+struct dsr
+{
+    struct header_key hk;
+    struct image_dimension dime;
+    struct data_history hist;
+};
+
+// to support the ISMRMRD format
+// [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+
+namespace Gadgetron { namespace gtPlus {
+
+// Import/export of hoNDArray objects as Analyze header/data file pairs
+// ("<name>.hdr" holding a dsr structure and "<name>.img" holding the raw
+// element data). The object carries the pixel sizes written into exported
+// headers; importing an array overwrites them with the values of the file.
+class EXPORTGTPLUS gtPlusIOAnalyze
+{
+public:
+
+    // default: ten pixel sizes, all 1.0
+    gtPlusIOAnalyze() { pixelSize_.resize(10, 1.0); }
+    // constructors taking the pixel size of the first two to eight dimensions
+    gtPlusIOAnalyze(float px, float py);
+    gtPlusIOAnalyze(float px, float py, float pz);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp);
+    gtPlusIOAnalyze(float px, float py, float pz, float pt, float pr, float ps, float pp, float pq);
+
+    // set the pixel sizes; dimensions beyond the second default to 1.0
+    void setPixelSize(float px, float py, float pz=1.0f, float pt=1.0f, float pr=1.0f, float ps=1.0f, float pp=1.0f, float pq=1.0f);
+
+    virtual ~gtPlusIOAnalyze() {}
+
+public:
+
+    void printInfo(std::ostream& os);
+
+    // export/input for 2D/3D/4D array
+    // filename should be given without .hdr extension
+    // the .hdr and .img extension will be added internally
+    // all functions return true on success and false on failure
+
+    template <typename T> bool exportArray(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool importArray(hoNDArray<T>& a, const std::string& filename);
+
+    // a complex array is exported as four real arrays with the suffixes
+    // _REAL, _IMAG, _MAG and _PHASE; the import reads _REAL and _IMAG
+    template <typename T> bool exportArrayComplex(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool importArrayComplex(hoNDArray<T>& a, const std::string& filename);
+
+    // import a complex array from two explicitly named real arrays
+    template <typename T> bool importArrayComplex(hoNDArray<T>& a, const std::string& filename_real, const std::string& filename_imag);
+
+    // 2D array is exported as a 2D image
+    template <typename T> bool export2DArray(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool import2DArray(hoNDArray<T>& a, const std::string& filename);
+
+    template <typename T> bool export2DArrayComplex(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool import2DArrayComplex(hoNDArray<T>& a, const std::string& filename);
+
+    // 3D array is exported as a 3D volume
+    template <typename T> bool export3DArray(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool import3DArray(hoNDArray<T>& a, const std::string& filename);
+
+    template <typename T> bool export3DArrayComplex(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool import3DArrayComplex(hoNDArray<T>& a, const std::string& filename);
+
+    // 4D array is exported as multiple 3D volume
+    // (one data set per index of the fourth dimension, named "<filename>_<index>")
+    template <typename T> bool export4DArray(const hoNDArray<T>& a, const std::string& filename);
+    template <typename T> bool export4DArrayComplex(const hoNDArray<T>& a, const std::string& filename);
+
+protected:
+
+    // pixel size per dimension, written to / read from image_dimension::pixdim
+    std::vector<float> pixelSize_;
+
+    // conversion between an array description and the Analyze header
+    template <typename T> bool array2Analyze(const hoNDArray<T>& a, dsr& header);
+    template <typename T> bool analyze2Array(hoNDArray<T>& a, const dsr& header);
+
+    // get the run-time type ID from analyze data type or vice versa
+    std::string getRTTIFromAnalyzeDataType(AnalyzeDataType aDT);
+    AnalyzeDataType getAnalyzeDataTypeFromRTTI(const std::string& name);
+
+    // read/write the analyze header
+    bool readAnalyzeHeader(const std::string& filename, dsr& header);
+    bool writeAnalyzeHeader(const std::string& filename, const dsr& header);
+
+    // read/write the analyze data file
+    // len is the number of bytes
+    template <typename T> bool readAnalyzeData(const std::string& filename, T* data, long long len);
+    template <typename T> bool writeAnalyzeData(const std::string& filename, const T* data, long long len);
+};
+
+// Write array 'a' as an Analyze pair: the header goes to "<filename>.hdr"
+// and the raw element data to "<filename>.img".
+// filename : file name without extension
+// Returns true on success; each step is wrapped in GADGET_CHECK_RETURN_FALSE
+// and any exception is reported and turned into a false return.
+template <typename T> 
+bool gtPlusIOAnalyze::exportArray(const hoNDArray<T>& a, const std::string& filename)
+{
+    try
+    {
+        dsr header;
+        // describe the array (dimensions, element type, pixel sizes) in the header
+        GADGET_CHECK_RETURN_FALSE(array2Analyze(a, header));
+        GADGET_CHECK_RETURN_FALSE(writeAnalyzeHeader(filename, header));
+        // the data file is the array memory written as it is
+        GADGET_CHECK_RETURN_FALSE(writeAnalyzeData(filename, a.begin(), a.get_number_of_bytes()));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::exportArray(const hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Read the Analyze pair "<filename>.hdr" / "<filename>.img" into array 'a'.
+// filename : file name without extension
+// The header must describe elements of type T (checked in analyze2Array);
+// 'a' is re-created with the dimensions found in the header before the data
+// file is read into its memory.
+// Returns true on success; each step is wrapped in GADGET_CHECK_RETURN_FALSE
+// and any exception is reported and turned into a false return.
+template <typename T> 
+bool gtPlusIOAnalyze::importArray(hoNDArray<T>& a, const std::string& filename)
+{
+    try
+    {
+        dsr header;
+        GADGET_CHECK_RETURN_FALSE(readAnalyzeHeader(filename, header));
+        // allocates 'a' and updates pixelSize_ from the header
+        GADGET_CHECK_RETURN_FALSE(analyze2Array(a, header));
+        GADGET_CHECK_RETURN_FALSE(readAnalyzeData(filename, a.begin(), a.get_number_of_bytes()));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::importArray(const hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Export a complex array as four real-valued Analyze data sets:
+//   "<filename>_REAL"  : result of Gadgetron::complex_to_real
+//   "<filename>_IMAG"  : result of Gadgetron::complex_to_imag
+//   "<filename>_MAG"   : result of Gadgetron::absolute
+//   "<filename>_PHASE" : result of Gadgetron::argument
+// Returns true only if every conversion and every export succeeded.
+template <typename T> 
+bool gtPlusIOAnalyze::exportArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+{
+    try
+    {
+        typedef typename Gadgetron::realType<T>::Type value_type;
+
+        // scratch array, refilled for each of the four components
+        hoNDArray<value_type> buf;
+
+        const std::string filenameReal = filename + "_REAL";
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::complex_to_real(a, buf));
+        GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameReal));
+
+        const std::string filenameImag = filename + "_IMAG";
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::complex_to_imag(a, buf));
+        GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameImag));
+
+        const std::string filenameMag = filename + "_MAG";
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(a, buf));
+        GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenameMag));
+
+        const std::string filenamePhase = filename + "_PHASE";
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::argument(a, buf));
+        GADGET_CHECK_RETURN_FALSE(exportArray(buf, filenamePhase));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::exportArrayComplex(const hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Import a complex array from the two real-valued data sets
+// "<filename>_REAL" and "<filename>_IMAG" and combine them into 'a'.
+// T must provide a nested value_type naming its real component type.
+// Returns true only if both imports and the combination succeeded.
+template <typename T> 
+bool gtPlusIOAnalyze::importArrayComplex(hoNDArray<T>& a, const std::string& filename)
+{
+    try
+    {
+        typedef typename T::value_type value_type;
+        hoNDArray<value_type> real, imag;
+
+        const std::string filenameReal = filename + "_REAL";
+        const std::string filenameImag = filename + "_IMAG";
+
+        GADGET_CHECK_RETURN_FALSE(importArray(real, filenameReal));
+        GADGET_CHECK_RETURN_FALSE(importArray(imag, filenameImag));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::real_imag_to_complex(real, imag, a));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::importArrayComplex(const hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Import a complex array from two explicitly named real-valued data sets.
+// filename_real : data set holding the real part (name without extension)
+// filename_imag : data set holding the imaginary part (name without extension)
+// Returns true only if both imports and the combination into 'a' succeeded.
+template <typename T> 
+bool gtPlusIOAnalyze::importArrayComplex(hoNDArray<T>& a, const std::string& filename_real, const std::string& filename_imag)
+{
+    try
+    {
+        // real component type of T, as given by the realType trait
+        typedef typename realType<T>::Type value_type;
+        hoNDArray<value_type> real, imag;
+
+        GADGET_CHECK_RETURN_FALSE(importArray(real, filename_real));
+        GADGET_CHECK_RETURN_FALSE(importArray(imag, filename_imag));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::real_imag_to_complex(real, imag, a));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::importArrayComplex(hoNDArray<T>& a, const std::string& filename_real, const std::string& filename_imag) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// 2D convenience name: forwards unchanged to exportArray.
+template <typename T> 
+bool gtPlusIOAnalyze::export2DArray(const hoNDArray<T>& a, const std::string& filename)
+{
+    return this->exportArray(a, filename);
+}
+
+// 2D convenience name: forwards unchanged to importArray.
+template <typename T> 
+bool gtPlusIOAnalyze::import2DArray(hoNDArray<T>& a, const std::string& filename)
+{
+    return this->importArray(a, filename);
+}
+
+// 2D convenience name: forwards unchanged to exportArrayComplex.
+template <typename T> 
+bool gtPlusIOAnalyze::export2DArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+{
+    return this->exportArrayComplex(a, filename);
+}
+
+// 2D convenience name: forwards unchanged to the single-filename
+// importArrayComplex.
+template <typename T> 
+bool gtPlusIOAnalyze::import2DArrayComplex(hoNDArray<T>& a, const std::string& filename)
+{
+    return this->importArrayComplex(a, filename);
+}
+
+// 3D convenience name: forwards unchanged to exportArray.
+template <typename T> 
+bool gtPlusIOAnalyze::export3DArray(const hoNDArray<T>& a, const std::string& filename)
+{
+    return this->exportArray(a, filename);
+}
+
+// 3D convenience name: forwards unchanged to importArray.
+template <typename T> 
+bool gtPlusIOAnalyze::import3DArray(hoNDArray<T>& a, const std::string& filename)
+{
+    return this->importArray(a, filename);
+}
+
+// 3D convenience name: forwards unchanged to exportArrayComplex.
+template <typename T> 
+bool gtPlusIOAnalyze::export3DArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+{
+    return this->exportArrayComplex(a, filename);
+}
+
+// 3D convenience name: forwards unchanged to the single-filename
+// importArrayComplex.
+template <typename T> 
+bool gtPlusIOAnalyze::import3DArrayComplex(hoNDArray<T>& a, const std::string& filename)
+{
+    return this->importArrayComplex(a, filename);
+}
+
+// Export a 4D array as one 3D data set per index of the fourth dimension.
+// Volume ii covers the elements [ii*RO*E1*CHA, (ii+1)*RO*E1*CHA) of 'a' and
+// is written, without copying the data, to "<filename>_<ii>" (.hdr/.img).
+// Returns false as soon as one volume cannot be exported.
+template <typename T> 
+bool gtPlusIOAnalyze::export4DArray(const hoNDArray<T>& a, const std::string& filename)
+{
+    try
+    {
+        size_t RO     = a.get_size(0);
+        size_t E1     = a.get_size(1);
+        size_t CHA    = a.get_size(2);
+        size_t N      = a.get_size(3);
+
+        size_t ii;
+        for (ii=0; ii<N; ii++ )
+        {
+            // The dimension vector must live on the heap: a shared_ptr deletes
+            // what it owns, so it must never be given the address of a local.
+            boost::shared_ptr< std::vector<size_t> > sDim(new std::vector<size_t>(3));
+            (*sDim)[0] = RO;
+            (*sDim)[1] = E1;
+            (*sDim)[2] = CHA;
+
+            // non-owning view of the ii-th volume ('false': do not delete the data)
+            hoNDArray<T> a3D(sDim, const_cast<T*>(a.begin()+ii*RO*E1*CHA), false);
+
+            // No std::ends here: it would append a '\0' to the name, and the
+            // file extensions added later would then be cut off by c_str().
+            std::ostringstream ostr;
+            ostr << filename << "_" << ii;
+            GADGET_CHECK_RETURN_FALSE(export3DArray(a3D, ostr.str()));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::export4DArray(const hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Export a complex 4D array as one complex 3D data set per index of the
+// fourth dimension. Volume ii covers the elements
+// [ii*RO*E1*CHA, (ii+1)*RO*E1*CHA) of 'a' and is passed, without copying the
+// data, to export3DArrayComplex with the base name "<filename>_<ii>".
+// Returns false as soon as one volume cannot be exported.
+template <typename T> 
+bool gtPlusIOAnalyze::export4DArrayComplex(const hoNDArray<T>& a, const std::string& filename)
+{
+    try
+    {
+        size_t RO     = a.get_size(0);
+        size_t E1     = a.get_size(1);
+        size_t CHA    = a.get_size(2);
+        size_t N      = a.get_size(3);
+
+        size_t ii;
+        for (ii=0; ii<N; ii++ )
+        {
+            // The dimension vector must live on the heap: a shared_ptr deletes
+            // what it owns, so it must never be given the address of a local.
+            boost::shared_ptr< std::vector<size_t> > sDim(new std::vector<size_t>(3));
+            (*sDim)[0] = RO;
+            (*sDim)[1] = E1;
+            (*sDim)[2] = CHA;
+
+            // non-owning view of the ii-th volume ('false': do not delete the data)
+            hoNDArray<T> a3D(sDim, const_cast<T*>(a.begin()+ii*RO*E1*CHA), false);
+
+            // No std::ends here: it would append a '\0' to the name, and the
+            // suffixes and extensions added later would be cut off by c_str().
+            std::ostringstream ostr;
+            ostr << filename << "_" << ii;
+            GADGET_CHECK_RETURN_FALSE(export3DArrayComplex(a3D, ostr.str()));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::export4DArrayComplex(const hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Fill 'header' with the description of array 'a'.
+//   - dim[0] holds the number of dimensions, dim[1..7] the size of the first
+//     seven dimensions and unused8..unused10 the size of dimensions 8 to 10;
+//     dimensions the array does not have are stored as 1
+//   - datatype/bitpix describe the element type T
+//   - pixdim[1..7] receive the stored pixel sizes
+//   - every other field is zero
+// NOTE(review): sizes are stored as short; a dimension larger than 32767 is
+// truncated by the cast, and dimensions beyond the tenth are not stored.
+// Returns false only if an exception is caught.
+template <typename T> 
+bool gtPlusIOAnalyze::array2Analyze(const hoNDArray<T>& a, dsr& header)
+{
+    try
+    {
+        // set everything to zero; only the non-zero fields are assigned below
+        memset(&header, 0, sizeof(dsr));
+
+        // header_key
+        header.hk.sizeof_hdr = 348;
+        header.hk.extents = 16384;
+        header.hk.regular = 'r';
+
+        // image_dimension
+        const size_t NDim = a.get_number_of_dimensions();
+
+        // size of each of the ten dimensions that can be stored
+        short extent[10];
+        for ( size_t d=0; d<10; d++ )
+        {
+            // the first size is always queried from the array; a missing
+            // higher dimension is stored as 1
+            extent[d] = ( d==0 || d<NDim ) ? (short)(a.get_size(d)) : (short)1;
+        }
+
+        header.dime.dim[0] = (short)(NDim);
+        for ( size_t d=0; d<7; d++ )
+        {
+            header.dime.dim[d+1] = extent[d];
+        }
+
+        // dim[] has no room for dimensions 8 to 10; they use the unused fields
+        header.dime.unused8 = extent[7];
+        header.dime.unused9 = extent[8];
+        header.dime.unused10 = extent[9];
+
+        const std::string rttiID(typeid(T).name());
+        header.dime.datatype = (short)getAnalyzeDataTypeFromRTTI(rttiID);
+        header.dime.bitpix = (short)(8*sizeof(T));
+
+        // since the NDArray does not carry the pixel spacing, it is taken
+        // from pixelSize_ : pixdim[p+1] belongs to pixelSize_[p]. Every stored
+        // pixel size is written, including the last one.
+        header.dime.pixdim[0] = 0;
+        for ( size_t p=0; p<7 && p<pixelSize_.size(); p++ )
+        {
+            header.dime.pixdim[p+1] = pixelSize_[p];
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::array2Analyze(const hoNDArray<T>& a, dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Create array 'a' with the dimensions described by 'header' and take over
+// the pixel sizes of the header into pixelSize_.
+// The header comes from a file, so it is validated before use:
+//   - the stored data type must correspond to T
+//   - the number of dimensions must be between 0 and 10 (dim[1..7] plus the
+//     three extension fields unused8..unused10)
+//   - no dimension size may be negative
+// Returns false if the validation fails or an exception is caught.
+template <typename T> 
+bool gtPlusIOAnalyze::analyze2Array(hoNDArray<T>& a, const dsr& header)
+{
+    try
+    {
+        std::string rttiID = std::string(typeid(T).name());
+        GADGET_CHECK_RETURN_FALSE(rttiID==getRTTIFromAnalyzeDataType( (AnalyzeDataType)header.dime.datatype));
+
+        // a count outside [0, 10] cannot be mapped to header fields; it would
+        // read beyond dim[8] or request an absurd vector size
+        const short NDim = header.dime.dim[0];
+        if ( NDim < 0 || NDim > 10 )
+        {
+            GADGET_ERROR_MSG("gtPlusIOAnalyze::analyze2Array : unsupported number of dimensions in header : " << NDim);
+            return false;
+        }
+
+        std::vector<size_t> dim(static_cast<size_t>(NDim));
+        size_t ii;
+        for ( ii=0; ii<dim.size(); ii++ )
+        {
+            // dimensions 8 to 10 are kept in the unused fields, the others in dim[1..7]
+            short extent;
+            if ( ii == 7 )
+            {
+                extent = header.dime.unused8;
+            }
+            else if ( ii == 8 )
+            {
+                extent = header.dime.unused9;
+            }
+            else if ( ii == 9 ) 
+            {
+                extent = header.dime.unused10;
+            }
+            else
+            {
+                extent = header.dime.dim[ii+1];
+            }
+
+            // a negative size would turn into a huge unsigned value
+            if ( extent < 0 )
+            {
+                GADGET_ERROR_MSG("gtPlusIOAnalyze::analyze2Array : negative size in header for dimension " << ii << " : " << extent);
+                return false;
+            }
+
+            dim[ii] = static_cast<size_t>(extent);
+        }
+
+        // the header stores pixel sizes for the first seven dimensions only
+        pixelSize_.resize(dim.size());
+        for ( ii=0; ii<dim.size(); ii++ )
+        {
+            if ( ii < 7 )
+            {
+                pixelSize_[ii] = header.dime.pixdim[ii+1];
+            }
+        }
+
+        a.create(&dim);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::analyze2Array(hoNDArray<T>& a, const dsr& header) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Read 'len' bytes from "<filename>.img" into the memory starting at 'data'.
+// filename : file name without extension; ".img" is appended here
+// data     : destination buffer, provided by the caller
+// len      : number of bytes to read
+// Every I/O step is wrapped in GADGET_CHECK_RETURN_FALSE; false is also
+// returned when any exception is caught.
+template <typename T> 
+bool gtPlusIOAnalyze::readAnalyzeData(const std::string& filename, T* data, long long len)
+{
+    try
+    {
+        const std::string imageFile = filename + ".img";
+
+        // 'true' selects read mode of the worker
+        gtPlusIOWorker ioworker(imageFile, true);
+
+        GADGET_CHECK_RETURN_FALSE(ioworker.open());
+        GADGET_CHECK_RETURN_FALSE(ioworker.read(reinterpret_cast<char*>(data), len));
+        GADGET_CHECK_RETURN_FALSE(ioworker.close());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::readAnalyzeData(const std::string& filename, T* data, long long len) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Write 'len' bytes starting at 'data' to "<filename>.img".
+// filename : file name without extension; ".img" is appended here
+// data     : source buffer
+// len      : number of bytes to write
+// Every I/O step is wrapped in GADGET_CHECK_RETURN_FALSE; false is also
+// returned when any exception is caught.
+template <typename T> 
+bool gtPlusIOAnalyze::writeAnalyzeData(const std::string& filename, const T* data, long long len)
+{
+    try
+    {
+        const std::string imageFile = filename + ".img";
+
+        // 'false' selects write mode of the worker
+        gtPlusIOWorker ioworker(imageFile, false);
+
+        GADGET_CHECK_RETURN_FALSE(ioworker.open());
+        GADGET_CHECK_RETURN_FALSE(ioworker.write(reinterpret_cast<const char*>(data), len));
+        GADGET_CHECK_RETURN_FALSE(ioworker.close());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOAnalyze::writeAnalyzeData(const std::string& filename, const T* data, long long len) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusIOBase.cpp b/toolboxes/gtplus/util/gtPlusIOBase.cpp
new file mode 100644
index 0000000..b08c445
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOBase.cpp
@@ -0,0 +1,200 @@
+/** \file       gtPlusIOBase.cpp
+    \brief      Define the base IO functionality for GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#include <gtPlusIOBase.h>
+
+namespace Gadgetron { namespace gtPlus {
+
+// Remember the file name and the access mode; the file itself is opened
+// later by open().
+// ioTag    : name of the file to work on
+// readFlag : true for read mode, false for write mode
+gtPlusIOWorker::gtPlusIOWorker(const std::string& ioTag, bool readFlag)
+    : ioTag_(ioTag)
+    , readFlag_(readFlag)
+{
+}
+
+// Close the stream if it is still open; a failure is only reported.
+gtPlusIOWorker::~gtPlusIOWorker()
+{
+    const bool closed = close();
+    if ( !closed )
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOWorker::~gtPlusIOWorker() ... ");
+    }
+}
+
+// Open the file named at construction in binary mode, for reading or for
+// writing depending on the read flag. A stream that is already open is
+// closed first.
+// Returns false if the stream is not usable after opening or if an
+// exception is caught.
+bool gtPlusIOWorker::open()
+{
+    try
+    {
+        if ( fid_.is_open() )
+        {
+            fid_.close();
+        }
+
+        const std::ios::openmode direction = readFlag_ ? std::ios::in : std::ios::out;
+        fid_.open(ioTag_.c_str(), direction | std::ios::binary);
+
+        if ( !fid_ )
+        {
+            GADGET_ERROR_MSG("gtPlusIOWorker::open() cannot open file stream : " << ioTag_);
+            return false;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOWorker::open() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Close the stream if it is open; closing a stream that is not open is not
+// an error. The stream state after closing is not inspected.
+// Returns false only if an exception is caught.
+bool gtPlusIOWorker::close()
+{
+    try
+    {
+        if ( fid_.is_open() ) fid_.close();
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOWorker::close() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Current file offset: the get position in read mode, the put position in
+// write mode. Returns -1 when the stream is not open.
+// NOTE(review): the return type is long while seek() takes long long, so
+// large offsets may not be representable where long is 32 bits wide.
+long gtPlusIOWorker::tell()
+{
+    if ( !fid_.is_open() )
+    {
+        return -1;
+    }
+
+    return readFlag_ ? fid_.tellg() : fid_.tellp();
+}
+
+// Move the get position (read mode) or the put position (write mode) to
+// 'offset' bytes from the beginning of the file.
+// Returns false when the stream is not open or is in a failed state after
+// the move.
+bool gtPlusIOWorker::seek(long long offset)
+{
+    if ( !fid_.is_open() )
+    {
+        return false;
+    }
+
+    if ( readFlag_ )
+    {
+        fid_.seekg(offset, std::ios::beg);
+    }
+    else
+    {
+        fid_.seekp(offset, std::ios::beg);
+    }
+
+    // despite its name, IOinError() returns true when the stream is fine
+    return this->IOinError();
+}
+
+// Check the state of the stream.
+// Note the meaning of the result: true means the stream has neither failbit
+// nor badbit set (no error), false means one of them is set.
+bool gtPlusIOWorker::IOinError()
+{
+    // fail() reports exactly "failbit or badbit is set"
+    return !fid_.fail();
+}
+
+// Read 'len' bytes from the current position into 'data'.
+// Returns false when the stream is not open or is in a failed state after
+// the read.
+bool gtPlusIOWorker::read(char* data, long long len)
+{
+    if ( fid_.is_open() )
+    {
+        fid_.read(data, static_cast<std::streamsize>(len));
+        return IOinError();
+    }
+
+    return false;
+}
+
+// Write 'len' bytes starting at 'data' at the current position.
+// Returns false when the stream is not open or is in a failed state after
+// the write.
+bool gtPlusIOWorker::write(const char* data, long long len)
+{
+    if ( fid_.is_open() )
+    {
+        fid_.write(data, static_cast<std::streamsize>(len));
+        return IOinError();
+    }
+
+    return false;
+}
+
+// --------------------------------------------------------------------------
+
+// Print a three line description of this utility to 'os'; the stream is
+// flushed after every line.
+void gtPlusIOBase::printInfo(std::ostream& os)
+{
+    os << "-------------- GTPlus IO Util ---------------" << std::endl;
+    os << "Implementation of file input/output operations" << std::endl;
+    os << "---------------------------------------------" << std::endl;
+}
+
+// Read a buffer written by writeToFile.
+// File layout: a long long holding the total number of bytes (length field
+// included), followed by the payload.
+// filename : file to read
+// data     : in: NULL or a buffer allocated with new[] (it is released);
+//            out: newly allocated buffer holding the payload, to be released
+//            by the caller with delete [] ; NULL when the function fails
+//            before the payload was read completely
+// length   : out: number of payload bytes
+// Returns true on success, false on any failure.
+bool gtPlusIOBase::readFromFile(const std::string& filename, char*& data, long long& length)
+{
+    try
+    {
+        if (data!=NULL)
+        {
+            delete [] data;
+            // never leave the caller with a pointer to released memory
+            data = NULL;
+        }
+
+        gtPlusIOWorker ioworker_(filename, true);
+
+        GADGET_CHECK_RETURN_FALSE(ioworker_.open());
+
+        // read the total length
+        long long totalLen = 0;
+        GADGET_CHECK_RETURN_FALSE(ioworker_.read(reinterpret_cast<char*>(&totalLen), sizeof(long long)));
+
+        // the total length includes the length field itself
+        const long long lenFieldBytes = static_cast<long long>(sizeof(long long));
+        if ( totalLen < lenFieldBytes )
+        {
+            GADGET_ERROR_MSG("gtPlusIOBase::readFromFile : invalid total length " << totalLen << " in file : " << filename);
+            return false;
+        }
+
+        length = totalLen - lenFieldBytes;
+
+        // new[] throws on failure; this is handled by the catch block below
+        data = new char[length];
+
+        if ( !ioworker_.read(data, length) )
+        {
+            // do not hand out a partially filled buffer
+            delete [] data;
+            data = NULL;
+            length = 0;
+            GADGET_ERROR_MSG("gtPlusIOBase::readFromFile : cannot read the data from file : " << filename);
+            return false;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(ioworker_.close());
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOBase::readFromFile(const std::string& filename, char*& data, long long& length) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Write a buffer in the layout expected by readFromFile: a long long holding
+// the total number of bytes (length field included), followed by the payload.
+// filename : file to write
+// data     : payload, must not be NULL unless length is 0
+// length   : number of payload bytes; for 0 nothing is written, no file is
+//            created and true is returned
+// Returns true on success, false on any failure.
+bool gtPlusIOBase::writeToFile(const std::string& filename, char* data, long long length)
+{
+    // nothing to do for an empty buffer
+    if ( length == 0 )
+    {
+        return true;
+    }
+
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data!=NULL);
+
+        // 'false' selects write mode of the worker
+        gtPlusIOWorker ioworker_(filename, false);
+        GADGET_CHECK_RETURN_FALSE(ioworker_.open());
+
+        // length field first : payload size plus the size of the field itself
+        const long long totalLen = length+sizeof(long long);
+        GADGET_CHECK_RETURN_FALSE(ioworker_.write(reinterpret_cast<const char*>(&totalLen), sizeof(long long)));
+
+        // then the payload
+        GADGET_CHECK_RETURN_FALSE(ioworker_.write(data, length));
+
+        GADGET_CHECK_RETURN_FALSE(ioworker_.close());
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOBase::writeToFile(const std::string& filename, char* data, long long length) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusIOBase.h b/toolboxes/gtplus/util/gtPlusIOBase.h
new file mode 100644
index 0000000..59cc02f
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusIOBase.h
@@ -0,0 +1,129 @@
+/** \file       gtPlusIOBase.h
+    \brief      Define the base IO functionality for GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include <iostream>
+#include <typeinfo>
+
+#include "GtPlusExport.h"
+#include "NDArray.h"
+#include "complext.h"
+#include "vector_td.h"
+#include "GadgetronException.h"
+#include "GadgetronCommon.h"
+
+#include <mkl.h>
+
+#include "hoNDArray.h"
+#include "ho2DArray.h"
+#include "ho3DArray.h"
+#include "ho4DArray.h"
+#include "ho5DArray.h"
+#include "ho6DArray.h"
+#include "ho7DArray.h"
+
+#include "hoNDArray_fileio.h"
+#include "hoNDArray_elemwise.h"
+
+// the file input/output utility functions
+
+// Single precision complex type used by the gtPlus code. A macro with the
+// same name, if one is defined, is removed first so that it cannot rewrite
+// the typedef below.
+#ifdef GT_Complex8
+    #undef GT_Complex8
+#endif // GT_Complex8
+typedef std::complex<float> GT_Complex8;
+
+// Double precision complex type, handled in the same way.
+#ifdef GT_Complex16
+    #undef GT_Complex16
+#endif // GT_Complex16
+typedef std::complex<double> GT_Complex16;
+
+namespace Gadgetron { namespace gtPlus {
+
+// Thin wrapper around a binary std::fstream that is used either for reading
+// or for writing one file. The file name and the direction are fixed at
+// construction; the destructor closes the stream.
+class EXPORTGTPLUS gtPlusIOWorker
+{
+public:
+
+    // ioTag: name of the file; readFlag: true, read mode; false, write mode
+    gtPlusIOWorker(const std::string& ioTag, bool readFlag=true);
+    virtual ~gtPlusIOWorker();
+
+    // open the file stream
+    // readFlag: true, read mode; false, write mode
+    // returns false if the stream cannot be opened
+    virtual bool open();
+
+    // close the file stream
+    virtual bool close();
+
+    // the current file offset
+    // returns -1 if the stream is not open
+    long tell();
+
+    // set the file offset
+    // offset is counted in bytes from the beginning of the file
+    bool seek(long long offset);
+
+    // reset the file to the beginning
+    bool reset() { return (this->seek(0)); }
+
+    // check the status of i/o operations
+    // returns true if the stream is NOT in error (neither failbit nor badbit is set)
+    bool IOinError();
+
+    // read/write
+    // len: number of bytes in data
+    // both return false if the stream is not open or the operation failed
+    bool read(char* data, long long len);
+    bool write(const char* data, long long len);
+
+protected:
+
+    std::string ioTag_;     // name of the file
+    std::fstream fid_;      // the file stream
+    bool readFlag_;         // true, read mode; false, write mode
+};
+
+// Basic file input/output of raw byte buffers. The file written by
+// writeToFile starts with a long long holding the total number of bytes
+// (this length field included), followed by the buffer content.
+class EXPORTGTPLUS gtPlusIOBase
+{
+public:
+
+    gtPlusIOBase() {}
+    virtual ~gtPlusIOBase() {}
+
+public:
+
+    void printInfo(std::ostream& os);
+
+    // buffer read/write functions
+    // length: number of bytes
+    // readFromFile releases a non-NULL 'data' with delete [] and allocates a
+    // new buffer with new [], which the caller has to release
+    bool readFromFile(const std::string& filename, char*& data, long long& length);
+    bool writeToFile(const std::string& filename, char* data, long long length);
+
+    // general export/input for ND array
+    //template <typename T> bool exportNDArray(const hoNDArray<T>& a, const std::string& filename) const;
+    //template <typename T> bool importNDArray(hoNDArray<T>& a, std::string& filename) const;
+};
+
+/*template <typename T>
+bool gtPlusIOBase::exportNDArray(const hoNDArray<T>& a, const std::string& filename) const
+{
+    GADGET_CHECK_RETURN_FALSE( Gadgetron::write_nd_array(const_cast<hoNDArray<T>* >(&a), filename) == 0 );
+    return true;
+}
+
+template <typename T> 
+bool gtPlusIOBase::importNDArray(hoNDArray<T>& a, std::string& filename) const
+{
+    try
+    {
+        boost::shared_ptr< hoNDArray<T> > aRead;
+        aRead = Gadgetron::read_nd_array(filename);
+        a = *aRead;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusIOBase::importNDArray(hoNDArray<T>& a, const std::string& filename) ... ");
+        return false;
+    }
+
+    return true;
+}*/
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusMemoryManager.cpp b/toolboxes/gtplus/util/gtPlusMemoryManager.cpp
new file mode 100644
index 0000000..078bf5d
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusMemoryManager.cpp
@@ -0,0 +1,388 @@
+/** \file       gtPlusMemoryManager.cpp
+    \brief      Implement a simple memory manager for GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#include <gtPlusMemoryManager.h>
+#include <cstring>
+
+namespace Gadgetron { namespace gtPlus {
+
+gtPlusMemoryManager::gtPlusMemoryManager(size_t aligned_bytes, size_t preallocated_bytes) : aligned_bytes_(aligned_bytes) // aligned_bytes: allocation granularity; preallocated_bytes: size of the first chunk
+{
+    try
+    {
+        memory_.reserve(1024); // capacity for 1024 chunk records before the vector has to grow
+        memObjList_.reserve(1024); // scratch list used by defragmentImpl()
+        GADGET_CHECK_THROW(increase(preallocated_bytes)); // acquire the first chunk; increase() returns false on failure
+    }
+    catch(...) // failures are logged here and not rethrown
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusMemoryManager::gtPlusMemoryManager(aligned_bytes, preallocated_bytes) : " << preallocated_bytes/1024/1024 << " MegaBytes ... ");
+    }
+}
+
+/// Release every managed chunk and drop all bookkeeping; errors are logged, never propagated.
+gtPlusMemoryManager::~gtPlusMemoryManager()
+{
+    try
+    {
+        // pass each chunk held by the manager to _deallocate_memory
+        for ( std::vector<gtPlusMemoryChunkObj>::iterator chunk=memory_.begin(); chunk!=memory_.end(); ++chunk )
+        {
+            _deallocate_memory(chunk->mem_chunk_ptr_);
+            chunk->len_chunk_bytes_ = 0;
+        }
+
+        // the list entries refer to the chunks above, so discard them as well
+        free_list_.clear();
+        allocated_list_.clear();
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusMemoryManager::~gtPlusMemoryManager() ... ");
+    }
+}
+
+/// Thread-safe allocation of size bytes from the managed chunks.
+/// Returns the result of allocateImpl(): NULL when size is 0 or the request cannot be met.
+void* gtPlusMemoryManager::allocate(size_t size)
+{
+    // scoped_lock releases mutex_ on every exit path, including an exception
+    boost::mutex::scoped_lock lock(mutex_);
+    return allocateImpl(size);
+}
+
+// Return the first free-list entry holding at least bytes bytes, or freeList.end() if there is none.
+static gtPlusMemoryManager::MemoryListType::iterator findFirstFit(gtPlusMemoryManager::MemoryListType& freeList, size_t bytes)
+{
+    gtPlusMemoryManager::MemoryListType::iterator iter = freeList.begin();
+    for ( ; iter!=freeList.end(); ++iter )
+    {
+        if ( iter->second.len_bytes_ >= bytes )
+        {
+            break;
+        }
+    }
+    return iter;
+}
+
+/** Allocate size bytes from the managed chunks (no locking; callers hold mutex_).
+
+    The request is rounded up to a multiple of aligned_bytes_, then served first-fit
+    from the free list. If nothing fits, the free list is defragmented once and searched
+    again; if that also fails, a new chunk of the rounded size is obtained through
+    increase() and handed out as a whole.
+
+    \param size number of bytes requested
+    \return start address of the allocated memory, or NULL if size is 0, the rounded
+            size overflows, the managed memory cannot be increased, or an exception occurred
+*/
+void* gtPlusMemoryManager::allocateImpl(size_t size)
+{
+    try
+    {
+        if ( size == 0 )
+        {
+            return NULL;
+        }
+
+        // round up to the alignment granularity; an alignment of 0 or 1 means "no rounding"
+        size_t alignedSize = size;
+        if ( aligned_bytes_ > 1 && size%aligned_bytes_ != 0 )
+        {
+            alignedSize = size + (aligned_bytes_ - size%aligned_bytes_);
+            if ( alignedSize < size ) return NULL; // size_t wrapped around
+        }
+
+        // first fit; defragment once before giving up on the existing chunks
+        MemoryListType::iterator iter = findFirstFit(free_list_, alignedSize);
+        if ( iter == free_list_.end() )
+        {
+            this->defragmentImpl();
+            iter = findFirstFit(free_list_, alignedSize);
+        }
+
+        if ( iter == free_list_.end() )
+        {
+            // increase the managed buffer and hand out the new (last) chunk as a whole
+            if ( !increase(alignedSize) ) return NULL;
+            return this->allocateChunkAsUsed(memory_.size()-1);
+        }
+
+        // split the free block: the front part is allocated, the rest stays free
+        const gtPlusMemoryObj freeBlock = iter->second;
+
+        gtPlusMemoryObj allocateObj = freeBlock;
+        allocateObj.len_bytes_ = alignedSize;
+
+        free_list_.erase(iter);
+
+        // an exact fit leaves no remainder; inserting a zero-length entry would use the
+        // address of the following block as key and could overwrite that block's entry
+        if ( freeBlock.len_bytes_ > alignedSize )
+        {
+            gtPlusMemoryObj remainder = freeBlock;
+            remainder.mem_ptr_ = (void*)((char*)freeBlock.mem_ptr_+alignedSize);
+            remainder.len_bytes_ = freeBlock.len_bytes_-alignedSize;
+            free_list_[remainder.mem_ptr_] = remainder;
+        }
+
+        // insert the allocated mem into allocated list
+        allocated_list_[allocateObj.mem_ptr_] = allocateObj;
+
+        return allocateObj.mem_ptr_;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Error happened in gtPlusMemoryManager::allocate(size_t size) : " << size);
+        return NULL;
+    }
+}
+
+/// Thread-safe release of memory previously returned by allocate().
+/// Pointers that are not in the allocated list are ignored (see freeImpl).
+void gtPlusMemoryManager::free(void* raw_memory)
+{
+    // scoped_lock releases mutex_ on every exit path, including an exception
+    boost::mutex::scoped_lock lock(mutex_);
+    freeImpl(raw_memory);
+}
+
+/// Move the block starting at raw_memory from the allocated list to the free list.
+/// Unknown pointers are silently ignored; no locking is done here.
+void gtPlusMemoryManager::freeImpl(void* raw_memory)
+{
+    try
+    {
+        MemoryListType::iterator found = allocated_list_.find(raw_memory);
+        if ( found != allocated_list_.end() )
+        {
+            // the block keeps its address and length, it only changes lists
+            free_list_[found->first] = found->second;
+            allocated_list_.erase(found);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Error happened in gtPlusMemoryManager::freeImpl(raw_memory) ... ");
+    }
+}
+
+/// Thread-safe wrapper around defragmentImpl(): combine free blocks where possible.
+void gtPlusMemoryManager::defragment()
+{
+    // scoped_lock releases mutex_ on every exit path, including an exception
+    boost::mutex::scoped_lock lock(mutex_);
+    defragmentImpl();
+}
+
+/** Combine free blocks that touch each other inside the same chunk (no locking; callers hold mutex_).
+
+    free_list_ is a std::map keyed by block start address, so iterating it visits the
+    blocks in address order. One pass therefore suffices: a block is merged into its
+    predecessor when both belong to the same chunk and the predecessor ends at or after
+    the start of the block. This handles every chunk, including the last one.
+    Afterwards memObjList_ holds the merged blocks and free_list_ is rebuilt from it.
+    Errors are logged and not propagated.
+*/
+void gtPlusMemoryManager::defragmentImpl()
+{
+    try
+    {
+        GADGET_MSG("-----> gtPlusMemoryManager::defragmentImpl() ... ");
+
+        memObjList_.clear();
+        memObjList_.reserve(free_list_.size());
+
+        for ( MemoryListType::const_iterator iter=free_list_.begin(); iter!=free_list_.end(); ++iter )
+        {
+            const gtPlusMemoryObj& block = iter->second;
+
+            // zero-length entries describe no memory at all
+            if ( block.len_bytes_ == 0 ) continue;
+
+            if ( !memObjList_.empty() )
+            {
+                gtPlusMemoryObj& last = memObjList_.back();
+
+                // chunk ids are compared first, so addresses are only compared within one chunk
+                if ( last.chunk_id_ == block.chunk_id_ )
+                {
+                    char* lastEnd = (char*)last.mem_ptr_ + last.len_bytes_;
+                    if ( lastEnd >= (char*)block.mem_ptr_ )
+                    {
+                        // combine; never shrink the predecessor if it already covers the block
+                        char* blockEnd = (char*)block.mem_ptr_ + block.len_bytes_;
+                        if ( blockEnd > lastEnd )
+                        {
+                            last.len_bytes_ = (size_t)(blockEnd - (char*)last.mem_ptr_);
+                        }
+                        continue;
+                    }
+                }
+            }
+
+            memObjList_.push_back(block);
+        }
+
+        // rebuild the free list from the merged blocks
+        free_list_.clear();
+
+        size_t N = memObjList_.size();
+        for ( size_t ii=0; ii<N; ii++ )
+        {
+            free_list_[memObjList_[ii].mem_ptr_] = memObjList_[ii];
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Error happened in gtPlusMemoryManager::defragmentImpl() ... ");
+    }
+}
+
+/** Mark the whole chunk chunkId as allocated (no locking; callers hold mutex_).
+
+    The chunk must currently be one free block that starts at the chunk address and
+    covers the chunk; this is the state increase() leaves a new chunk in.
+
+    \param chunkId index into memory_
+    \return the chunk start address, or NULL if chunkId is out of range or the chunk is
+            not entirely free; in that case neither list is modified
+*/
+void* gtPlusMemoryManager::allocateChunkAsUsed(size_t chunkId)
+{
+    if ( chunkId >= memory_.size() ) return NULL;
+
+    const gtPlusMemoryChunkObj& mem = memory_[chunkId];
+
+    // validate against the free list before touching the allocated list,
+    // so a failure does not leave a stale "allocated" entry behind
+    MemoryListType::iterator it = free_list_.find(mem.mem_chunk_ptr_);
+    if ( it == free_list_.end() ) return NULL;
+    if ( it->second.len_bytes_ < mem.len_chunk_bytes_ ) return NULL;
+
+    // add the chunk to the allocated list
+    gtPlusMemoryObj obj;
+    obj.chunk_id_ = chunkId;
+    obj.mem_ptr_ = mem.mem_chunk_ptr_;
+    obj.len_bytes_ = mem.len_chunk_bytes_;
+
+    allocated_list_[obj.mem_ptr_] = obj;
+
+    // remove the chunk from the free list
+    free_list_.erase(it);
+
+    return obj.mem_ptr_;
+}
+
+/** Add one new chunk of added_bytes bytes to the managed memory.
+
+    The chunk is recorded in memory_ and entered into the free list as a single block.
+    \return true on success, false if the memory could not be obtained or an exception occurred
+*/
+bool gtPlusMemoryManager::increase(size_t added_bytes)
+{
+    try
+    {
+        GADGET_MSG("-----> gtPlusMemoryManager::increase() : " << added_bytes/1024/1024 << " MegaBytes ");
+
+        void* chunkStart;
+        _allocate_memory(added_bytes, chunkStart);
+        if ( chunkStart == NULL )
+        {
+            return false;
+        }
+
+        // record the new chunk
+        gtPlusMemoryChunkObj newChunk;
+        newChunk.mem_chunk_ptr_ = chunkStart;
+        newChunk.len_chunk_bytes_ = added_bytes;
+        memory_.push_back(newChunk);
+
+        // the whole chunk starts out as one free block
+        gtPlusMemoryObj wholeChunk;
+        wholeChunk.chunk_id_ = memory_.size()-1;
+        wholeChunk.mem_ptr_ = chunkStart;
+        wholeChunk.len_bytes_ = added_bytes;
+        free_list_[chunkStart] = wholeChunk;
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusMemoryManager::increase(size_t added_bytes) : " << added_bytes/1024/1024 << " MegaBytes ...");
+        return false;
+    }
+}
+
+void gtPlusMemoryManager::_allocate_memory( size_t size, void*& data ) // obtain size zero-initialised bytes; data receives the pointer (NULL if the allocation fails)
+{
+    #ifdef USE_MKL
+        data = mkl_calloc(size, 1, aligned_bytes_); // MKL build: size elements of 1 byte, aligned to aligned_bytes_
+    #else
+        data = calloc(size, 1); // default build: C calloc; aligned_bytes_ is not used here
+    #endif // USE_MKL
+
+    // data = reinterpret_cast<void*>(new char[size]);
+    //memset(data, 0, size);
+}
+
+/// Return memory obtained from _allocate_memory() to the allocator it came from.
+void gtPlusMemoryManager::_deallocate_memory( void* data )
+{
+    #ifdef USE_MKL
+        mkl_free(data);
+    #else
+        // must be qualified: an unqualified free() inside this class resolves to the member
+        // gtPlusMemoryManager::free(), which only edits the lists and releases nothing
+        ::free(data);
+    #endif // USE_MKL
+}
+
+/// Sum of the lengths, in bytes, of all blocks in the free list (mutex_ is not taken).
+size_t gtPlusMemoryManager::totalFreeMemory() const
+{
+    size_t totalBytes = 0;
+    for ( MemoryListType::const_iterator block=free_list_.begin(); block!=free_list_.end(); ++block )
+    {
+        totalBytes += block->second.len_bytes_;
+    }
+    return totalBytes;
+}
+
+/// Length, in bytes, of the largest block in the free list; 0 if the list is empty (mutex_ is not taken).
+size_t gtPlusMemoryManager::maxFreeMemoryChunkSize() const
+{
+    size_t largest = 0;
+    for ( MemoryListType::const_iterator block=free_list_.begin(); block!=free_list_.end(); ++block )
+    {
+        const size_t blockBytes = block->second.len_bytes_;
+        if ( blockBytes > largest )
+        {
+            largest = blockBytes;
+        }
+    }
+    return largest;
+}
+
+/// Write a report of the managed chunks, the allocated pieces and the free pieces to os.
+/// Sizes are printed in whole kiloBytes (integer division by 1024).
+void gtPlusMemoryManager::printInfo(std::ostream& os)
+{
+    os << "-------------- GTPlus ISMRMRD Recon Memory Manager -------------" << std::endl;
+    os << "Implementation of a simple memory manager for large chunk memory management" << std::endl;
+
+    // managed chunks
+    os << "Managed chunk : " << memory_.size() << std::endl;
+    for ( size_t chunkIdx=0; chunkIdx<memory_.size(); chunkIdx++ )
+    {
+        os << "--> Chunk " << chunkIdx << " - " << memory_[chunkIdx].len_chunk_bytes_/1024 << " kiloBytes <--" << std::endl;
+    }
+    os << "----------------------------------" << std::endl;
+
+    // pieces currently handed out
+    os << "Allocated memory pieces  : " << allocated_list_.size() << std::endl;
+    size_t pieceIdx = 0;
+    for ( MemoryListType::iterator piece=allocated_list_.begin(); piece!=allocated_list_.end(); ++piece, ++pieceIdx )
+    {
+        os << "--> Allocated " << pieceIdx << " - " << piece->second.len_bytes_/1024 << " kiloBytes <--" << std::endl;
+    }
+    os << "----------------------------------" << std::endl;
+
+    // pieces available for allocation
+    os << "Free memory pieces  : " << free_list_.size() << std::endl;
+    pieceIdx = 0;
+    for ( MemoryListType::iterator piece=free_list_.begin(); piece!=free_list_.end(); ++piece, ++pieceIdx )
+    {
+        os << "--> Free " << pieceIdx << " - " << piece->second.len_bytes_/1024 << " kiloBytes <--" << std::endl;
+    }
+    os << "----------------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/util/gtPlusMemoryManager.h b/toolboxes/gtplus/util/gtPlusMemoryManager.h
new file mode 100644
index 0000000..f2b09ac
--- /dev/null
+++ b/toolboxes/gtplus/util/gtPlusMemoryManager.h
@@ -0,0 +1,139 @@
+/** \file       gtPlusMemoryManager.h
+    \brief      Implement a simple memory manager for GtPlus toolbox
+    \author     Hui Xue
+*/
+
+#pragma once
+
+#include "GadgetronCommon.h"
+#include <iostream>
+#include <typeinfo>
+
+#include <boost/thread/mutex.hpp>
+#include "GtPlusExport.h"
+#include "GadgetronTimer.h"
+#include "GadgetronException.h"
+
+
+#include <map>
+#include <vector>
+#include <typeinfo>
+#include <string>
+#include <limits>
+
+#include <mkl.h>
+
+// the memory manager for large chunk allocation
+
+namespace Gadgetron { namespace gtPlus {
+
+struct gtPlusMemoryObj // one contiguous piece of a managed chunk, as stored in the free and allocated lists
+{
+    // index (into gtPlusMemoryManager::memory_) of the chunk holding this memory
+    size_t chunk_id_;
+
+    // starting address of this memory; also used as the key in the lists
+    void* mem_ptr_;
+
+    // size of this piece in bytes
+    size_t len_bytes_;
+};
+
+struct gtPlusMemoryChunkObj // one chunk obtained from the system by gtPlusMemoryManager::increase()
+{
+    // chunk starting address, as returned by the underlying allocator
+    void* mem_chunk_ptr_;
+
+    // total chunk size in bytes
+    size_t len_chunk_bytes_;
+};
+
+/// Ordering for gtPlusMemoryObj: by chunk id first, then by start address within a chunk.
+/// The comparison is strict (a is never ordered before itself), as std::sort requires.
+struct MemoryObjCompare
+{
+    MemoryObjCompare() {}
+    ~MemoryObjCompare() {}
+
+    /// \return true if a must be placed before b
+    bool operator()(const gtPlusMemoryObj& a, const gtPlusMemoryObj& b) const
+    {
+        if ( a.chunk_id_ == b.chunk_id_ )
+        {
+            return (a.mem_ptr_ < b.mem_ptr_);
+        }
+        else
+        {
+            return (a.chunk_id_ < b.chunk_id_);
+        }
+    }
+};
+
+class EXPORTGTPLUS gtPlusMemoryManager // hands out pieces of large preallocated chunks; allocate/free/defragment are guarded by mutex_
+{
+public:
+
+    typedef std::map<void*, gtPlusMemoryObj> MemoryListType; // start address -> memory piece
+
+    gtPlusMemoryManager(size_t aligned_bytes=4, size_t preallocated_bytes=4294967296); // preallocated_bytes: size of the first chunk (default 4294967296 = 4 GiB)
+    virtual ~gtPlusMemoryManager();
+
+    // allocate size bytes; returns NULL if size is 0 or the request cannot be satisfied
+    void* allocate(size_t size);
+
+    // give back memory returned by allocate(); pointers not in the allocated list are ignored
+    void free(void* raw_memory);
+
+    // increase the managed memory by one new chunk of added_bytes; returns false on failure
+    virtual bool increase(size_t added_bytes);
+
+    // defragment, combine the free memory obj if possible
+    // whenever a memory request cannot be fulfilled, this function is called once before allocating new memory
+    // user can call this function explicitly to improve the efficiency
+    void defragment();
+
+    // total amount of free memory in bytes (does not take mutex_)
+    size_t totalFreeMemory() const;
+
+    // size in bytes of the largest free piece (does not take mutex_)
+    size_t maxFreeMemoryChunkSize() const;
+
+    // print out the memory manager information
+    void printInfo(std::ostream& os);
+
+protected:
+
+    // pieces currently handed out, keyed by start address
+    MemoryListType allocated_list_;
+
+    // pieces available for allocation, keyed by start address
+    MemoryListType free_list_;
+
+    // memory chunks held by the manager; released in the destructor
+    std::vector<gtPlusMemoryChunkObj> memory_;
+
+    // scratch copy of the free pieces, used while defragmenting
+    std::vector<gtPlusMemoryObj> memObjList_;
+
+    // allocation granularity in bytes
+    size_t aligned_bytes_;
+
+    // make sure the allocate and free are thread-safe
+    boost::mutex mutex_;
+
+    // allocate memory; called by allocate() with mutex_ held
+    void* allocateImpl(size_t size);
+
+    // free memory; called by free() with mutex_ held
+    void freeImpl(void* raw_memory);
+
+    // defragment; called by defragment() with mutex_ held, and by allocateImpl()
+    void defragmentImpl();
+
+    // perform memory allocation and release with system calls
+    void _allocate_memory( size_t size, void*& data );
+    void _deallocate_memory( void* data );
+
+    // move the specified chunk as a whole from the free list to the allocated list
+    // return the mem_chunk_ptr_ for this chunk, or NULL if the chunk start is not in the free list
+    void* allocateChunkAsUsed(size_t chunkId);
+};
+
+}}
diff --git a/toolboxes/gtplus/util/hoNDArrayMemoryManaged.h b/toolboxes/gtplus/util/hoNDArrayMemoryManaged.h
new file mode 100644
index 0000000..45e6f2b
--- /dev/null
+++ b/toolboxes/gtplus/util/hoNDArrayMemoryManaged.h
@@ -0,0 +1,144 @@
+/** \file hoNDArrayMemoryManaged.h
+    \brief CPU-based N-dimensional array (data container) using the memory manager to allocate memory
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "gtPlusMemoryManager.h"
+
+namespace Gadgetron{
+
+  /** hoNDArray whose element storage is requested from a gtPlusMemoryManager.
+
+      When mem_manager_ is set, the _allocate_memory/_deallocate_memory overloads obtain and
+      release storage through it; when it is empty they fall back to the base class or to
+      new[]/malloc as written in each overload.
+      NOTE(review): storage must be released with the same mem_manager_ state it was allocated
+      with; setMemoryManager() does not check for existing data.
+  */
+  template <typename T> class hoNDArrayMemoryManaged : public Gadgetron::hoNDArray<T>
+  {
+  public:
+
+    typedef Gadgetron::hoNDArray<T> BaseClass;
+    typedef boost::shared_ptr<Gadgetron::gtPlus::gtPlusMemoryManager> MemManagerType;
+
+    hoNDArrayMemoryManaged();
+    hoNDArrayMemoryManaged(MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(std::vector<size_t> *dimensions, MemManagerType& mem_manager);
+
+    // construct with 1 to 8 explicit dimension sizes
+    explicit hoNDArrayMemoryManaged(size_t len, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, MemManagerType& mem_manager);
+    explicit hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, MemManagerType& mem_manager);
+
+    explicit hoNDArrayMemoryManaged(boost::shared_ptr< std::vector<size_t> > dimensions, MemManagerType& mem_manager);
+
+    virtual ~hoNDArrayMemoryManaged();
+
+    // Copy constructor
+    hoNDArrayMemoryManaged(const hoNDArrayMemoryManaged<T>& a);
+
+    // construct from hoNDArray
+    hoNDArrayMemoryManaged(const hoNDArray<T>& a, MemManagerType& mem_manager);
+
+    // Assignment operator
+    hoNDArrayMemoryManaged& operator=(const hoNDArrayMemoryManaged<T>& rhs);
+    hoNDArrayMemoryManaged& operator=(const hoNDArray<T>& rhs);
+
+    // set memory manager
+    void setMemoryManager(MemManagerType& mem_manager);
+
+    virtual void print(std::ostream& os) const;
+
+  protected:
+
+    using BaseClass::dimensions_;
+    using BaseClass::offsetFactors_;
+    using BaseClass::data_;
+    using BaseClass::elements_;
+    using BaseClass::delete_data_on_destruct_;
+
+    // manager that provides the element storage; may be empty
+    MemManagerType mem_manager_;
+
+    // Generic allocator / deallocator
+    //
+
+    // Allocate storage for size objects of type X and default-construct them.
+    // *data receives the storage; with a manager it is NULL if the manager returned NULL.
+    template<class X> void _allocate_memory( size_t size, X** data )
+    {
+        if ( mem_manager_ )
+        {
+            void* ptr = mem_manager_->allocate(sizeof(X)*size);
+            *data = (X*)ptr;
+
+            // nothing to construct if the manager could not provide the memory
+            if ( ptr == NULL ) return;
+
+            // construct each element in place; the result pointer is not stored anywhere,
+            // *data already addresses the first element
+            for ( size_t ii=0; ii<size; ii++ )
+            {
+                new((X*)ptr + ii) X();
+            }
+        }
+        else
+        {
+            *data = new X[size];
+        }
+    }
+
+    // Destroy the elements_ objects at data and release their storage,
+    // mirroring the two branches of the generic _allocate_memory.
+    template<class X> void _deallocate_memory( X* data )
+    {
+        if ( data == NULL ) return;
+
+        if ( mem_manager_ )
+        {
+            // placement-constructed objects need explicit destructor calls
+            for ( size_t ii=0; ii<this->elements_; ii++ )
+            {
+                data[ii].~X();
+            }
+
+            mem_manager_->free( (void*)data );
+        }
+        else
+        {
+            // storage came from new X[size]; delete[] runs the destructors itself
+            delete [] data;
+        }
+    }
+
+    // Overload these instances to avoid invoking the element class constructor/destructor
+    //
+
+    virtual void _allocate_memory( size_t size, float** data );
+    virtual void _deallocate_memory( float* data );
+
+    virtual void _allocate_memory( size_t size, double** data );
+    virtual void _deallocate_memory( double* data );
+
+    virtual void _allocate_memory( size_t size, std::complex<float>** data );
+    virtual void _deallocate_memory( std::complex<float>* data );
+
+    virtual void _allocate_memory( size_t size, std::complex<double>** data );
+    virtual void _deallocate_memory( std::complex<double>* data );
+
+    virtual void _allocate_memory( size_t size, float_complext** data );
+    virtual void _deallocate_memory( float_complext* data );
+
+    virtual void _allocate_memory( size_t size, double_complext** data );
+    virtual void _deallocate_memory( double_complext* data );
+
+    // vector_td storage is raw memory: no constructors or destructors are run
+    template<class TYPE, unsigned int D> void _allocate_memory( size_t size, vector_td<TYPE,D>** data )
+    {
+        if ( mem_manager_ )
+        {
+            void* ptr = mem_manager_->allocate( size*sizeof(vector_td<TYPE,D>) );
+            *data = (vector_td<TYPE,D>*)ptr;
+        }
+        else
+        {
+            *data = (vector_td<TYPE,D>*) malloc( size*sizeof(vector_td<TYPE,D>) );
+        }
+    }
+
+    template<class TYPE, unsigned int D>  void _deallocate_memory( vector_td<TYPE,D>* data )
+    {
+        if ( mem_manager_ )
+        {
+            mem_manager_->free( (void*)data );
+        }
+        else
+        {
+            free ( (void*)data );
+        }
+    }
+  };
+}
+
+#include "hoNDArrayMemoryManaged.hxx"
diff --git a/toolboxes/gtplus/util/hoNDArrayMemoryManaged.hxx b/toolboxes/gtplus/util/hoNDArrayMemoryManaged.hxx
new file mode 100644
index 0000000..03cdc86
--- /dev/null
+++ b/toolboxes/gtplus/util/hoNDArrayMemoryManaged.hxx
@@ -0,0 +1,377 @@
+
+namespace Gadgetron
+{
+
+/// Construct an empty array without a memory manager.
+/// The base class is initialised in the initializer list; a "BaseClass();" statement in
+/// the body would only create and destroy an unrelated temporary.
+template <typename T> 
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged()
+: BaseClass()
+{
+}
+
+/// Construct an empty array that will take its storage from mem_manager.
+/// The base class is initialised in the initializer list; a "BaseClass();" statement in
+/// the body would only create and destroy an unrelated temporary.
+template <typename T> 
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(MemManagerType& mem_manager)
+: BaseClass(), mem_manager_(mem_manager)
+{
+}
+
+template <typename T> // construct from a pointer to the dimension sizes, using mem_manager for storage
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(std::vector<size_t> *dimensions, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    this->create(dimensions); // create() is inherited; presumably sets the dimensions and allocates the data - confirm in hoNDArray
+}
+
+/// Construct a 1D array: calls create() with the dimension list {len}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t len, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    std::vector<size_t> dim(1, len);
+    this->create(dim);
+}
+
+/// Construct a 2D array: calls create() with the dimension list {sx, sy}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[2] = { sx, sy };
+    std::vector<size_t> dim(extents, extents+2);
+    this->create(dim);
+}
+
+/// Construct a 3D array: calls create() with the dimension list {sx, sy, sz}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[3] = { sx, sy, sz };
+    std::vector<size_t> dim(extents, extents+3);
+    this->create(dim);
+}
+
+/// Construct a 4D array: calls create() with the dimension list {sx, sy, sz, st}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[4] = { sx, sy, sz, st };
+    std::vector<size_t> dim(extents, extents+4);
+    this->create(dim);
+}
+
+/// Construct a 5D array: calls create() with the dimension list {sx, sy, sz, st, sp}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[5] = { sx, sy, sz, st, sp };
+    std::vector<size_t> dim(extents, extents+5);
+    this->create(dim);
+}
+
+/// Construct a 6D array: calls create() with the dimension list {sx, sy, sz, st, sp, sq}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[6] = { sx, sy, sz, st, sp, sq };
+    std::vector<size_t> dim(extents, extents+6);
+    this->create(dim);
+}
+
+/// Construct a 7D array: calls create() with the dimension list {sx, sy, sz, st, sp, sq, sr}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[7] = { sx, sy, sz, st, sp, sq, sr };
+    std::vector<size_t> dim(extents, extents+7);
+    this->create(dim);
+}
+
+/// Construct an 8D array: calls create() with the dimension list {sx, sy, sz, st, sp, sq, sr, ss}.
+template <typename T>
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(size_t sx, size_t sy, size_t sz, size_t st, size_t sp, size_t sq, size_t sr, size_t ss, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    const size_t extents[8] = { sx, sy, sz, st, sp, sq, sr, ss };
+    std::vector<size_t> dim(extents, extents+8);
+    this->create(dim);
+}
+
+template <typename T> // construct from a shared dimension vector, using mem_manager for storage
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(boost::shared_ptr< std::vector<size_t> > dimensions, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    this->create(dimensions.get()); // forwards the raw pointer to the inherited create(); a null shared_ptr passes NULL through
+}
+
+/// Release the element storage if this array owns it (delete_data_on_destruct_ set),
+/// then clear data_; this is done whether or not a memory manager is set.
+template <typename T>
+hoNDArrayMemoryManaged<T>::~hoNDArrayMemoryManaged()
+{
+    if ( !this->delete_data_on_destruct_ )
+    {
+        return;
+    }
+
+    this->deallocate_memory();
+    this->data_ = NULL;
+}
+
+template <typename T> // copy constructor: shares a's memory manager and copies a's elements into newly allocated storage
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(const hoNDArrayMemoryManaged<T>& a)
+{
+    this->mem_manager_ = a.mem_manager_;
+    this->data_ = 0;
+    this->dimensions_ = a.dimensions_; // NOTE(review): dimensions_ is used with -> in print(); if it is a shared pointer, both arrays share one dimension vector - confirm
+    this->offsetFactors_ = a.offsetFactors_;
+    this->allocate_memory(); // inherited; presumably derives elements_ from dimensions_ before allocating - confirm in hoNDArray
+    memcpy( this->data_, a.data_, this->elements_*sizeof(T) ); // byte-wise copy of the elements
+}
+
+template <typename T> // construct a managed copy of the plain hoNDArray a, with storage from mem_manager
+hoNDArrayMemoryManaged<T>::hoNDArrayMemoryManaged(const hoNDArray<T>& a, MemManagerType& mem_manager)
+: mem_manager_(mem_manager)
+{
+    this->data_ = 0;
+    this->dimensions_ = a.get_dimensions(); // NOTE(review): whether this shares or copies a's dimension vector depends on get_dimensions() - confirm
+    this->offsetFactors_ = a.get_offset_factor();
+    this->allocate_memory(); // inherited; presumably derives elements_ from dimensions_ before allocating - confirm in hoNDArray
+    memcpy( this->data_, const_cast<T*>(a.begin()), this->elements_*sizeof(T) ); // byte-wise copy of the elements
+}
+
+/// Assign from another managed array by delegating to the base-class assignment.
+/// Self-assignment is a no-op; mem_manager_ of this array is left unchanged.
+template <typename T>
+hoNDArrayMemoryManaged<T>& hoNDArrayMemoryManaged<T>::operator=(const hoNDArrayMemoryManaged<T>& rhs)
+{
+    if ( this != &rhs )
+    {
+        const hoNDArray<T>& rhsAsBase = rhs;
+        BaseClass::operator=( rhsAsBase );
+    }
+    return *this;
+}
+
+/// Assign from a plain hoNDArray by delegating to the base-class assignment.
+/// Assigning an object to itself is a no-op.
+template <typename T>
+hoNDArrayMemoryManaged<T>& hoNDArrayMemoryManaged<T>::operator=(const hoNDArray<T>& rhs)
+{
+    if ( &rhs != this )
+    {
+        BaseClass::operator=( rhs );
+    }
+    return *this;
+}
+
+template <typename T> // replace the memory manager used by later allocations and deallocations
+void hoNDArrayMemoryManaged<T>::setMemoryManager(MemManagerType& mem_manager)
+{
+    mem_manager_ = mem_manager; // existing data_ is not touched; NOTE(review): data allocated under the previous manager would then be released through the new one - confirm callers set this before allocating
+}
+
+/// Write the number of dimensions, the dimension sizes, the element type name,
+/// the element size and the total size in bytes to os. The stream is switched
+/// from scientific to fixed notation and left that way.
+template <typename T>
+void hoNDArrayMemoryManaged<T>::print(std::ostream& os) const
+{
+    os.unsetf(std::ios::scientific);
+    os.setf(std::ios::fixed);
+
+    os << "-------------- Gagdgetron ND Array controlled by GtPlus memory manager -------------" << std::endl;
+    os << "Array dimension is : " << dimensions_->size() << std::endl;
+
+    os << "Array size is : ";
+    for ( size_t d=0; d<dimensions_->size(); d++ )
+    {
+        os << (*dimensions_)[d] << " ";
+    }
+    os << std::endl;
+
+    const int bytesPerElement = sizeof(T);
+    const std::string typeName(typeid(T).name());
+
+    os << "Array data type is : " << typeName << std::endl;
+    os << "Byte number for each element is : " << bytesPerElement << std::endl;
+    os << "Number of array size in bytes is : " << elements_*bytesPerElement << std::endl;
+
+    os << std::endl;
+}
+
+/// Allocate raw storage for size floats: from the memory manager if one is set,
+/// otherwise through the base class. No constructors are run on the managed path.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_allocate_memory( size_t size, float** data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_allocate_memory(size, data);
+        return;
+    }
+
+    *data = static_cast<float*>( mem_manager_->allocate(size*sizeof(float)) );
+}
+
+/// Release float storage: to the memory manager if one is set, otherwise through the base class.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_deallocate_memory( float* data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_deallocate_memory(data);
+        return;
+    }
+
+    mem_manager_->free( static_cast<void*>(data) );
+}
+
+/// Allocate raw storage for size doubles: from the memory manager if one is set,
+/// otherwise through the base class. No constructors are run on the managed path.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_allocate_memory( size_t size, double** data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_allocate_memory(size, data);
+        return;
+    }
+
+    *data = static_cast<double*>( mem_manager_->allocate(size*sizeof(double)) );
+}
+
+/// Release double storage: to the memory manager if one is set, otherwise through the base class.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_deallocate_memory( double* data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_deallocate_memory(data);
+        return;
+    }
+
+    mem_manager_->free( static_cast<void*>(data) );
+}
+
+/// Allocate raw storage for size std::complex<float> values: from the memory manager
+/// if one is set, otherwise through the base class. No constructors are run on the managed path.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_allocate_memory( size_t size, std::complex<float>** data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_allocate_memory(size, data);
+        return;
+    }
+
+    *data = static_cast<std::complex<float>*>( mem_manager_->allocate(size*sizeof(std::complex<float>)) );
+}
+
+/// Release std::complex<float> storage: to the memory manager if one is set,
+/// otherwise through the base class.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_deallocate_memory( std::complex<float>* data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_deallocate_memory(data);
+        return;
+    }
+
+    mem_manager_->free( static_cast<void*>(data) );
+}
+
+/// Allocate raw storage for size std::complex<double> values: from the memory manager
+/// if one is set, otherwise through the base class. No constructors are run on the managed path.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_allocate_memory( size_t size, std::complex<double>** data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_allocate_memory(size, data);
+        return;
+    }
+
+    *data = static_cast<std::complex<double>*>( mem_manager_->allocate(size*sizeof(std::complex<double>)) );
+}
+
+/// Release std::complex<double> storage: to the memory manager if one is set,
+/// otherwise through the base class.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_deallocate_memory( std::complex<double>* data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_deallocate_memory(data);
+        return;
+    }
+
+    mem_manager_->free( static_cast<void*>(data) );
+}
+
+/// Allocate raw storage for size float_complext values: from the memory manager
+/// if one is set, otherwise through the base class. No constructors are run on the managed path.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_allocate_memory( size_t size, float_complext** data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_allocate_memory(size, data);
+        return;
+    }
+
+    *data = static_cast<float_complext*>( mem_manager_->allocate(size*sizeof(float_complext)) );
+}
+
+/// Release float_complext storage: to the memory manager if one is set,
+/// otherwise through the base class.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_deallocate_memory( float_complext* data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_deallocate_memory(data);
+        return;
+    }
+
+    mem_manager_->free( static_cast<void*>(data) );
+}
+
+/// Allocate raw storage for size double_complext values: from the memory manager
+/// if one is set, otherwise through the base class. No constructors are run on the managed path.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_allocate_memory( size_t size, double_complext** data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_allocate_memory(size, data);
+        return;
+    }
+
+    *data = static_cast<double_complext*>( mem_manager_->allocate(size*sizeof(double_complext)) );
+}
+
+/// Release double_complext storage: to the memory manager if one is set,
+/// otherwise through the base class.
+template <typename T>
+inline void hoNDArrayMemoryManaged<T>::_deallocate_memory( double_complext* data )
+{
+    if ( !mem_manager_ )
+    {
+        BaseClass::_deallocate_memory(data);
+        return;
+    }
+
+    mem_manager_->free( static_cast<void*>(data) );
+}
+
+}
+
diff --git a/toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp
new file mode 100644
index 0000000..3958ecd
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.cpp
@@ -0,0 +1,157 @@
+/** \file   gtPlusCloudScheduler.cpp
+    \brief  Define and implement the GtPlus cloud job scheduler class
+            A simple scheduling strategy is implemented here. The number of job packages which are sent
+            to a node is proportional to the computing power index for that node.
+
+            This class may serve as the base class to implement more complicated job scheduling strategies.
+
+    \author Hui Xue
+*/
+
+#include "gtPlusCloudScheduler.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+// A new scheduler has no nodes and no jobs configured.
+gtPlusCloudScheduler::gtPlusCloudScheduler()
+    : num_of_nodes_(0), num_of_jobs_(0) {}
+
+// Empty body: nothing is released explicitly here.
+gtPlusCloudScheduler::~gtPlusCloudScheduler()
+{}
+
+// Write a four-line description of this scheduler to os.
+void gtPlusCloudScheduler::printInfo(std::ostream& os) const
+{
+    // std::endl ends and flushes every line, exactly as four separate statements would
+    os << "-------------- GTPlus Cloud scheduler for jobs ---------------" << std::endl
+       << "This class implements the simple scheduling scheme for GtPlus cloud " << std::endl
+       << "The scheduler here tries to allocate nodes to jobs propotional to the power indexes provided " << std::endl
+       << "--------------------------------------------------------------" << std::endl;
+}
+
+// Store the number of jobs that schedulerJobs() distributes over the nodes.
+// The value is taken as is.
+void gtPlusCloudScheduler::setNumOfJobs(size_t numOfJobs)
+{ this->num_of_jobs_ = numOfJobs; }
+
+void gtPlusCloudScheduler::setUpNodes(size_t numOfNodes)
+{
+    // Set up numOfNodes nodes with IDs 0..numOfNodes-1, all with the same computing power index 1.0.
+    num_of_nodes_ = numOfNodes;
+
+    // resize unconditionally: numOfNodes==0 must also drop the entries left by an earlier set up
+    node_id_computing_power_indexes_.resize(num_of_nodes_);
+    for ( size_t ii=0; ii<num_of_nodes_; ii++ )
+    {
+        node_id_computing_power_indexes_[ii].first = (int)ii;
+        node_id_computing_power_indexes_[ii].second = 1.0;
+    }
+}
+
+void gtPlusCloudScheduler::setUpNodes(const std::vector<double>& nodeComputingPowerIndexes)
+{
+    // one (node ID, computing power index) record per input value; the node ID is the position in the input
+    const size_t numOfNodes = nodeComputingPowerIndexes.size();
+    num_of_nodes_ = numOfNodes;
+    node_id_computing_power_indexes_.resize(numOfNodes);
+    for ( size_t n=0; n<numOfNodes; n++ )
+    {
+        node_id_computing_power_indexes_[n] = std::pair<int, double>((int)n, nodeComputingPowerIndexes[n]);
+    }
+}
+
+// Ordering predicate for std::sort: records with a larger computing power index come first.
+struct gtPlusCloudSchedulerNodeSorter
+{
+    gtPlusCloudSchedulerNodeSorter() {}
+    ~gtPlusCloudSchedulerNodeSorter() {}
+    bool operator()(const std::pair<int, double>& A, const std::pair<int, double>& B) const
+    {
+        return (B.second < A.second); // only the power index (.second) is compared, never the node ID
+    }
+};
+
+bool gtPlusCloudScheduler::schedulerJobs(std::vector<int>& nodeIDforJobs)
+{
+    // On return nodeIDforJobs[j] is the ID of the node assigned to job j (empty if there are no nodes or no jobs).
+    // Returns false only if an exception was caught or the final consistency check failed.
+    try
+    {
+        size_t ii;
+        nodeIDforJobs.clear();
+        if ( num_of_nodes_==0 || num_of_jobs_==0 )
+        {
+            GADGET_WARN_MSG("num_of_nodes_==0 || num_of_jobs_==0");
+            return true;
+        }
+
+        if ( node_id_computing_power_indexes_.size() < num_of_nodes_ )
+        {
+            GADGET_WARN_MSG("node_computing_power_indexes_.size() < num_of_nodes_ : computing power indexes for all nodes are set to be equal ... ");
+            node_id_computing_power_indexes_.resize(num_of_nodes_);
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                // reset ID and power together, so that the entries already present are made equal as well
+                node_id_computing_power_indexes_[ii] = std::pair<int, double>((int)ii, 1.0);
+            }
+        }
+
+        nodeIDforJobs.resize(num_of_jobs_, -1);
+        // always sort the nodes with higher computing power node ahead
+        std::sort(node_id_computing_power_indexes_.begin(), node_id_computing_power_indexes_.end(), gtPlusCloudSchedulerNodeSorter() );
+
+        if ( num_of_jobs_ <= num_of_nodes_ )
+        {
+            for ( ii=0; ii<num_of_jobs_; ii++ )
+            {
+                nodeIDforJobs[ii] = node_id_computing_power_indexes_[ii].first;
+            }
+        }
+        else
+        {
+            double totalComputingPower = 0.0;
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                totalComputingPower += node_id_computing_power_indexes_[ii].second;
+            }
+            size_t totalJobAllocated = 0;
+            std::vector<size_t> jobPerNode(num_of_nodes_, 0);
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                // a NaN, negative or oversized share (zero/negative power indexes) must not be cast to size_t
+                double share = std::floor(num_of_jobs_ * node_id_computing_power_indexes_[ii].second/totalComputingPower);
+                jobPerNode[ii] = (share>0) ? (size_t)std::min(share, (double)num_of_jobs_) : 0;
+                totalJobAllocated += jobPerNode[ii];
+            }
+
+            if ( totalJobAllocated < num_of_jobs_ )
+            {
+                // give high computing power nodes more jobs
+                for ( ii=0; ii<(num_of_jobs_-totalJobAllocated); ii++ )
+                {
+                    jobPerNode[ii%num_of_nodes_]++;
+                }
+            }
+
+            // jobID is bounded too, so an over-allocation can never write past the end of nodeIDforJobs
+            size_t jobID = 0;
+            for ( ii=0; ii<num_of_nodes_; ii++ )
+            {
+                for ( size_t jj=0; jj<jobPerNode[ii] && jobID<num_of_jobs_; jj++ )
+                {
+                    nodeIDforJobs[jobID++] = node_id_computing_power_indexes_[ii].first;
+                }
+            }
+            GADGET_CHECK_RETURN_FALSE(jobID==num_of_jobs_);
+        }
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusCloudScheduler::schedulerJobs(std::vector<int>& nodeIDforJobs) ... ");
+        return false;
+    }
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusCloudScheduler.h b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.h
new file mode 100644
index 0000000..68c6e7a
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusCloudScheduler.h
@@ -0,0 +1,54 @@
+/** \file   gtPlusCloudScheduler.h
+    \brief  Define and implement the GtPlus cloud job scheduler class
+            A simple scheduling strategy is implemented here. The number of job packages which are sent
+            to a node is proportional to the computing power index for that node.
+
+            This class may serve as the base class to implement more complicated job scheduling strategies.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusExport.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/**
+The scheduler class for gadgetron cloud.
+This class can serve as the base class for more complicated scheduling strategies.
+*/
+
+class EXPORTGTPLUS gtPlusCloudScheduler
+{
+public:
+
+    gtPlusCloudScheduler(); // starts with zero nodes and zero jobs
+    virtual ~gtPlusCloudScheduler();
+
+    virtual void printInfo(std::ostream& os) const; // writes a short description of the scheduler to os
+
+    // compute the scheduling for every job
+    // nodeIDforJobs is overwritten: on return entry j holds the ID of the node that runs job j
+    // node ID starts from 0; returns false if the scheduling failed
+    virtual bool schedulerJobs(std::vector<int>& nodeIDforJobs);
+
+    void setNumOfJobs(size_t numOfJobs); // number of jobs to be distributed by schedulerJobs
+
+    void setUpNodes(size_t numOfNodes); // numOfNodes nodes, each with computing power index 1.0
+    void setUpNodes(const std::vector<double>& nodeComputingPowerIndexes); // one node per entry; the entry is its power index
+
+protected:
+
+    // number of nodes
+    size_t num_of_nodes_;
+
+    // number of jobs, for this simple scheduler, all jobs are considered to have equal sizes
+    size_t num_of_jobs_;
+
+    // (node ID, computing power index) for every node; setUpNodes(size_t) gives all nodes the same index 1.0
+    std::vector<std::pair<int, double> > node_id_computing_power_indexes_;
+};
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.cpp b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.cpp
new file mode 100644
index 0000000..a82824a
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.cpp
@@ -0,0 +1,18 @@
+
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+// Explicit instantiations of the two utility class templates, marked with EXPORTGTPLUS:
+// gtPlusISMRMRDReconUtil for float, double, GT_Complex8 and GT_Complex16,
+// gtPlusISMRMRDReconUtilComplex for GT_Complex8 and GT_Complex16 only.
+
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil<float>;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil<double>;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil<GT_Complex8>;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtil<GT_Complex16>;
+
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtilComplex<GT_Complex8>;
+template EXPORTGTPLUS class gtPlusISMRMRDReconUtilComplex<GT_Complex16>;
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h
new file mode 100644
index 0000000..0cac782
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.h
@@ -0,0 +1,534 @@
+/** \file   gtPlusISMRMRDReconUtil.h
+    \brief  Define the symbols and implement common functionalities for GtPlus toolbox
+
+            The ISMRMRD format is fully supported in this toolbox.
+
+            Other functionalities implemented here include:
+            Karhunen-Loeve Transform (KLT) or Principal Component Analysis (PCA)
+            KSpace filter
+            Several MR sensitivity map estimation methods
+
+            Ref to :
+
+            http://en.wikipedia.org/wiki/Karhunen%E2%80%93Lo%C3%A8ve_theorem
+
+            ISMRMRD_SOUHEIL coil map estimation is based on:
+
+                Inati SJ, Hansen MS, Kellman P. 
+                A solution to the phase problem in adaptive coil combination. 
+                In: ISMRM proceeding; 20-26 April; Salt Lake City, Utah, USA; 2013. 2672.
+
+                Kellman P, McVeigh ER. 
+                Image reconstruction in SNR units: A general method for SNR measurement. 
+                Magnetic Resonance in Medicine 2005;54(6):1439-1447.
+
+            ISMRMRD_SOUHEIL_ITER coil map estimation is not implemented yet.
+
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "GtPlusExport.h"
+
+#include "ismrmrd.h"
+
+#include "boost/tuple/tuple.hpp"
+#include "boost/tuple/tuple_comparison.hpp"
+#include "boost/tuple/tuple_io.hpp"
+
+#include "ho2DArray.h"
+#include "ho3DArray.h"
+#include "ho4DArray.h"
+#include "ho5DArray.h"
+#include "ho6DArray.h"
+#include "ho7DArray.h"
+#include "hoMatrix.h"
+#include "hoNDFFT.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_blas.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_operators.h"
+#include "util/gtPlusIOAnalyze.h"
+#include "hoNDArrayMemoryManaged.h"
+#include "GadgetronTimer.h"
+
+#ifdef USE_OMP
+    #include <omp.h>
+#endif // USE_OMP
+
+#ifdef USE_CUDA
+    #include "GPUTimer.h"
+    #include "b1_map.h"
+    #include "cudaDeviceManager.h"
+    #include "cuNDArray_elemwise.h"
+#endif // USE_CUDA
+
+namespace Gadgetron { namespace gtPlus {
+
+// dimensions of ISMRMRD data; the enumerators take consecutive values starting at 32
+enum ISMRMRDDIM
+{
+    DIM_ReadOut = 32,
+    DIM_Encoding1,
+    DIM_Channel,
+    DIM_Slice,
+    DIM_Encoding2,
+    DIM_Contrast,
+    DIM_Phase,
+    DIM_Repetition,
+    DIM_Set,
+    DIM_Segment,
+    DIM_Average,
+    DIM_other1,
+    DIM_other2,
+    DIM_other3,
+    DIM_NONE // last enumerator; presumably the "no dimension" marker -- TODO confirm
+};
+
+// reconstruction algorithms; the enumerators take consecutive values starting at 64
+enum ISMRMRDALGO
+{
+    ISMRMRD_GRAPPA = 64,
+    ISMRMRD_SENSE,
+    ISMRMRD_SPIRIT,
+    ISMRMRD_L1SPIRIT,
+    ISMRMRD_SOFTSENSE,
+    ISMRMRD_L1SOFTSENSE,
+    ISMRMRD_NONE // last enumerator; presumably "no algorithm selected" -- TODO confirm
+};
+
+// coil sensitivity map estimation algorithms; values start at 96
+enum ISMRMRDCOILMAPALGO
+{
+    ISMRMRD_SOUHEIL = 96,
+    ISMRMRD_SOUHEIL_ITER // the file header notes that this estimation is not implemented yet
+};
+
+// partial fourier/asymmetric echo handling algorithms; values start at 128
+enum ISMRMRDPFALGO
+{
+    ISMRMRD_PF_HOMODYNE = 128,          // iterative homodyne
+    ISMRMRD_PF_POCS,                    // POCS
+    ISMRMRD_PF_FENGHUANG,               // convolution based method
+    ISMRMRD_PF_ZEROFILLING_FILTER,      // zero-filling with partial fourier filter
+    ISMRMRD_PF_ZEROFILLING              // zero-filling without partial fourier filter
+};
+
+// kspace filter types; the enumerators take consecutive values starting at 160
+enum ISMRMRDKSPACEFILTER
+{
+    ISMRMRD_FILTER_GAUSSIAN = 160,
+    ISMRMRD_FILTER_HANNING,
+    ISMRMRD_FILTER_TUKEY,
+    ISMRMRD_FILTER_TAPERED_HANNING,
+    ISMRMRD_FILTER_NONE // last enumerator; presumably "no filtering" -- TODO confirm
+};
+
+// calibration modes of ISMRMRD; the enumerators take consecutive values starting at 256
+enum ISMRMRDCALIBMODE
+{
+    ISMRMRD_embedded = 256,
+    ISMRMRD_interleaved,
+    ISMRMRD_separate,
+    ISMRMRD_external,
+    ISMRMRD_other,
+    ISMRMRD_noacceleration
+};
+
+template <typename T> 
+class gtPlusISMRMRDReconUtil
+{
+public:
+    // Utility routines for element type T; only the declarations are given in this class body.
+    gtPlusISMRMRDReconUtil();
+    virtual ~gtPlusISMRMRDReconUtil();
+
+    void printInfo(std::ostream& os);
+
+    typedef std::pair<ISMRMRDDIM, size_t> DimensionRecordType;
+
+    // ------------------------------------------------------------------------
+    // coil compression and Karhunen-Loeve Transform (KLT)
+    // ------------------------------------------------------------------------
+    // data: M rows and N cols matrix
+    // the KLT direction is along the N
+    // eigenVectors: N*N eigen vectors, every column is an eigen vector
+    // eigenValues: N*1 eigen values, descending order
+    bool KLT_eigenAnalysis(const hoMatrix<T>& data, hoMatrix<T>& eigenVectors, hoMatrix<T>& eigenValues);
+
+    // apply the eigen transform
+    // data: M*N data matrix
+    // eigenVectors: N*K eigen vector matrix, every column is an eigen vector
+    // dataEigen: M*K eigen data matrix
+    bool KLT_applyEigen(const hoMatrix<T>& data, hoMatrix<T>& dataEigen, const hoMatrix<T>& eigenVectors);
+    bool KLT_applyEigen(const hoNDArray<T>& data, hoNDArray<T>& dataEigen, const hoMatrix<T>& eigenVectors);
+
+    // number of kept eigen modes
+    // all modes with eigen values greater than thres*max(eigenValues) are kept
+    bool KLT_numberOfKeptModes(const hoMatrix<T>& eigenValues, double thres, long long& numOfModesKept);
+
+    // prune the eigen vector matrixes to keep the last numOfModesKept columns
+    bool pruneEigenVectorMatrix(const hoMatrix<T>& eigenVectors, long long numOfModesKept, hoMatrix<T>& eigenVectorsPruned);
+
+    // KLT based coil compression
+    // data: at least 3D [RO E1 CHA ...]
+    // the KL transform is applied along CHA
+    // coeff: CHA*numOfModesKept eigen vector matrix
+    // eigenValues: CHA*1 eigen values
+    // thres <0 or numOfModesKept==-1, keep all modes
+    // if isChaLastDim==true, the CHA is the last dimension
+    bool computeKLCoilCompressionCoeff(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim=false);
+    bool computeKLCoilCompressionCoeff(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim=false);
+    // coeff: CHA*CHA eigen vector matrix
+    bool computeKLTCoeff(const hoNDArray<T>& data, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim=false);
+
+    // dataEigen: [RO E1 numOfModesKept ...]
+    bool computeKLCoilCompression(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+    bool computeKLCoilCompression(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+
+    // apply coil compression coefficients (NOTE(review): this name is spelled "appy...", unlike the overload-like function below)
+    bool appyKLCoilCompressionCoeff(const hoNDArray<T>& data, const hoMatrix<T>& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+
+    // apply coil compression coefficients on array [RO E1 srcCHA ...]
+    // dataEigen: [RO E1 dstCHA ...]
+    // coeff: [srcCHA dstCHA] matrixes for every last dimension
+    // every last dimension has different compression coefficients
+    bool applyKLCoilCompressionCoeff(const hoNDArray<T>& data, const std::vector<hoMatrix<T> >& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim=false);
+
+    // compute KL transform and perform filtering
+    // the KL dimension is the last dimension
+    bool computeKLFilter(const hoNDArray<T>& data, size_t numOfModesKept, hoNDArray<T>& dataKLF);
+
+    // ------------------------------------------------------------------------
+    // zero-padding resize
+    // ------------------------------------------------------------------------
+    // compute the start and end index for zero padding
+    // dstSize >= srcSize
+    bool zpadRange(size_t srcSize, size_t dstSize, size_t& start, size_t& end);
+
+    // pad the first two dimensions around its center, other dimensions are kept unchanged
+    bool zeropad2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataPadded);
+
+    // pad first three dimensions array around its center, other dimensions are kept unchanged
+    bool zeropad3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataPadded);
+
+    // the dataPadded is not pre cleared to fill with zeros
+    bool zeropad3DNoPresetZeros(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataPadded);
+
+    // cut the center part
+    bool cutpad2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataCut);
+    bool cutpad3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataCut);
+
+    // ------------------------------------------------------------------------
+    // kspace filter
+    // ------------------------------------------------------------------------
+    bool compute2DFilterFromTwo1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, hoNDArray<T>& fxy);
+    bool compute2DFilterFromTwo1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, hoNDArray<GT_Complex8>& fxy);
+    bool compute2DFilterFromTwo1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, hoNDArray<GT_Complex16>& fxy);
+
+    bool compute3DFilterFromThree1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, const hoNDArray<T>& fz, hoNDArray<T>& fxyz);
+    bool compute3DFilterFromThree1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, const hoNDArray<float>& fz, hoNDArray<GT_Complex8>& fxyz);
+    bool compute3DFilterFromThree1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, const hoNDArray<double>& fz, hoNDArray<GT_Complex16>& fxyz);
+
+    // data: in kspace, [RO E1 E2 CHA SLC CON PHS REP SET]
+    bool kspacefilterRO(hoNDArray<T>& data, const hoNDArray<T>& fRO);
+    bool kspacefilterRO(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fROE1, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered);
+
+    // kspace filter for ISMRMRD dimension order
+    bool kspacefilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+
+    // kspace filter for the array whose first three dimensions are RO, E1 and E2
+    bool kspace3DfilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered);
+    bool kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+
+    // ------------------------------------------------------------------------
+    // generate kspace filters
+    // ------------------------------------------------------------------------
+    // symmetric filter, used for image filtering
+    // sigma: for Gaussian, in the unit of pixel
+    // width: for Tukey filter etc., the length of transition band
+    bool generateSymmetricFilter(size_t len, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width);
+
+    // asymmetric filter, used for partial fourier/asymmetric echo filtering
+    // start, end: the data range
+    // tapered hanning filter is implemented for this
+    // if filterType==ISMRMRD_FILTER_NONE and densityComp==true, the 0-1-2 filter will be generated
+    // if filterType==ISMRMRD_FILTER_TAPERED_HANNING and the densityComp is true, the density compensation version of tapered filter will be generated
+    // where unacquired region has filter values 0 and symmetric region 1 and nonsymmetric region 2
+    // if densityComp==false, the one side tapered filter will be generated
+    bool generateAsymmetricFilter(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, size_t width, bool densityComp=false);
+
+    // generate ref data filter
+    bool generateSymmetricFilterForRef(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width);
+
+    // find the symmetric sampled region
+    bool findSymmetricSampledRegion(size_t start, size_t end, size_t center, size_t& startSym, size_t& endSym);
+
+    // compute the filter SNR unit scale factor
+    bool computeFilterSNRUnitScaleFactor(const hoNDArray<T>& filter, T& scalFactor);
+
+    // ------------------------------------------------------------------------
+    // detect sampled region
+    // ------------------------------------------------------------------------
+    // data : [RO E1 SLC E2 CON PHS REP SET] array
+    bool detectSampledRegion2D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1);
+    bool detectSampledRegion3D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2);
+
+    // ------------------------------------------------------------------------
+    // coil sensitivity
+    // ------------------------------------------------------------------------
+    // average kspace along the 4th dimension
+    // data: [RO E1 CHA N S ...]
+    // ave: [RO E1 CHA 1 S ... ]
+    // simple average
+    bool averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave);
+    // the sampled times are considered for averaging for E1 dimension
+    // sampledTimes: [E1 1], recording the number of sampled times for each lines
+    bool averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave, std::vector<size_t>& sampledTimes);
+
+    // average kspace along the 5th dimension
+    // data: [RO E1 E2 CHA N ...]
+    // ave: [RO E1 E2 CHA 1 ... ]
+    // simple average
+    bool averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave);
+    // the sampled times are considered for averaging for E1 and E2 dimension
+    // sampledTimes: [E1 E2], recording the number of sampled times for each lines
+    bool averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave, hoNDArray<size_t>& sampledTimes);
+
+    // sampled region along E1
+    // data: [RO E1 CHA N]
+    bool detectSampledRegionE1(const hoNDArray<T>& data, size_t& startE1, size_t& endE1);
+
+    // sampled times along E1, if not sampled, sampledTimes[e1] == 0
+    bool detectSampledTimesE1(const hoNDArray<T>& data4D, std::vector<size_t>& sampledTimes);
+
+    // sampled region along E1 and E2
+    // data: [RO E1 E2 CHA N]
+    bool detectSampledRegionE1E2(const hoNDArray<T>& data, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2);
+
+    // sampled times along E1 and E2, if not sampled, sampledTimes(e1, e2) == 0
+    // data5D: [RO E1 E2 CHA N]
+    bool detectSampledTimesE1E2(const hoNDArray<T>& data5D, hoNDArray<size_t>& sampledTimes);
+
+    // copy along E1
+    bool copyAlongE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startE1, size_t endE1);
+
+    // copy along RO and E1
+    bool copyAlongROE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1);
+
+    // copy along RO, E1 and E2
+    bool copyAlongROE1E2(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1, size_t startE2, size_t endE2);
+
+    // copy along RO and E1, but a transition band is used to make sure the smoothing transition on the dst kspace
+    // the transition band is achieved via the tapered hanning filter
+    bool copyAlongROE1TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, 
+            size_t startE1, size_t endE1, size_t transBandRO, size_t transBandE1);
+
+    // copy along RO, E1 and E2, but a transition band is used to make sure the smoothing transition on the dst kspace
+    // the transition band is achieved via the tapered hanning filter
+    // src, dst: [RO E1 E2 ...]
+    bool copyAlongROE1E2TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, 
+                                    size_t startRO, size_t endRO, 
+                                    size_t startE1, size_t endE1, 
+                                    size_t startE2, size_t endE2, 
+                                    size_t transBandRO, size_t transBandE1, 
+                                    size_t transBandE2);
+
+    // ------------------------------------------------------------------------
+    // ISMRMRDDIM related functions
+    // ------------------------------------------------------------------------
+    // get the dimension name
+    std::string getISMRMRDDimName(const ISMRMRDDIM& dim);
+    ISMRMRDDIM getISMRMRDDimFromName(const std::string& name);
+
+    // get the dimension order index in the ISMRMRD format
+    // this function is for the kspace
+    //  [Ro E1 Cha Slice E2 Con Phase Rep Set Seg]
+    //   0  1  2   3     4  5   6     7   8   9
+    bool getISMRMRDDimIndex(const ISMRMRDDIM& dim, long long& ind);
+
+    // find the dimension indexes (NOTE(review): ind is taken by value, so no index can be returned through it -- confirm intent)
+    bool findDimIndex(const std::vector<DimensionRecordType>& dimStartingIndexes, ISMRMRDDIM dim, size_t ind);
+
+    // get recon algorithm from name
+    ISMRMRDALGO getISMRMRDReconAlgoFromName(const std::string& name);
+
+    // get coil map algorithm from name
+    ISMRMRDCOILMAPALGO getISMRMRDCoilMapAlgoFromName(const std::string& name);
+
+    // get the partial fourier/asymmetric echo handling algorithm from name
+    ISMRMRDPFALGO getISMRMRDPartialFourierReconAlgoFromName(const std::string& name);
+
+    // get the kspace filter algorithm from name
+    ISMRMRDKSPACEFILTER getISMRMRDKSpaceFilterFromName(const std::string& name);
+
+    // extract sub array for a dimension
+    // if lessEqual ==  true, [0:value] are extracted for dim
+    bool extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim, size_t value, bool lessEqual);
+    // if lessEqual ==  true, [0:value1 0:value2] are extracted for dim
+    bool extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2, bool lessEqual);
+
+    // extract sub array for two dimensions
+    // [0:value1] and [value2]
+    bool extractSubArrayForDim1LessEqualDim2Equal(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2);
+
+    // extract sub array limited by the max encoding counters
+    bool extractSubArrayForMaxEncodingCounters(const hoNDArray<T>& x, hoNDArray<T>& r, const ISMRMRD::EncodingCounters& maxIdx);
+
+    // ------------------------------------------------------------------------
+    // ISMRMRD acquisition header
+    // ------------------------------------------------------------------------
+    void clearAcquisitionHeaderISMRMRD(ISMRMRD::AcquisitionHeader& acqHeader);
+
+    // compute the image geometry for two acquisition header
+    bool hasIdenticalGeometryISMRMRD(const ISMRMRD::AcquisitionHeader& acqHeader1, const ISMRMRD::AcquisitionHeader& acqHeader2);
+
+    // add zeros pre/post data array
+    // 1 : pre zeros
+    // 2 : post zeros
+    // 0 : no zeros
+    long long addPrePostZeros(size_t centre_column, size_t samples);
+
+    // find RO ranges from centre_column and number of samples
+    void findStartEndRO(size_t centre_column, size_t samples, long long& startRO, long long& endRO);
+
+    // find RO ranges from centre_column and number of samples after zero-filling
+    void findStartEndROAfterZeroFilling(size_t centre_column, size_t samples_zerofilled, int& startRO, int& endRO);
+
+    // ------------------------------------------------------------------------
+    // utility functions for various things
+    // ------------------------------------------------------------------------
+    // jobSchedule : for every valid device, it records the job allocated to it
+    // what is stored are valid device id and job packages allocated to it
+    // for one valid device, multiple job packages can be given to it
+    #ifdef USE_CUDA
+        bool cudaJobSplitter(const std::vector<unsigned int>& jobIDs, size_t jobSize, size_t minimalMemoryForValidDevice, std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > >& jobSchedule);
+        bool cudaJobSplitter(unsigned int numOfJobs, size_t jobSize, size_t minimalMemoryForValidDevice, std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > >& jobSchedule);
+    #endif // USE_CUDA
+};
+
+// utility functions only meaningful for complex data type
+template <typename T> 
+class gtPlusISMRMRDReconUtilComplex : public gtPlusISMRMRDReconUtil<T>
+{
+public:
+
+    gtPlusISMRMRDReconUtilComplex();
+    virtual ~gtPlusISMRMRDReconUtilComplex();
+
+    void printInfo(std::ostream& os);
+
+    // ------------------------------------------------------------------------
+    // noise prewhitening
+    // ------------------------------------------------------------------------
+    // compute the noise prewhitening matrix
+    // noise: the noise scan [RO E1 CHA]
+    // noiseBandWidth: the noise bandwidth, Hz/pixel
+    // receiverBWRatio: system receiver noise equivalent bandwidth ratio
+    // ADCSamplingTimeinSecond: ADC sampling time in second
+    // prewhiteningMatrix: the computed noise prewhitening matrix [CHA CHA]
+    bool computeNoisePrewhiteningMatrix(const hoNDArray<T>& noise, double noiseBandWidth, double receiverBWRatio, double ADCSamplingTimeinSecond, hoMatrix<T>& prewhiteningMatrix);
+
+    // perform the noise prewhitening matrix on the image/ref data
+    // result = prewhiteningMatrix * data
+    // data should at least have three dimensions [R0 E1 CHA], up to 10D
+    bool performNoisePrewhitening(hoNDArray<T>& data, const hoMatrix<T>& prewhiteningMatrix);
+
+    // ------------------------------------------------------------------------
+    // zero-padding resize
+    // ------------------------------------------------------------------------
+    // zero padding resize for kspace and complex image
+    // data is first fft to kspace and then zero padding then ifft to image domain
+    // the scaling is handled to keep the noise variance
+    // data: the 1st and 2nd dimensions are resized
+    bool zpadResize2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized);
+    // data: [RO E1 SLC E2 CON PHS REP SET], 1, 2 and 4th dimensions are resized
+    bool zpadResize3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized);
+
+    // zero-padding resize with kspace as input
+    bool zpadResize2DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized);
+    bool zpadResize3DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized);
+
+    // zero padding resize with filter
+    // data is first fft to kspace, then zero padding, then filtered and ifft to image domain
+    // filter2D: 2D array for kspace filter
+    // data: the 1st and 2nd dimensions are resized
+    bool zpadResize2DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, const hoNDArray<T>& filter2D, hoNDArray<T>& dataResized);
+    // filter3D: 3D array for kspace filter
+    // data: [RO E1 SLC E2 CON PHS REP SET], 1, 2 and 4th dimensions are resized
+    bool zpadResize3DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, const hoNDArray<T>& filter3D, hoNDArray<T>& dataResized);
+
+    // ------------------------------------------------------------------------
+    // kspace filter in image domain
+    // ------------------------------------------------------------------------
+    // in image domain for ISMRMRD dimension order
+    bool kspacefilterROImage(hoNDArray<T>& data, const hoNDArray<T>& fRO);
+    bool kspacefilterROImage(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+    bool kspacefilterROE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered);
+
+    // ------------------------------------------------------------------------
+    // coil sensitivity
+    // ------------------------------------------------------------------------
+    // coil estimation using NIH method
+    // data: in image domain, at least 3D [RO E1 CHA], the coil map will be estimated for every 2D kspace
+    bool coilMap2DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks=11, size_t power=3, size_t iterNum=5, typename realType<T>::Type thres=1e-3, bool useGPU=true);
+    bool coilMap2DNIHGPU(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks=11, size_t power=3, size_t iterNum=5, typename realType<T>::Type thres=1e-3);
+
+    // data: in image domain, at least 4D [RO E1 E2 CHA], the coil map will be estimated for every 2D kspace [RO E1 CHA] across E2
+    bool coilMap3DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks=7, size_t power=3, size_t iterNum=5, typename realType<T>::Type thres=1e-3, bool true3D=false);
+    // a gpu version of coil map 3D estimation, this function should only be used for the full-res coil map estimation
+    // if gpu is not available, it calls coilMap3DNIH
+    bool coilMap3DNIHGPU_FullResMap(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks=7, size_t power=3, size_t iterNum=5, typename realType<T>::Type thres=1e-3, bool true3D=false);
+
+    // the Souheil method
+    // data: [RO E1 CHA], only 3D array
+    // these functions are using 2D data correlation matrix
+    bool coilMap2DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+    bool coilMap2DNIHInner_2(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+
+    // data: [RO E1 E2 CHA], this function uses the true 3D data correlation matrix
+    bool coilMap3DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power);
+
+    // sum of square coil combination
+    // data: in image domain, at least 3D [RO E1 CHA]
+    bool sumOfSquare(const hoNDArray<T>& data, hoNDArray<T>& sos);
+
+    // coil map weighted coil combination
+    // data: in image domain, at least 3D [RO E1 CHA ...]
+    // coilMap: [RO E1 CHA ... ]
+    bool coilCombine(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined);
+
+    // data: in image domain, at least 4D [RO E1 E2 CHA ...]
+    // coilMap: [RO E1 E2 CHA ... ]
+    bool coilCombine3D(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined);
+
+    // ------------------------------------------------------------------------
+    // kspace utility functions
+    // ------------------------------------------------------------------------
+    // get the conjugate symmetric kspace for 2D case
+    // kspace : [RO E1 ...]
+    // kspaceConj: [RO E1 ...]
+    bool conjugateSymmetry2D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj);
+
+    // kspace : [RO E1 E2 ...]
+    // kspaceConj: [RO E1 E2 ...]
+    bool conjugateSymmetry3D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj);
+};
+
+}}
+
+#include "gtPlusISMRMRDReconUtil.hxx"
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.hxx b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.hxx
new file mode 100644
index 0000000..f52f34c
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconUtil.hxx
@@ -0,0 +1,5645 @@
+
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> // Default constructor: the body is empty, nothing is initialised here.
+gtPlusISMRMRDReconUtil<T>::gtPlusISMRMRDReconUtil() {}
+
+template <typename T> // Destructor: the body is empty, nothing is released here.
+gtPlusISMRMRDReconUtil<T>::~gtPlusISMRMRDReconUtil() {}
+
+template <typename T> // Writes a three-line banner describing this utility class to 'os'.
+void gtPlusISMRMRDReconUtil<T>::printInfo(std::ostream& os)
+{
+    using namespace std; // function-local, only so that endl can be written unqualified below
+    os << "-------------- GTPlus ISMRMRD Recon Util -------------" << endl;
+    os << "Implementation of recon utilities for ISMRMRD format" << endl;
+    os << "------------------------------------------------------" << endl; // every endl also flushes the stream
+}
+
+// ------------------------------------------------------------------------
+// coil compression and Karhunen-Loeve transform (KLT)
+// ------------------------------------------------------------------------
+template <typename T> // Builds a mean-removed, symmetrised covariance-style matrix from 'data' and eigen-decomposes it.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if any checked helper call fails or an exception is caught.
+KLT_eigenAnalysis(const hoMatrix<T>& data, hoMatrix<T>& eigenVectors, hoMatrix<T>& eigenValues) // data: [M N]; eigenVectors is created as [N N], eigenValues as [N 1]
+{
+    try
+    {
+        typedef typename realType<T>::Type ValueType; // scalar type used for the scaling factors below
+
+        size_t M = data.rows(); // number of rows of data (used as the sample count in the scalings)
+        size_t N = data.cols(); // number of columns of data (size of the eigen problem)
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.createMatrix(N, N));
+        GADGET_CHECK_RETURN_FALSE(eigenValues.createMatrix(N, 1));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(eigenVectors, data, true, data, false)); // eigenVectors is used as scratch for data^H * data (first operand flagged; presumably conjugate-transposed for complex T -- confirm)
+
+        //eigenVectors.print(std::cout);
+
+        hoMatrix<T> mean(N, 1);
+        GADGET_CHECK_RETURN_FALSE(data.sumOverCol(mean)); // presumably the sum of each column of data -- confirm against hoMatrix::sumOverCol
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal((ValueType)1.0/M, mean)); // sum -> mean
+
+        //mean.print(std::cout);
+
+        hoMatrix<T> MMH(N, N);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(MMH, mean, false, mean, true)); // outer product mean * mean^H
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal((ValueType)M, MMH));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(eigenVectors, MMH, eigenVectors)); // presumably eigenVectors = eigenVectors - MMH, i.e. the mean contribution is removed
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal((ValueType)1.0/(M-1), eigenVectors)); // NOTE(review): M == 1 makes the divisor zero; no guard here
+
+        //MMH.print(std::cout);
+        //eigenVectors.print(std::cout);
+
+        hoMatrix<T> EH(eigenVectors);
+        GADGET_CHECK_RETURN_FALSE(conjugatetrans(eigenVectors, EH));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(eigenVectors, EH, eigenVectors));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(0.5, eigenVectors)); // with the two lines above: (C + C^H)/2, which symmetrises the matrix before the decomposition
+
+        //eigenVectors.print(std::cout);
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::EigenAnalysis_syev_heev2(eigenVectors, eigenValues)); // presumably in place: eigenVectors receives the vectors, eigenValues the values -- confirm ordering against the helper
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::KLT_eigenAnalysis(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Projects 'data' onto the given eigenvectors: dataEigen = data * eigenVectors.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false on a size mismatch, a failed helper call or a caught exception.
+KLT_applyEigen(const hoMatrix<T>& data, hoMatrix<T>& dataEigen, const hoMatrix<T>& eigenVectors) // data: [M N], eigenVectors: [N K], dataEigen: [M K]
+{
+    try
+    {
+        size_t M = data.rows();
+        size_t N = data.cols();
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.rows()==N); // inner dimensions of the product must agree
+
+        size_t K = eigenVectors.cols(); // number of eigenvectors projected onto
+
+        GADGET_CHECK_RETURN_FALSE(dataEigen.createMatrix(M, K)); // NOTE(review): appyKLCoilCompressionCoeff passes a matrix built on another array's buffer -- confirm createMatrix keeps that buffer when the size already matches
+
+        // M*N multiplies N*K
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(dataEigen, data, false, eigenVectors, false)); // both transpose flags are false
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::KLT_applyEigen(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // hoNDArray overload: dataEigen = data * eigenVectors, with data read as a 2D array.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false on a size mismatch, a failed gemm call or a caught exception.
+KLT_applyEigen(const hoNDArray<T>& data, hoNDArray<T>& dataEigen, const hoMatrix<T>& eigenVectors) // data: [M N] (first two dimensions), eigenVectors: [N K], dataEigen: [M K]
+{
+    try
+    {
+        size_t M = data.get_size(0);
+        size_t N = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.rows()==N); // inner dimensions of the product must agree
+
+        size_t K = eigenVectors.cols(); // number of eigenvectors projected onto
+
+        dataEigen.create(M, K); // NOTE(review): appyKLCoilCompressionCoeff passes an array built on another array's buffer -- confirm create() keeps that buffer when the size already matches
+
+        hoNDArray<T> eigenVec(eigenVectors.get_dimensions(), const_cast<T*>(eigenVectors.begin())); // hoNDArray built on the eigenvector storage so that both gemm operands are hoNDArrays
+
+        // M*N multiplies N*K
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(dataEigen, data, false, eigenVec, false)); // both transpose flags are false
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::KLT_applyEigen(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Counts the trailing eigenvalues whose magnitude is at least thres times that of the last one.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false only if an exception is caught.
+KLT_numberOfKeptModes(const hoMatrix<T>& eigenValues, double thres, long long& numOfModesKept) // eigenValues: [M 1]; thres: relative cut-off, <= 0 keeps every mode
+{
+    try
+    {
+        const long long M = (long long)eigenValues.rows(); // signed on purpose: M-2 below must not wrap around when M < 2
+
+        if ( thres <= 0 || M < 2 ) // nothing to prune: no threshold given, or fewer than two eigenvalues (0 or 1 modes are reported as-is)
+        {
+            numOfModesKept = M;
+            return true;
+        }
+        const double cutoff = thres*std::abs(eigenValues(M-1,0)); // loop-invariant: the last eigenvalue is the reference magnitude
+        long long m;
+        for ( m=M-2; m>=0; m-- ) // walk towards index 0 and stop at the first eigenvalue below the cut-off
+        {
+            if ( std::abs(eigenValues(m,0)) < cutoff )
+            {
+                break;
+            }
+        }
+
+        numOfModesKept = M - m -1; // indices m+1 .. M-1 passed the test; m == -1 means every mode is kept
+
+        if ( numOfModesKept <= 0 ) // defensive fallback kept from the original implementation
+        {
+            GADGET_WARN_MSG("KLT_numberOfKeptModes(...) - numOfModesKept <= 0 : " << thres);
+            GADGET_WARN_MSG("KLT_numberOfKeptModes(...) - keep all modes : " << M);
+            numOfModesKept = M;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::KLT_numberOfKeptModes(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Extracts the last numOfModesKept columns of the eigenvector matrix.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if a checked helper call fails or an exception is caught.
+pruneEigenVectorMatrix(const hoMatrix<T>& eigenVectors, long long numOfModesKept, hoMatrix<T>& eigenVectorsPruned) // eigenVectors: [M N]; eigenVectorsPruned: [M numOfModesKept]
+{
+    try
+    {
+        size_t M = eigenVectors.rows();
+        size_t N = eigenVectors.cols();
+
+        if ( numOfModesKept<=0 || numOfModesKept>(long long)N ) // out-of-range request: warn, hand back the full matrix and still report success
+        {
+            GADGET_WARN_MSG("gtPlusISMRMRDReconUtil<T>::pruneEigenVectorMatrix(...) - numOfModesKept<=0 || numOfModesKept>N : " << numOfModesKept);
+            eigenVectorsPruned = eigenVectors;
+            return true;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(eigenVectorsPruned.createMatrix(M, numOfModesKept));
+        GADGET_CHECK_RETURN_FALSE(eigenVectors.subMatrix(eigenVectorsPruned, 0, M-1, N-numOfModesKept, N-1)); // rows 0 .. M-1, columns N-numOfModesKept .. N-1 (bounds presumably inclusive -- confirm against hoMatrix::subMatrix)
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::pruneEigenVectorMatrix(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Runs KLT_eigenAnalysis on 'data' viewed as a [samples CHA] matrix.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if data has fewer than 3 dimensions, a checked helper fails or an exception is caught.
+computeKLTCoeff(const hoNDArray<T>& data, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim) // isChaLastDim: channel is the last dimension; otherwise it is dimension 2
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        // A is the [samples CHA] matrix handed to the eigen-analysis; every branch builds it on channel-last memory
+        hoMatrix<T> A;
+
+        if ( isChaLastDim )
+        {
+            size_t CHA = data.get_size(NDim-1);
+            size_t N = data.get_number_of_elements()/CHA; // all other dimensions collapsed into rows
+
+            GADGET_CHECK_RETURN_FALSE(A.createMatrix(N, CHA, const_cast<T*>(data.begin()))); // built directly on the caller's buffer, hence the const_cast
+            GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+        }
+        else
+        {
+            size_t RO = data.get_size(0);
+            size_t E1 = data.get_size(1);
+            size_t CHA = data.get_size(2);
+
+            if ( NDim == 3 ) // [RO E1 CHA]: channel is already the last dimension
+            {
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1, CHA, const_cast<T*>(data.begin())));
+                GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+            }
+            else if ( NDim == 4 ) // [RO E1 CHA N]: swap the last two dimensions into a temporary
+            {
+                size_t N = data.get_size(3);
+                hoNDArray<T> dataP(RO, E1, N, CHA);
+                GADGET_CHECK_RETURN_FALSE(permuteLastTwoDimensions(data, dataP));
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1*N, CHA, dataP.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+            }
+            else if ( NDim >= 5 ) // general case: exchange dimension 2 (channel) with the last dimension
+            {
+                std::vector<size_t> dimOrder(NDim);
+                size_t l;
+                for ( l=0; l<NDim; l++ )
+                {
+                    dimOrder[l] = l;
+                }
+                dimOrder[2] = NDim-1;
+                dimOrder[NDim-1] = 2;
+
+                // permute() delivers the permuted array through its return value (as used in appyKLCoilCompressionCoeff), so that result is what must be wrapped
+                boost::shared_ptr< hoNDArray<T> > dataP = permute(const_cast< hoNDArray<T>* >(&data), &dimOrder);
+
+                size_t num = data.get_number_of_elements()/CHA;
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(num, CHA, dataP->begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, coeff, eigenValues));
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::computeKLTCoeff(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Threshold variant: computes the KLT of 'data' and keeps the modes selected by KLT_numberOfKeptModes.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if data has fewer than 3 dimensions, a checked helper fails or an exception is caught.
+computeKLCoilCompressionCoeff(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim) // coeff receives the pruned eigenvector matrix
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        hoMatrix<T> eigenVectors; // full eigenvector matrix before pruning
+        GADGET_CHECK_RETURN_FALSE(computeKLTCoeff(data, eigenVectors, eigenValues, isChaLastDim));
+
+        long long numOfModesKept;
+        GADGET_CHECK_RETURN_FALSE(KLT_numberOfKeptModes(eigenValues, thres, numOfModesKept)); // thres is passed through unchanged
+        GADGET_CHECK_RETURN_FALSE(pruneEigenVectorMatrix(eigenVectors, numOfModesKept, coeff));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompressionCoeff(thres) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Fixed-count variant: computes the KLT of 'data' and keeps numOfModesKept modes.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if data has fewer than 3 dimensions, a checked helper fails or an exception is caught.
+computeKLCoilCompressionCoeff(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, bool isChaLastDim) // coeff receives the pruned eigenvector matrix
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        hoMatrix<T> eigenVectors; // full eigenvector matrix before pruning
+        GADGET_CHECK_RETURN_FALSE(computeKLTCoeff(data, eigenVectors, eigenValues, isChaLastDim));
+        GADGET_CHECK_RETURN_FALSE(pruneEigenVectorMatrix(eigenVectors, numOfModesKept, coeff)); // an out-of-range count makes pruneEigenVectorMatrix return the full matrix
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompressionCoeff(numOfModesKept) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Threshold variant: computes the compression coefficients and applies them to the same data.
+inline bool gtPlusISMRMRDReconUtil<T>:: // Returns false if either step fails or an exception is caught.
+computeKLCoilCompression(const hoNDArray<T>& data, double thres, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim) // dataEigen receives the compressed data
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(computeKLCoilCompressionCoeff(data, thres, coeff, eigenValues, isChaLastDim)); // step 1: coefficients
+        GADGET_CHECK_RETURN_FALSE(appyKLCoilCompressionCoeff(data, coeff, dataEigen, isChaLastDim)); // step 2: apply them
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompression(thres) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Fixed-count variant: computes the compression coefficients and applies them to the same data.
+inline bool gtPlusISMRMRDReconUtil<T>:: // Returns false if either step fails or an exception is caught.
+computeKLCoilCompression(const hoNDArray<T>& data, int numOfModesKept, hoMatrix<T>& coeff, hoMatrix<T>& eigenValues, hoNDArray<T>& dataEigen, bool isChaLastDim) // dataEigen receives the compressed data
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(computeKLCoilCompressionCoeff(data, numOfModesKept, coeff, eigenValues, isChaLastDim)); // step 1: coefficients
+        GADGET_CHECK_RETURN_FALSE(appyKLCoilCompressionCoeff(data, coeff, dataEigen, isChaLastDim)); // step 2: apply them
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::computeKLCoilCompression(numOfModesKept) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Applies one coefficient matrix to 'data' along its channel dimension: dataEigen = data * coeff.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if data has fewer than 3 dimensions, a checked helper fails or an exception is caught.
+appyKLCoilCompressionCoeff(const hoNDArray<T>& data, const hoMatrix<T>& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim) // coeff: [CHA dstCHA]; isChaLastDim: channel is the last dimension, otherwise dimension 2
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+
+        size_t dstCHA = coeff.cols(); // channel count of the output
+
+        // D = A * V
+        hoMatrix<T> A;
+        hoMatrix<T> D;
+
+        if ( isChaLastDim )
+        {
+            size_t CHA = data.get_size(NDim-1);
+            size_t N = data.get_number_of_elements()/CHA; // all other dimensions collapsed into rows
+
+            hoNDArray<T> A_tmp(N, CHA, const_cast<T*>(data.begin())); // 2D array built on the input buffer
+            // GADGET_CHECK_RETURN_FALSE(A.createMatrix(CHA, N, const_cast<T*>(data.begin())));
+
+            std::vector<size_t> dimEigen(*dim);
+            dimEigen[NDim-1] = dstCHA; // same shape as the input except for the channel count
+            dataEigen.create(&dimEigen);
+
+            hoNDArray<T> D_tmp(N, dstCHA, dataEigen.begin()); // 2D array built on the output buffer
+            // GADGET_CHECK_RETURN_FALSE(D.createMatrix(dstCHA, N, dataEigen.begin()));
+
+            GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A_tmp, D_tmp, coeff)); // hoNDArray overload
+        }
+        else
+        {
+            size_t RO = data.get_size(0);
+            size_t E1 = data.get_size(1);
+            size_t CHA = data.get_size(2);
+
+            if ( NDim == 3 ) // [RO E1 CHA]: channel is already last, no permutation needed
+            {
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1, CHA, const_cast<T*>(data.begin())));
+
+                dataEigen.create(RO, E1, dstCHA);
+                GADGET_CHECK_RETURN_FALSE(D.createMatrix(RO*E1, dstCHA, dataEigen.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A, D, coeff));
+            }
+            else if ( NDim == 4 ) // [RO E1 CHA N]: work on a copy with the last two dimensions swapped
+            {
+                size_t N = data.get_size(3);
+                hoNDArray<T> dataP(RO, E1, N, CHA);
+                GADGET_CHECK_RETURN_FALSE(permuteLastTwoDimensions(data, dataP));
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(RO*E1*N, CHA, dataP.begin()));
+
+                hoNDArray<T> dataEigenP(RO, E1, N, dstCHA); // result in the permuted layout
+                GADGET_CHECK_RETURN_FALSE(D.createMatrix(RO*E1*N, dstCHA, dataEigenP.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A, D, coeff));
+
+                GADGET_CHECK_RETURN_FALSE(permuteLastTwoDimensions(dataEigenP, dataEigen)); // swap back into the caller's layout
+            }
+            else if ( NDim >= 5 ) // general case: exchange dimension 2 (channel) with the last dimension
+            {
+                std::vector<size_t> dimOrder(NDim);
+                size_t l;
+                for ( l=0; l<NDim; l++ )
+                {
+                    dimOrder[l] = l;
+                }
+                dimOrder[2] = NDim-1;
+                dimOrder[NDim-1] = 2;
+
+                boost::shared_ptr< hoNDArray<T> > dataP = permute(const_cast< hoNDArray<T>* >(&data), &dimOrder); // permuted array is taken from the return value
+
+                size_t num = data.get_number_of_elements()/CHA;
+                GADGET_CHECK_RETURN_FALSE(A.createMatrix(num, CHA, dataP->begin()));
+
+                boost::shared_ptr< std::vector<size_t> > dimP = dataP->get_dimensions();
+                (*dimP)[NDim-1] = dstCHA; // permuted shape with the compressed channel count
+
+                dataEigen.create(dimP);
+                GADGET_CHECK_RETURN_FALSE(D.createMatrix(num, dstCHA, dataEigen.begin()));
+
+                GADGET_CHECK_RETURN_FALSE(KLT_applyEigen(A, D, coeff));
+
+                dataP = permute(&dataEigen, &dimOrder); // the same swap applied again puts the channel back at dimension 2
+                dataEigen =  *dataP;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::appyKLCoilCompressionCoeff(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Applies a separate coefficient matrix to every slab along the last dimension of 'data'.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false on a failed check, a failed per-slab call or a caught exception.
+applyKLCoilCompressionCoeff(const hoNDArray<T>& data, const std::vector<hoMatrix<T> >& coeff, hoNDArray<T>& dataEigen, bool isChaLastDim) // coeff[n] is used for index n of the last dimension
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3); // each slab has NDim-1 dimensions and appyKLCoilCompressionCoeff itself requires >=3, so NDim==3 input fails further down
+
+        GADGET_CHECK_RETURN_FALSE(coeff.size()>=data.get_size(NDim-1)); // one coefficient matrix per slab is required
+
+        size_t LastDim = coeff.size();
+        size_t dstCHA = coeff[0].cols(); // NOTE(review): coeff[0] is read without an explicit emptiness check
+
+        size_t n;
+        for ( n=1; n<LastDim; n++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(coeff[n].cols()==dstCHA); // all matrices must produce the same channel count
+        }
+
+        size_t LastDimData = data.get_size(NDim-1);
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        long long N = data.get_number_of_elements()/LastDimData; // elements per input slab
+
+        std::vector<size_t> dimEigen(*dim);
+
+        if ( isChaLastDim )
+        {
+            dimEigen[NDim-2] = dstCHA; // channel is the last dimension of each slab, i.e. the second-to-last of data
+        }
+        else
+        {
+            dimEigen[2] = dstCHA;
+        }
+
+        dataEigen.create(&dimEigen);
+        long long eigenN = dataEigen.get_number_of_elements()/LastDimData; // elements per output slab
+
+        std::vector<size_t> dimLastDim(NDim-1); // shape of one slab: data's dimensions without the last one
+        for ( n=0; n<NDim-1; n++ )
+        {
+            dimLastDim[n] = (*dim)[n];
+        }
+
+        hoNDArray<T> dataEigenLastDim; // per-slab result, reused across iterations
+        for ( n=0; n<LastDimData; n++ )
+        {
+            hoNDArray<T> dataLastDim(&dimLastDim, const_cast<T*>(data.begin()+n*N)); // array built on slab n of the input buffer
+            GADGET_CHECK_RETURN_FALSE(appyKLCoilCompressionCoeff(dataLastDim, coeff[n], dataEigenLastDim, isChaLastDim));
+            memcpy(dataEigen.begin()+n*eigenN, dataEigenLastDim.begin(), dataEigenLastDim.get_number_of_bytes()); // copy the slab result into its place in the output
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::applyKLCoilCompressionCoeff(std::vector<hoMatrix<T> >& coeff) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // KL filter along the last dimension: data is projected onto the subspace spanned by numOfModesKept eigenvectors.
+bool gtPlusISMRMRDReconUtil<T>::computeKLFilter(const hoNDArray<T>& data, size_t numOfModesKept, hoNDArray<T>& dataKLF) // returns false if a checked helper fails or an exception is caught
+{
+    try
+    {
+        if ( !dataKLF.dimensions_equal(&data) ) // the assignment is used to give dataKLF the shape of data; its content is overwritten below
+        {
+            dataKLF = data;
+        }
+
+        size_t NDim = data.get_number_of_dimensions();
+        size_t M = data.get_size(NDim-1); // length of the filtered (last) dimension
+        size_t N = data.get_number_of_elements()/M; // all other dimensions collapsed into rows
+
+        if ( numOfModesKept > M ) numOfModesKept = M; // clamp, so that M-numOfModesKept below cannot wrap around
+
+        hoMatrix<T> A(N, M, const_cast<T*>(data.begin())); // matrix built on the input buffer
+
+        hoMatrix<T> eigenVectors, eigenValues;
+        GADGET_CHECK_RETURN_FALSE(KLT_eigenAnalysis(A, eigenVectors, eigenValues)); // eigenVectors: [M M]
+
+        hoMatrix<T> E(eigenVectors);
+        size_t r, c;
+        for ( c=0; c<M-numOfModesKept; c++ ) // zero exactly the M-numOfModesKept leading columns; the trailing columns are the kept modes, as in pruneEigenVectorMatrix
+        {
+            for ( r=0; r<M; r++ )
+            {
+                E(r, c) = T(0);
+            }
+        }
+
+        hoMatrix<T> ET(eigenVectors);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::conjugatetrans(eigenVectors, ET)); // ET = eigenVectors^H
+
+        hoMatrix<T> EET(M, M);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(EET, E, false, ET, false)); // E * E^H restricted to the kept columns: the projector onto the kept modes
+
+        hoMatrix<T> R(N, M, dataKLF.begin()); // NOTE(review): R and A share memory if the caller passes the same array for data and dataKLF -- confirm callers never do
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(R, A, false, EET, false)); // filtered data is written straight into dataKLF
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::computeKLFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// zero-padding resize
+// ------------------------------------------------------------------------
+template <typename T> // Computes the index range that srcSize samples occupy inside an axis of dstSize samples.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns true unless an exception is caught.
+zpadRange(size_t srcSize, size_t dstSize, size_t& start, size_t& end) // start/end: first and last (inclusive) index of the source block on the destination axis
+{
+    try
+    {
+        if ( srcSize >= dstSize ) // no padding possible: report the full source range
+        {
+            start = 0;
+            end = srcSize-1; // NOTE(review): wraps around for srcSize == 0
+            return true;
+        }
+
+        //unsigned srcCenterInd = srcSize/2;
+        //unsigned dstCenterInd = dstSize/2;
+
+        start = (dstSize/2) - (srcSize/2); // places source index srcSize/2 on destination index dstSize/2
+        end = srcSize + start -1;
+
+        //start = std::floor((double)dstSize/2.0)+1+std::ceil(-1.0 * (double)srcSize/2.0)-1;
+        //end = std::floor((double)dstSize/2.0)+std::ceil(srcSize/2.0)-1;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::zpadRange(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Pads the first two dimensions of 'data' up to sizeX x sizeY, placing the source block at the range given by zpadRange.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if a target size is smaller than the source, a helper fails or an exception is caught.
+zeropad2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataPadded) // dataPadded: same dimensions as data except [sizeX sizeY] in front
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO); // this function only grows the array
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        if ( RO==sizeX && E1==sizeY ) // nothing to pad: plain copy
+        {
+            dataPadded = data;
+            return true;
+        }
+
+        size_t sRO, eRO, sE1, eE1; // start/end (inclusive) of the source block inside the padded axes
+        GADGET_CHECK_RETURN_FALSE(zpadRange(RO, sizeX, sRO, eRO));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(E1, sizeY, sE1, eE1));
+
+        boost::shared_ptr< std::vector<size_t> > dimPadded = data.get_dimensions();
+        (*dimPadded)[0] = sizeX;
+        (*dimPadded)[1] = sizeY;
+        dataPadded.create(dimPadded);
+        Gadgetron::clear(&dataPadded); // presumably zero-fills the output so that only the source block needs copying
+
+        size_t num = data.get_number_of_elements()/(RO*E1); // number of 2D planes
+
+        long long n; // signed loop index for the OpenMP loop below
+
+        const T* pData = data.begin();
+        T* pDataPadded = dataPadded.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(num, sE1, eE1, sRO, RO, E1, pData, pDataPadded, sizeX, sizeY)
+        for ( n=0; n<(long long)num; n++ ) // each iteration writes only to its own plane n of the output
+        {
+            for ( size_t y=sE1; y<=eE1; y++ )
+            {
+                memcpy(pDataPadded+n*sizeX*sizeY+y*sizeX+sRO, pData+n*RO*E1+(y-sE1)*RO, sizeof(T)*RO); // copy one source row of RO samples to padded row y, offset sRO
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::zeropad2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Pads the first three dimensions of 'data' up to sizeX x sizeY x sizeZ, placing the source block at the ranges given by zpadRange.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if a target size is smaller than the source, a helper fails or an exception is caught.
+zeropad3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataPadded) // dataPadded: same dimensions as data except [sizeX sizeY sizeZ] in front
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO); // this function only grows the array
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ ) // nothing to pad: plain copy
+        {
+            dataPadded = data;
+            return true;
+        }
+
+        size_t sRO, eRO, sE1, eE1, sE2, eE2; // start/end (inclusive) of the source block inside the padded axes
+        GADGET_CHECK_RETURN_FALSE(zpadRange(RO, sizeX, sRO, eRO));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(E1, sizeY, sE1, eE1));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(E2, sizeZ, sE2, eE2));
+
+        boost::shared_ptr< std::vector<size_t> > dimPadded = data.get_dimensions();
+        (*dimPadded)[0] = sizeX;
+        (*dimPadded)[1] = sizeY;
+        (*dimPadded)[2] = sizeZ;
+        dataPadded.create(dimPadded);
+        Gadgetron::clear(&dataPadded); // presumably zero-fills the output so that only the source block needs copying
+
+        size_t num = data.get_number_of_elements()/(RO*E1*E2); // number of 3D volumes
+
+        long long n; // signed loop index for the OpenMP loop below
+
+        const T* pData = data.begin();
+        T* pDataPadded = dataPadded.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(num, sE2, eE2, sE1, eE1, sRO, RO, E1, E2, pData, pDataPadded, sizeX, sizeY, sizeZ)
+        for ( n=0; n<(long long)num; n++ ) // each iteration writes only to its own volume n of the output
+        {
+            T* pDst = pDataPadded+n*sizeX*sizeY*sizeZ; // start of padded volume n
+            T* pSrc = const_cast<T*>(pData)+n*RO*E1*E2; // start of source volume n (only read from)
+
+            long long z;
+            // #pragma omp parallel for default(none) private(z) shared(pDst, pSrc, sE2, eE2, sE1, eE1, sRO, RO, E1, E2, sizeX, sizeY, sizeZ) num_threads(2)
+            for ( z=sE2; z<=eE2; z++ )
+            {
+                long long o1 = z*sizeX*sizeY + sRO; // destination offset of slice z, already shifted by sRO
+                long long o2 = (z-sE2)*RO*E1; // source offset of the matching slice
+                for ( size_t y=sE1; y<=eE1; y++ )
+                {
+                    memcpy(pDst+o1+y*sizeX, pSrc+o2+(y-sE1)*RO, sizeof(T)*RO); // copy one source row of RO samples
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::zeropad3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Like zeropad3D, but writes into a caller-provided dataPadded and neither allocates nor clears it.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false on a failed size check, a failed helper or a caught exception.
+zeropad3DNoPresetZeros(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataPadded) // dataPadded must already be [sizeX sizeY sizeZ] with dimensions 3 and 4 matching data
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+        size_t srcCHA = data.get_size(3); // sizes of dimensions 3 and 4, only used for the consistency checks below
+        size_t dstCHA = data.get_size(4);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO); // this function only grows the array
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ ) // nothing to pad: plain copy
+        {
+            dataPadded = data;
+            return true;
+        }
+
+        size_t sRO, eRO, sE1, eE1, sE2, eE2; // start/end (inclusive) of the source block inside the padded axes
+        GADGET_CHECK_RETURN_FALSE(zpadRange(RO, sizeX, sRO, eRO));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(E1, sizeY, sE1, eE1));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(E2, sizeZ, sE2, eE2));
+
+        GADGET_CHECK_RETURN_FALSE(dataPadded.get_size(0)==sizeX); // the output is not created here, so its shape is verified instead
+        GADGET_CHECK_RETURN_FALSE(dataPadded.get_size(1)==sizeY);
+        GADGET_CHECK_RETURN_FALSE(dataPadded.get_size(2)==sizeZ);
+        GADGET_CHECK_RETURN_FALSE(dataPadded.get_size(3)==srcCHA);
+        GADGET_CHECK_RETURN_FALSE(dataPadded.get_size(4)==dstCHA);
+
+        size_t num = data.get_number_of_elements()/(RO*E1*E2); // number of 3D volumes
+
+        long long n; // signed loop index for the OpenMP loop below
+
+        const T* pData = data.begin();
+        T* pDataPadded = dataPadded.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(num, sE2, eE2, sE1, eE1, sRO, RO, E1, E2, pData, pDataPadded, sizeX, sizeY, sizeZ)
+        for ( n=0; n<(long long)num; n++ ) // each iteration writes only to its own volume n of the output
+        {
+            T* pDst = pDataPadded+n*sizeX*sizeY*sizeZ; // start of padded volume n
+            T* pSrc = const_cast<T*>(pData)+n*RO*E1*E2; // start of source volume n (only read from)
+
+            long long z;
+            //#pragma omp parallel for default(none) private(z) shared(pDst, pSrc, sE2, eE2, sE1, eE1, sRO, RO, E1, E2, sizeX, sizeY, sizeZ) num_threads(2)
+            for ( z=sE2; z<=eE2; z++ )
+            {
+                long long o1 = z*sizeX*sizeY + sRO; // destination offset of slice z, already shifted by sRO
+                long long o2 = (z-sE2)*RO*E1; // source offset of the matching slice
+                for ( size_t y=sE1; y<=eE1; y++ )
+                {
+                    memcpy(pDst+o1+y*sizeX, pSrc+o2+(y-sE1)*RO, sizeof(T)*RO); // only the source block is written; the surrounding elements of dataPadded keep their previous values
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::zeropad3DNoPresetZeros(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Inverse of zeropad2D: crops the first two dimensions of 'data' to sizeX x sizeY using the range given by zpadRange.
+bool gtPlusISMRMRDReconUtil<T>:: // Returns false if a target size is larger than the source, a helper fails or an exception is caught.
+cutpad2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataCut) // dataCut: same dimensions as data except [sizeX sizeY] in front
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX<=RO); // this function only shrinks the array
+        GADGET_CHECK_RETURN_FALSE(sizeY<=E1);
+
+        if ( RO==sizeX && E1==sizeY ) // nothing to cut: plain copy
+        {
+            dataCut = data;
+            return true;
+        }
+
+        size_t sRO, eRO, sE1, eE1; // start/end (inclusive) of the kept block inside the source axes
+        GADGET_CHECK_RETURN_FALSE(zpadRange(sizeX, RO, sRO, eRO)); // arguments swapped relative to zeropad2D: the cut size plays the role of the source size
+        GADGET_CHECK_RETURN_FALSE(zpadRange(sizeY, E1, sE1, eE1));
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        (*dim)[0] = sizeX;
+        (*dim)[1] = sizeY;
+        dataCut.create(dim);
+
+        size_t num = data.get_number_of_elements()/(RO*E1); // number of 2D planes
+
+        long long n; // signed loop index for the OpenMP loop below
+
+        const T* pData = data.begin();
+        T* pDataCut = dataCut.begin();
+
+        #pragma omp parallel for default(none) private(n) shared(num, sE1, eE1, sRO, RO, E1, pData, pDataCut, sizeX, sizeY)
+        for ( n=0; n<(long long)num; n++ ) // each iteration writes only to its own plane n of the output
+        {
+            for ( size_t y=sE1; y<=eE1; y++ )
+            {
+                memcpy(pDataCut+n*sizeX*sizeY+(y-sE1)*sizeX, pData+n*RO*E1+y*RO+sRO, sizeof(T)*sizeX); // copy sizeX samples of source row y, starting at column sRO
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::cutpad2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Copy a sizeX-by-sizeY-by-sizeZ window out of the first three dimensions of data.
+///
+/// The window position along each axis is whatever zpadRange() reports; for
+/// every 3D volume, planes sE2..eE2 and rows sE1..eE1 are copied, each row
+/// taking sizeX samples starting at column sRO.
+/// @param data     input array (dimensions 0, 1, 2 are RO, E1, E2)
+/// @param sizeX    output length along dimension 0, must not exceed RO
+/// @param sizeY    output length along dimension 1, must not exceed E1
+/// @param sizeZ    output length along dimension 2, must not exceed E2
+/// @param dataCut  receives the window; a plain copy of data if the sizes already match
+/// @return true on success; false if a check fails or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+cutpad3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataCut)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX<=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY<=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ<=E2);
+
+        // requested size equals the input size: nothing to cut
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ )
+        {
+            dataCut = data;
+            return true;
+        }
+
+        size_t sRO, eRO, sE1, eE1, sE2, eE2;
+        GADGET_CHECK_RETURN_FALSE(zpadRange(sizeX, RO, sRO, eRO));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(sizeY, E1, sE1, eE1));
+        GADGET_CHECK_RETURN_FALSE(zpadRange(sizeZ, E2, sE2, eE2));
+
+        // output keeps every trailing dimension of the input
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        (*dim)[0] = sizeX;
+        (*dim)[1] = sizeY;
+        (*dim)[2] = sizeZ;
+        dataCut.create(dim);
+
+        const T* pData = data.begin();
+        T* pDataCut = dataCut.begin();
+
+        // number of 3D volumes stacked in the input
+        size_t num = data.get_number_of_elements()/(RO*E1*E2);
+
+        long long n;
+
+        #pragma omp parallel for default(none) private(n) shared(num, sE2, eE2, sE1, eE1, sRO, RO, E1, E2, pData, pDataCut, sizeX, sizeY, sizeZ)
+        for ( n=0; n<(long long)num; n++ )
+        {
+            // first kept column of this volume in the source, and start of the volume in the destination
+            const T* pVolIn = pData + n*RO*E1*E2 + sRO;
+            T* pVolOut = pDataCut + n*sizeX*sizeY*sizeZ;
+
+            for ( size_t z=sE2; z<=eE2; z++ )
+            {
+                const T* pPlaneIn = pVolIn + z*RO*E1;
+                T* pPlaneOut = pVolOut + (z-sE2)*sizeX*sizeY;
+
+                for ( size_t y=sE1; y<=eE1; y++ )
+                {
+                    memcpy(pPlaneOut+(y-sE1)*sizeX, pPlaneIn+y*RO, sizeof(T)*sizeX);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::cutpad3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// kspace filter
+// ------------------------------------------------------------------------
+/// Form the separable 2D filter fxy(x, y) = fx(x) * fy(y).
+/// @param fx   1D filter along the first dimension; its dimension-0 size sets the row length
+/// @param fy   1D filter along the second dimension; its dimension-0 size sets the row count
+/// @param fxy  output, (re)created as a [size(fx), size(fy)] array
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute2DFilterFromTwo1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, hoNDArray<T>& fxy)
+{
+    try
+    {
+        size_t lenX = fx.get_size(0);
+        size_t lenY = fy.get_size(0);
+
+        fxy.create(lenX, lenY);
+
+        // fill the output sequentially; x is the fastest-varying index
+        T* pOut = fxy.begin();
+        for ( size_t row=0; row<lenY; row++ )
+        {
+            for ( size_t col=0; col<lenX; col++ )
+            {
+                *pOut = fx(col) * fy(row);
+                ++pOut;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::compute2DFilterFromTwo1D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Form the separable 2D filter fxy(x, y) = fx(x) * fy(y) from two float filters,
+/// storing each product as a GT_Complex8.
+/// @param fx   1D float filter; its dimension-0 size sets the row length
+/// @param fy   1D float filter; its dimension-0 size sets the row count
+/// @param fxy  output, (re)created as a [size(fx), size(fy)] array
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute2DFilterFromTwo1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, hoNDArray<GT_Complex8>& fxy)
+{
+    try
+    {
+        size_t lenX = fx.get_size(0);
+        size_t lenY = fy.get_size(0);
+
+        fxy.create(lenX, lenY);
+
+        // fill the output sequentially; x is the fastest-varying index
+        GT_Complex8* pOut = fxy.begin();
+        for ( size_t row=0; row<lenY; row++ )
+        {
+            for ( size_t col=0; col<lenX; col++ )
+            {
+                *pOut = GT_Complex8(fx(col) * fy(row));
+                ++pOut;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::compute2DFilterFromTwo1D(float) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Form the separable 2D filter fxy(x, y) = fx(x) * fy(y) from two double filters,
+/// storing each product as a GT_Complex16.
+/// @param fx   1D double filter; its dimension-0 size sets the row length
+/// @param fy   1D double filter; its dimension-0 size sets the row count
+/// @param fxy  output, (re)created as a [size(fx), size(fy)] array
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute2DFilterFromTwo1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, hoNDArray<GT_Complex16>& fxy)
+{
+    try
+    {
+        size_t lenX = fx.get_size(0);
+        size_t lenY = fy.get_size(0);
+
+        fxy.create(lenX, lenY);
+
+        // fill the output sequentially; x is the fastest-varying index
+        GT_Complex16* pOut = fxy.begin();
+        for ( size_t row=0; row<lenY; row++ )
+        {
+            for ( size_t col=0; col<lenX; col++ )
+            {
+                *pOut = GT_Complex16(fx(col) * fy(row));
+                ++pOut;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::compute2DFilterFromTwo1D(double) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Form the separable 3D filter fxyz(x, y, z) = fx(x) * fz(z) * fy(y).
+/// @param fx    1D filter along the first dimension
+/// @param fy    1D filter along the second dimension
+/// @param fz    1D filter along the third dimension
+/// @param fxyz  output, (re)created as a [size(fx), size(fy), size(fz)] array
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute3DFilterFromThree1D(const hoNDArray<T>& fx, const hoNDArray<T>& fy, const hoNDArray<T>& fz, hoNDArray<T>& fxyz)
+{
+    try
+    {
+        size_t lenX = fx.get_size(0);
+        size_t lenY = fy.get_size(0);
+        size_t lenZ = fz.get_size(0);
+
+        fxyz.create(lenX, lenY, lenZ);
+        T* pOut = fxyz.begin();
+
+        const T* px = fx.begin();
+        const T* py = fy.begin();
+        const T* pz = fz.begin();
+
+        // the output is written sequentially; x is the fastest-varying index
+        for ( size_t k=0; k<lenZ; k++ )
+        {
+            const T wz = pz[k];
+            for ( size_t j=0; j<lenY; j++ )
+            {
+                const T wy = py[j];
+                for ( size_t i=0; i<lenX; i++ )
+                {
+                    *pOut = (px[i]*wz*wy);
+                    ++pOut;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::compute3DFilterFromThree1D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Form the separable 3D filter fxyz(x, y, z) = fx(x) * fy(y) * fz(z) from three
+/// float filters, storing each product as a GT_Complex8.
+/// @param fx    1D float filter along the first dimension
+/// @param fy    1D float filter along the second dimension
+/// @param fz    1D float filter along the third dimension
+/// @param fxyz  output, (re)created as a [size(fx), size(fy), size(fz)] array
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute3DFilterFromThree1D(const hoNDArray<float>& fx, const hoNDArray<float>& fy, const hoNDArray<float>& fz, hoNDArray<GT_Complex8>& fxyz)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+        size_t E2 = fz.get_size(0);
+
+        fxyz.create(RO, E1, E2);
+        GT_Complex8* pFxyz = fxyz.begin();
+
+        size_t x, y, z;
+
+        for ( z=0; z<E2; z++ )
+        {
+            for ( y=0; y<E1; y++ )
+            {
+                for ( x=0; x<RO; x++ )
+                {
+                    // linear offset of (x, y, z): the plane stride is RO*E1, so z must be
+                    // multiplied by it (adding it instead overlaps planes and overruns the buffer)
+                    pFxyz[z*RO*E1+y*RO+x] = GT_Complex8(fx(x)*fy(y)*fz(z));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::compute3DFilterFromThree1D(float) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Form the separable 3D filter fxyz(x, y, z) = fx(x) * fy(y) * fz(z) from three
+/// double filters, storing each product as a GT_Complex16.
+/// @param fx    1D double filter along the first dimension
+/// @param fy    1D double filter along the second dimension
+/// @param fz    1D double filter along the third dimension
+/// @param fxyz  output, (re)created as a [size(fx), size(fy), size(fz)] array
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+compute3DFilterFromThree1D(const hoNDArray<double>& fx, const hoNDArray<double>& fy, const hoNDArray<double>& fz, hoNDArray<GT_Complex16>& fxyz)
+{
+    try
+    {
+        size_t RO = fx.get_size(0);
+        size_t E1 = fy.get_size(0);
+        size_t E2 = fz.get_size(0);
+
+        fxyz.create(RO, E1, E2);
+        GT_Complex16* pFxyz = fxyz.begin();
+
+        size_t x, y, z;
+
+        for ( z=0; z<E2; z++ )
+        {
+            for ( y=0; y<E1; y++ )
+            {
+                for ( x=0; x<RO; x++ )
+                {
+                    // linear offset of (x, y, z): the plane stride is RO*E1, so z must be
+                    // multiplied by it (adding it instead overlaps planes and overruns the buffer)
+                    pFxyz[z*RO*E1+y*RO+x] = GT_Complex16(fx(x)*fy(y)*fz(z));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::compute3DFilterFromThree1D(double) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply the filter fRO to data in place.
+///
+/// The work is delegated to Gadgetron::multipleMultiply (presumably an
+/// element-wise product repeated over the remaining dimensions of data -- confirm).
+/// @param data  array to filter; also receives the result
+/// @param fRO   filter whose element count must equal data.get_size(0)
+/// @return true on success; false if the size check or the multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterRO(hoNDArray<T>& data, const hoNDArray<T>& fRO)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        // data is passed as both the input and the output argument
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fRO, data, data));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterRO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply the filter fRO to data and write the result to dataFiltered.
+///
+/// The work is delegated to Gadgetron::multipleMultiply (presumably an
+/// element-wise product repeated over the remaining dimensions of data -- confirm).
+/// @param data          input array, left unmodified by this function
+/// @param fRO           filter whose element count must equal data.get_size(0)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if the size check or the multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterRO(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fRO, data, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterRO(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply a ready-made 2D filter fROE1 to data and write the result to dataFiltered.
+///
+/// The work is delegated to Gadgetron::multipleMultiply (presumably an
+/// element-wise product repeated over the remaining dimensions of data -- confirm).
+/// @param data          input array
+/// @param fROE1         filter whose element count must equal data.get_size(0)*data.get_size(1)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if the size check or the multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fROE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        // only the total element count is compared, not the individual filter dimensions
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)*data.get_size(1)==fROE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fROE1, data, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data with the separable 2D filter built from fRO and fE1.
+/// @param data          input array
+/// @param fRO           1D filter; element count must equal data.get_size(0)
+/// @param fE1           1D filter; element count must equal data.get_size(1)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+
+        // stop if the 2D filter could not be built rather than multiplying with a partial one
+        hoNDArray<T> fxy;
+        GADGET_CHECK_RETURN_FALSE(compute2DFilterFromTwo1D(fRO, fE1, fxy));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fxy, data, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 1 (E1) only; dimension 0 gets an all-ones filter.
+/// @param data          input array
+/// @param fE1           1D filter; element count must equal data.get_size(1)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if the size check, the filter construction or the
+///         multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterE1(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+
+        // unit filter along RO so that only E1 is weighted
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        // stop if the 2D filter could not be built rather than multiplying with a partial one
+        hoNDArray<T> fxy;
+        GADGET_CHECK_RETURN_FALSE(compute2DFilterFromTwo1D(fRO, fE1, fxy));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fxy, data, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 4 (E2) only; dimensions 0 and 1 get all-ones filters.
+/// @param data          input array, with E2 stored in dimension 4
+/// @param fE2           1D filter; element count must equal data.get_size(4)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if the size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        // unit filters along RO and E1 so that only E2 is weighted
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 0 (RO) and dimension 4 (E2); dimension 1 gets an all-ones filter.
+/// @param data          input array, with E2 stored in dimension 4
+/// @param fRO           1D filter; element count must equal data.get_size(0)
+/// @param fE2           1D filter; element count must equal data.get_size(4)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        // unit filter along E1 so that only RO and E2 are weighted
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 1 (E1) and dimension 4 (E2); dimension 0 gets an all-ones filter.
+/// @param data          input array, with E2 stored in dimension 4
+/// @param fE1           1D filter; element count must equal data.get_size(1)
+/// @param fE2           1D filter; element count must equal data.get_size(4)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        // unit filter along RO so that only E1 and E2 are weighted
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply a ready-made 3D filter over dimensions 0, 1 and 4 (RO, E1, E2) of data.
+///
+/// When dimensions 2 and 3 both have size 1 the filter is multiplied in directly.
+/// Otherwise dimension 4 is permuted next to dimensions 0 and 1, the permuted
+/// copy is filtered, and the result is permuted back to the input ordering.
+/// @param data          input array, with E2 stored in dimension 4
+/// @param fROE1E2       3D filter
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a check or the multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        if ( data.get_size(2)==1 && data.get_size(3)==1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fROE1E2, data, dataFiltered));
+        }
+        else
+        {
+            size_t NDim = data.get_number_of_dimensions();
+
+            // the permutations below address entries 0..4 of the order vector
+            GADGET_CHECK_RETURN_FALSE(NDim>=5);
+
+            std::vector<size_t> order(NDim, 1);
+
+            size_t ii;
+            for ( ii=0; ii<NDim; ii++ )
+            {
+                order[ii] = ii;
+            }
+
+            // forward permutation: bring dimension 4 to position 2
+            order[0] = 0;
+            order[1] = 1;
+            order[2] = 4;
+            order[3] = 2;
+            order[4] = 3;
+
+            boost::shared_ptr< hoNDArray<T> > data_permuted = Gadgetron::permute(const_cast<hoNDArray<T>*>(&data), &order);
+
+            // filter the permuted copy; multiplying the unpermuted input here would
+            // make the back-permutation below scramble the result
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fROE1E2, *data_permuted, dataFiltered));
+
+            // inverse permutation: restore the original dimension order
+            order[0] = 0;
+            order[1] = 1;
+            order[2] = 3;
+            order[3] = 4;
+            order[4] = 2;
+
+            data_permuted = Gadgetron::permute(&dataFiltered, &order);
+            dataFiltered = *data_permuted;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data over dimensions 0, 1 and 4 with the separable 3D filter built from fRO, fE1 and fE2.
+/// @param data          input array, with E2 stored in dimension 4
+/// @param fRO           1D filter; element count must equal data.get_size(0)
+/// @param fE1           1D filter; element count must equal data.get_size(1)
+/// @param fE2           1D filter; element count must equal data.get_size(4)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspacefilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(4)==fE2.get_number_of_elements());
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspacefilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspacefilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 2 (E2) only; dimensions 0 and 1 get all-ones filters.
+/// @param data          input array, with E2 stored in dimension 2
+/// @param fE2           1D filter; element count must equal data.get_size(2)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if the size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterE2(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        // unit filters along RO and E1 so that only E2 is weighted
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 0 (RO) and dimension 2 (E2); dimension 1 gets an all-ones filter.
+/// @param data          input array, with E2 stored in dimension 2
+/// @param fRO           1D filter; element count must equal data.get_size(0)
+/// @param fE2           1D filter; element count must equal data.get_size(2)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterROE2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        // unit filter along E1 so that only RO and E2 are weighted
+        hoNDArray<T> fE1(data.get_size(1));
+        fE1.fill(T(1.0));
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterROE2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data along dimension 1 (E1) and dimension 2 (E2); dimension 0 gets an all-ones filter.
+/// @param data          input array, with E2 stored in dimension 2
+/// @param fE1           1D filter; element count must equal data.get_size(1)
+/// @param fE2           1D filter; element count must equal data.get_size(2)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        // unit filter along RO so that only E1 and E2 are weighted
+        hoNDArray<T> fRO(data.get_size(0));
+        fRO.fill(T(1.0));
+
+        // stop if the 3D filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply a ready-made 3D filter over the first three dimensions of data.
+///
+/// The work is delegated to Gadgetron::multipleMultiply (presumably an
+/// element-wise product repeated over the remaining dimensions of data -- confirm).
+/// @param data          input array
+/// @param fROE1E2       3D filter; its first three dimension sizes must match those of data
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check or the multiply fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fROE1E2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        // the filter must match the data along each of the first three dimensions
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fROE1E2.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fROE1E2.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fROE1E2.get_size(2));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(fROE1E2, data, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Filter data over its first three dimensions with the separable 3D filter built from fRO, fE1 and fE2.
+/// @param data          input array, with E2 stored in dimension 2
+/// @param fRO           1D filter; element count must equal data.get_size(0)
+/// @param fE1           1D filter; element count must equal data.get_size(1)
+/// @param fE2           1D filter; element count must equal data.get_size(2)
+/// @param dataFiltered  receives the filtered data
+/// @return true on success; false if a size check, the filter construction or the
+///         filtering fails, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+kspace3DfilterROE1E2(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        // compute3DFilterFromThree1D() creates fxyz itself, so no pre-allocation is needed;
+        // stop if the filter could not be built rather than filtering with a partial one
+        hoNDArray<T> fxyz;
+        GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(fRO, fE1, fE2, fxyz));
+
+        GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(data, fxyz, dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::kspace3DfilterROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// compute kspace filters
+// ------------------------------------------------------------------------
+
+/// Build a symmetric 1D k-space filter of length len, scaled so that the mean of
+/// its squared samples is one.
+///
+/// For even len the first sample is forced to zero and the remaining len-1
+/// samples carry the symmetric window.
+/// @param len         number of filter points; 0 returns true without touching filter
+/// @param filter      output array, (re)created with len elements
+/// @param filterType  ISMRMRD_FILTER_GAUSSIAN, ISMRMRD_FILTER_TAPERED_HANNING or
+///                    ISMRMRD_FILTER_HANNING; any other value yields an all-ones filter
+/// @param sigma       Gaussian only: samples are exp(-sigma^2/2 * x^2) with x spread over [-1, 1]
+/// @param width       tapered Hanning only: number of taper points at each edge;
+///                    0 or a value >= len is replaced by 1
+/// @return true on success, false if an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+generateSymmetricFilter(size_t len, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width)
+{
+    try
+    {
+        if ( len == 0 ) return true;
+
+        filter.create(len);
+        Gadgetron::fill(&filter, T(1.0));
+
+        if ( width==0 || width>=len ) width = 1;
+
+        size_t ii;
+        switch (filterType)
+        {
+            case ISMRMRD_FILTER_GAUSSIAN:
+                {
+                    double r = -1.0*sigma*sigma/2;
+
+                    // even length: to make sure the zero points match and boundary of filters
+                    // are symmetric, sample 0 is zeroed and the window uses the other len-1 samples
+                    const bool isEven = ( len%2 == 0 );
+                    const size_t numSamples = isEven ? len-1 : len;
+                    const size_t offset = isEven ? 1 : 0;
+
+                    // numSamples points spread over [-1, 1]; with a single point the step
+                    // would be 2/0 (giving NaN), so that point is placed at the centre x=0
+                    const double xStart = ( numSamples>1 ) ? -1.0 : 0.0;
+                    const double stepSize = ( numSamples>1 ) ? 2.0/(numSamples-1) : 0.0;
+
+                    for ( ii=0; ii<numSamples; ii++ )
+                    {
+                        double x = xStart + ii*stepSize;
+                        filter(ii+offset) = T( std::exp(r*(x*x)) );
+                    }
+
+                    if ( isEven )
+                    {
+                        filter(0) = T(0);
+                    }
+                }
+            break;
+
+            case ISMRMRD_FILTER_TAPERED_HANNING:
+                {
+                    // rising half of a Hanning window with width points
+                    hoNDArray<T> w(width);
+
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        w(ii-1) = T(0.5 * ( 1 - std::cos( 2.0*GT_PI*ii/(2*width+1) ) ));
+                    }
+
+                    if ( len%2 == 0 )
+                    {
+                        // taper samples 1..width and mirror them onto the far edge
+                        for ( ii=1; ii<=width; ii++ )
+                        {
+                            filter(ii) = w(ii-1);
+                            filter(len-ii) = filter(ii);
+                        }
+
+                        filter(0) = T(0);
+                    }
+                    else
+                    {
+                        // taper samples 0..width-1 and mirror them onto the far edge
+                        for ( ii=1; ii<=width; ii++ )
+                        {
+                            filter(ii-1) = w(ii-1);
+                            filter(len-ii) = filter(ii-1);
+                        }
+                    }
+                }
+            break;
+
+            // symmetric hanning
+            //  does not include the first and last zero sample
+            case ISMRMRD_FILTER_HANNING:
+                {
+                    if ( len%2 == 0 )
+                    {
+                        size_t N = len-1;
+                        size_t halfLen = (N+1)/2;
+
+                        // rising half, written to samples 1..halfLen
+                        for ( ii=1; ii<=halfLen; ii++ )
+                        {
+                            filter(ii) = T(0.5 * ( 1 - std::cos( 2.0*GT_PI*ii/(N+1) ) ));
+                        }
+
+                        // falling half mirrors the rising half
+                        for ( ii=halfLen; ii<N; ii++ )
+                        {
+                            filter(ii+1) = filter(N-ii);
+                        }
+
+                        filter(0) = T(0);
+                    }
+                    else
+                    {
+                        size_t halfLen = (len+1)/2;
+
+                        // rising half, written to samples 0..halfLen-1
+                        for ( ii=1; ii<=halfLen; ii++ )
+                        {
+                            filter(ii-1) = T(0.5 * ( 1 - std::cos( 2.0*GT_PI*ii/(len+1) ) ));
+                        }
+
+                        // falling half mirrors the rising half
+                        for ( ii=halfLen; ii<len; ii++ )
+                        {
+                            filter(ii) = filter(len-1-ii);
+                        }
+                    }
+                }
+            break;
+
+            default:
+            break;
+        }
+
+        // rescale so that sum(filter^2)/len has unit magnitude
+        T sos = 0.0f;
+        for ( ii=0; ii<len; ii++ )
+        {
+            sos += filter(ii)*filter(ii);
+        }
+        T r = 1.0/std::sqrt( std::abs(sos)/len );
+        for ( ii=0; ii<len; ii++ )
+        {
+            filter(ii) *= r;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::generateSymmetricFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Build a 1D filter for a partially sampled range [start, end] inside len points,
+/// scaled so that the mean of its squared samples has unit magnitude.
+///
+/// The filter is 0 outside [start, end] and 1 inside; edge samples are then
+/// overwritten with the taper w, and with densityComp some samples get an
+/// extra weight (2.0, or 1 + taper).
+/// @param len          number of filter points; 0 returns true without touching filter
+/// @param start        first sampled index; reset to 0 if it exceeds len-1
+/// @param end          last sampled index; clamped to len-1
+/// @param filter       output array, (re)created with len elements
+/// @param filterType   ISMRMRD_FILTER_TAPERED_HANNING gives a Hanning-shaped taper;
+///                     any other value gives an all-ones taper
+/// @param width        number of taper points; 0 or a value >= len is replaced by 1
+/// @param densityComp  if true, weight samples relative to the symmetric region
+///                     reported by findSymmetricSampledRegion() around len/2
+/// @return true on success; false if findSymmetricSampledRegion() fails or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+generateAsymmetricFilter(size_t len, size_t start, size_t end, hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, size_t width, bool densityComp)
+{
+    try
+    {
+        if ( len == 0 ) return true;
+
+        // sanitise the range: out-of-range or inverted limits fall back to the full range
+        if ( start > len-1 ) start = 0;
+        if ( end > len-1 ) end = len-1;
+
+        if ( start > end )
+        {
+            start = 0;
+            end = len-1;
+        }
+
+        // start from a box: 1 inside [start, end], 0 elsewhere
+        filter.create(len);
+        Gadgetron::fill(&filter, T(0.0));
+
+        size_t ii;
+        for ( ii=start; ii<=end; ii++ )
+        {
+            filter(ii) = T(1.0);
+        }
+
+        // NOTE(review): width is only limited against len, not against end-start+1;
+        // indices such as end-ii+1 below can wrap when width > end+1 -- confirm callers never do this
+        if ( width==0 || width>=len ) width = 1;
+
+        // taper weights applied at the edges of the sampled range
+        hoNDArray<T> w(width);
+
+        switch (filterType)
+        {
+            case ISMRMRD_FILTER_TAPERED_HANNING:
+                 {
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        w(ii-1) = T(0.5 * ( 1 - std::cos( 2.0*GT_PI*ii/(2*width+1) ) ));
+                    }
+                }
+            break;
+
+            default:
+                Gadgetron::fill(&w, T(1.0));
+            break;
+        }
+
+        if ( densityComp )
+        {
+            // symmetric part of the sampled range around the centre index len/2
+            size_t startSym(0), endSym(len-1);
+            GADGET_CHECK_RETURN_FALSE(findSymmetricSampledRegion(start, end, len/2, startSym, endSym));
+
+            // fully sampled: taper both outer edges
+            if ( start==0 && end==len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(ii-1) = w(ii-1);
+                    filter(len-ii) = filter(ii-1);
+                }
+            }
+
+            // sampled from index 0, cut before the last index
+            if ( start==0 && end<len-1 )
+            {
+                // samples before the symmetric region get weight 2
+                for ( ii=0; ii<startSym; ii++ )
+                {
+                    filter(ii) = 2.0;
+                }
+
+                // ramp from 2 down to 1 at startSym, and taper at the sampled end
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(ii-1+startSym) = T(1.0) + w(width-ii);
+                    filter(end-ii+1) = w(ii-1);
+                }
+            }
+
+            // sampled up to the last index, cut after index 0
+            if ( start>0 && end==len-1 )
+            {
+                // samples after the symmetric region get weight 2
+                for ( ii=endSym+1; ii<len; ii++ )
+                {
+                    filter(ii) = 2.0;
+                }
+
+                // ramp from 1 up to 2 at endSym, and taper at the sampled start
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(endSym-ii+1) = T(1.0) + w(width-ii);
+                    filter(start+ii-1) = w(ii-1);
+                }
+            }
+
+            // cut on both sides
+            if ( start>0 && end<len-1 )
+            {
+                if ( start==startSym && end==endSym )
+                {
+                    // sampled range is already symmetric: plain taper on both edges
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(start+ii-1) = w(ii-1);
+                        filter(end-ii+1) = w(ii-1);
+                    }
+                }
+                else if ( start==startSym && end>endSym )
+                {
+                    // extra samples beyond endSym get weight 2
+                    for ( ii=endSym+1; ii<=end; ii++ )
+                    {
+                        filter(ii) = 2.0;
+                    }
+
+                    // later assignments overwrite earlier ones where the index ranges overlap
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(end-ii+1) = T(1.0) + w(ii-1);
+                        filter(endSym-ii+1) = w(width-ii);
+                        filter(start+ii-1) = w(ii-1);
+                    }
+                }
+                else if ( start<startSym && end==endSym )
+                {
+                    // extra samples before startSym get weight 2
+                    for ( ii=start; ii<startSym; ii++ )
+                    {
+                        filter(ii) = 2.0;
+                    }
+
+                    // later assignments overwrite earlier ones where the index ranges overlap
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(ii-1+start) = T(1.0) + w(ii-1);
+                        filter(ii-1+startSym) = w(width-ii);
+                        filter(end-ii+1) = w(ii-1);
+                    }
+                }
+                else
+                {
+                    // any other combination: plain taper on both edges
+                    for ( ii=1; ii<=width; ii++ )
+                    {
+                        filter(start+ii-1) = w(ii-1);
+                        filter(end-ii+1) = w(ii-1);
+                    }
+                }
+            }
+        }
+        else
+        {
+            // no density compensation: only taper the edges of the sampled range
+            if ( start==0 && end==len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(ii-1) = w(ii-1);
+                    filter(len-ii) = filter(ii-1);
+                }
+            }
+
+            // only the end is cut: taper the sampled end
+            if ( start==0 && end<len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(end-ii+1) = w(ii-1);
+                }
+            }
+
+            // only the start is cut: taper the sampled start
+            if ( start>0 && end==len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(start+ii-1) = w(ii-1);
+                }
+            }
+
+            // both sides cut: taper both sampled edges
+            if ( start>0 && end<len-1 )
+            {
+                for ( ii=1; ii<=width; ii++ )
+                {
+                    filter(start+ii-1) = w(ii-1);
+                    filter(end-ii+1) = w(ii-1);
+                }
+            }
+        }
+
+        // rescale so that sum(filter^2)/len has unit magnitude
+        T sos = 0.0f;
+        for ( ii=0; ii<len; ii++ )
+        {
+            sos += filter(ii)*filter(ii);
+        }
+        T r = 1.0/std::sqrt( std::abs(sos)/len );
+        for ( ii=0; ii<len; ii++ )
+        {
+            filter(ii) *= r;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::generateAsymmetricFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Build a length-len filter that is a symmetric filter centred on len/2 inside
+/// the sampled range [start, end] and zero everywhere else.
+///
+/// If the range covers all of len, this is just generateSymmetricFilter().
+/// Otherwise the symmetric filter length is set by the distance from the centre
+/// to the cut edge (the shorter distance when both sides are cut).
+/// @param len         number of filter points; values below 2 return true without touching filter
+/// @param start       first sampled index
+/// @param end         last sampled index, at most len-1 and not below start
+/// @param filter      output array, (re)created with len elements
+/// @param filterType  forwarded to generateSymmetricFilter()
+/// @param sigma       forwarded to generateSymmetricFilter()
+/// @param width       forwarded to generateSymmetricFilter()
+/// @return true on success; false if the inputs are invalid, the sampled range does not
+///         allow a filter around the centre, or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+generateSymmetricFilterForRef(size_t len, size_t start, size_t end, 
+        hoNDArray<T>& filter, ISMRMRDKSPACEFILTER filterType, double sigma, size_t width)
+{
+    try
+    {
+        if ( len < 2 ) return true;
+
+        GADGET_CHECK_RETURN_FALSE(start>=0&&end<=len-1&&start<=end);
+
+        if ( start==0 && end==len-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateSymmetricFilter(len, filter, filterType, sigma, width));
+            return true;
+        }
+
+        size_t centerInd = len/2;
+
+        // Length of a filter symmetric about centerInd that reaches each edge.
+        // The subtraction is unsigned, so an edge on the wrong side of the centre
+        // is marked unusable (0) instead of being allowed to wrap around.
+        size_t lenFilterEnd = ( end>=centerInd ) ? 2*(end-centerInd)+1 : 0;
+        size_t lenFilterStart = ( start<=centerInd ) ? 2*(centerInd-start)+1 : 0;
+
+        size_t lenFilter(0); // make a symmetric filter with zero at the center
+
+        if ( start==0 )
+        {
+            // only the end is cut (end<len-1 here)
+            lenFilter = lenFilterEnd;
+        }
+        else if ( end==len-1 )
+        {
+            // only the start is cut
+            lenFilter = lenFilterStart;
+        }
+        else if ( lenFilterStart==0 )
+        {
+            // both cut, but only the end side reaches the centre
+            lenFilter = lenFilterEnd;
+        }
+        else if ( lenFilterEnd==0 )
+        {
+            // both cut, but only the start side reaches the centre
+            lenFilter = lenFilterStart;
+        }
+        else
+        {
+            lenFilter = GT_MIN(lenFilterStart, lenFilterEnd);
+        }
+
+        if ( lenFilter == 0 )
+        {
+            GADGET_ERROR_MSG("Invalid inputs : start - end - len : " << start << " " << end << " " << len);
+        }
+        GADGET_CHECK_RETURN_FALSE(lenFilter>0);
+
+        hoNDArray<T> filterSym(lenFilter);
+        GADGET_CHECK_RETURN_FALSE(generateSymmetricFilter(lenFilter, filterSym, filterType, sigma, width));
+
+        filter.create(len);
+        Gadgetron::clear(&filter);
+
+        // anchor the symmetric filter at start when its length came from the start side,
+        // otherwise right-align it with end
+        bool anchorAtStart = ( start>0 ) && ( end==len-1 || lenFilter==lenFilterStart );
+        size_t offset = anchorAtStart ? start : end-lenFilter+1;
+
+        memcpy(filter.begin()+offset, filterSym.begin(), filterSym.get_number_of_bytes());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::generateSymmetricFilterForRef(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Find the largest index range inside [start, end] that is symmetric about center.
+/// @param start     first sampled index, must not exceed center
+/// @param end       last sampled index, must not be below center
+/// @param center    index the region is centred on
+/// @param startSym  receives center minus the smaller of the two half-widths
+/// @param endSym    receives center plus the smaller of the two half-widths
+/// @return true on success, false if start <= center <= end does not hold
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+findSymmetricSampledRegion(size_t start, size_t end, size_t center, size_t& startSym, size_t& endSym)
+{
+    GADGET_CHECK_RETURN_FALSE(end>=start);
+    GADGET_CHECK_RETURN_FALSE(center>=start);
+    GADGET_CHECK_RETURN_FALSE(end>=center);
+
+    // distance from the centre to each edge of the sampled range
+    const size_t below = center - start;
+    const size_t above = end - center;
+
+    // the symmetric region can only extend as far as the nearer edge
+    const size_t reach = ( below > above ) ? above : below;
+
+    startSym = center - reach;
+    endSym = center + reach;
+
+    return true;
+}
+
+// Computes scalFactor = 1/sqrt( |sum_i filter(i)*filter(i)| / len ), where len is
+// the number of filter elements. An empty filter yields a factor of 1.
+// Always returns true.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::computeFilterSNRUnitScaleFactor(const hoNDArray<T>& filter, T& scalFactor)
+{
+    const size_t numElems = filter.get_number_of_elements();
+    if ( numElems == 0 )
+    {
+        // nothing to normalize
+        scalFactor = T(1.0);
+        return true;
+    }
+
+    // plain (non-conjugated) sum of the squared coefficients
+    T sumSq(0.0);
+    size_t idx = 0;
+    while ( idx < numElems )
+    {
+        sumSq += filter(idx)*filter(idx);
+        ++idx;
+    }
+
+    scalFactor = T(1.0/std::sqrt( std::abs(sumSq)/numElems ));
+
+    return true;
+}
+
+// Detects the range of indices holding non-zero data along RO (dimension 0) and
+// E1 (dimension 1). The magnitude of data is summed over every dimension beyond
+// the first two; an index counts as sampled when its summed magnitude is > 0.
+// If the detected range along a dimension ends up inverted (start > end), the
+// full range [0, size-1] is reported for that dimension.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegion2D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        const size_t numDims = data.get_number_of_dimensions();
+
+        hoNDArray<value_type> mag(data.get_dimensions()), collapsed, profileRO, profileE1;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(data, mag));
+
+        // fold every dimension above the 2nd, one trailing dimension at a time
+        for ( size_t d=2; d<numDims; d++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(mag, collapsed));
+            mag = collapsed;
+        }
+
+        const size_t RO = mag.get_size(0);
+        const size_t E1 = mag.get_size(1);
+
+        // start from an inverted range; sampled indices pull the two ends apart
+        startRO = RO-1;
+        endRO = 0;
+        startE1 = E1-1;
+        endE1 = 0;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver2ndDimension(mag, profileRO));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver1stDimension(mag, profileE1));
+
+        for ( size_t ro=0; ro<RO; ro++ )
+        {
+            if ( !(profileRO(ro) > 0) ) continue;
+
+            if ( startRO > ro ) startRO = ro;
+            if ( endRO < ro ) endRO = ro;
+        }
+
+        for ( size_t e1=0; e1<E1; e1++ )
+        {
+            if ( !(profileE1(e1) > 0) ) continue;
+
+            if ( startE1 > e1 ) startE1 = e1;
+            if ( endE1 < e1 ) endE1 = e1;
+        }
+
+        // still inverted: nothing was detected, report the whole range
+        if ( startRO > endRO )
+        {
+            startRO = 0;
+            endRO = RO-1;
+        }
+
+        if ( startE1 > endE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegion2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Detects the range of indices holding non-zero data along RO (dimension 0),
+// E1 (dimension 1) and E2 (read from dimension index 4).
+// The magnitude of data is first summed over every dimension beyond the 5th.
+// An index counts as sampled when its summed magnitude is > 0. If the detected
+// range along a dimension ends up inverted (start > end), the full range
+// [0, size-1] is reported for that dimension.
+// Returns false if a helper fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegion3D(const hoNDArray<T>& data, size_t& startRO, size_t& endRO, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+
+        // magSum and magSum2 are scratch buffers reused by the reductions below
+        hoNDArray<typename realType<T>::Type> mag(data.get_dimensions()), magSum, magSum2, magSumRO, magSumE1, magSumE2;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(data, mag));
+
+        // fold every dimension above the 5th, one trailing dimension at a time
+        if ( NDim > 5 )
+        {
+            size_t ii;
+            for ( ii=0; ii<NDim-5; ii++ )
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(mag, magSum));
+                mag = magSum;
+            }
+        }
+
+        size_t RO = mag.get_size(0);
+        size_t E1 = mag.get_size(1);
+        size_t E2 = mag.get_size(4);
+
+        // start from inverted ranges; sampled indices pull the two ends apart
+        startRO = RO-1;
+        endRO = 0;
+
+        startE1 = E1-1;
+        endE1 = 0;
+
+        startE2 = E2-1;
+        endE2 = 0;
+
+        size_t ro, e1, e2;
+
+        // first chain: sum away the 5th, 4th and 3rd dimensions, result in magSum2
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver5thDimension(mag, magSum2));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver4thDimension(magSum2, magSum));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver3rdDimension(magSum, magSum2));
+
+        // magSumE1 is indexed by ro below, magSumRO is indexed by e1
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver2ndDimension(magSum2, magSumE1));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver1stDimension(magSum2, magSumRO));
+
+        // second chain: sum away the 4th, 3rd, 2nd and 1st dimensions; magSumE2 is indexed by e2 below
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver4thDimension(mag, magSum2));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver3rdDimension(magSum2, magSum));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver2ndDimension(magSum, magSum2));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver1stDimension(magSum2, magSumE2));
+
+        for ( ro=0; ro<RO; ro++ )
+        {
+            if ( magSumE1(ro) > 0 )
+            {
+                if ( ro < startRO ) startRO = ro;
+                if ( ro > endRO ) endRO = ro;
+            }
+        }
+
+        for ( e1=0; e1<E1; e1++ )
+        {
+            if ( magSumRO(e1) > 0 )
+            {
+                if ( e1 < startE1 ) startE1 = e1;
+                if ( e1 > endE1 ) endE1 = e1;
+            }
+        }
+
+        for ( e2=0; e2<E2; e2++ )
+        {
+            if ( magSumE2(e2) > 0 )
+            {
+                if ( e2 < startE2 ) startE2 = e2;
+                if ( e2 > endE2 ) endE2 = e2;
+            }
+        }
+
+        // still inverted: nothing was detected, report the whole range
+        if ( startRO > endRO )
+        {
+            startRO = 0;
+            endRO = RO-1;
+        }
+
+        if ( startE1 > endE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+
+        if ( startE2 > endE2 )
+        {
+            startE2 = 0;
+            endE2 = E2-1;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegion3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// coil sensitivity
+// ------------------------------------------------------------------------
+// Averages data over its 4th dimension: ave is the sum over that dimension
+// scaled by 1/data.get_size(3).
+// Returns false if a helper fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver4thDimension(data, ave));
+
+        // turn the sum into a mean
+        const double invCount = 1.0/data.get_size(3);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (value_type)(invCount), ave));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace4D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Averages data over its 4th dimension, dividing every E1 line by the number of
+// times that line was sampled (a count of 0 is treated as 1).
+// sampledTimes receives the per-E1 counts produced by detectSampledTimesE1.
+// For inputs with fewer than 4 dimensions, ave is simply a copy of data.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace4D(const hoNDArray<T>& data, hoNDArray<T>& ave, std::vector<size_t>& sampledTimes)
+{
+    try
+    {
+        const size_t numDims = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(numDims>=2);
+
+        if ( numDims < 4 )
+        {
+            // no 4th dimension to average over
+            ave = data;
+            GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1(data, sampledTimes));
+            return true;
+        }
+
+        const size_t RO = data.get_size(0);
+        const size_t E1 = data.get_size(1);
+        const size_t CHA = data.get_size(2);
+        const size_t N = data.get_size(3);
+
+        // 4D array built on data's buffer (presumably a non-owning view - confirm
+        // against the hoNDArray pointer constructor)
+        hoNDArray<T> first4D(RO, E1, CHA, N, const_cast<T*>(data.begin()));
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1(first4D, sampledTimes));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver4thDimension(data, ave));
+
+        // re-insert a singleton 4th dimension when the sum dropped it
+        boost::shared_ptr< std::vector<size_t> > aveDims = ave.get_dimensions();
+        if ( aveDims->size() != numDims )
+        {
+            aveDims->insert(aveDims->begin()+3, 1);
+            ave.reshape(aveDims.get());
+        }
+
+        // [RO E1] weight map holding 1/count, constant along RO
+        hoNDArray<T> weights(RO, E1);
+        T* pWeight = weights.begin();
+        for ( size_t e1=0; e1<E1; e1++ )
+        {
+            double count = sampledTimes[e1];
+            if ( count == 0 ) count = 1;
+
+            const T w = T(1.0/count);
+            for ( size_t ro=0; ro<RO; ro++ )
+            {
+                *pWeight++ = w;
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(multipleMultiply(weights, ave, ave));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace4D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Averages data over its 5th dimension: ave is the sum over that dimension
+// scaled by 1/data.get_size(4).
+// Returns false if a helper fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver5thDimension(data, ave));
+
+        // turn the sum into a mean
+        const double invCount = 1.0/data.get_size(4);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal( (value_type)(invCount), ave));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace5D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Averages data over its 5th dimension, dividing every (E1, E2) position by the
+// number of times it was sampled (a count of 0 is treated as 1).
+// sampledTimes receives the per-(E1, E2) counts produced by detectSampledTimesE1E2.
+// For inputs with fewer than 5 dimensions, ave is simply a copy of data.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+averageKSpace5D(const hoNDArray<T>& data, hoNDArray<T>& ave, hoNDArray<size_t>& sampledTimes)
+{
+    try
+    {
+        const size_t numDims = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(numDims>=3);
+
+        if ( numDims < 5 )
+        {
+            // no 5th dimension to average over
+            ave = data;
+            GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1E2(data, sampledTimes));
+            return true;
+        }
+
+        const size_t RO = data.get_size(0);
+        const size_t E1 = data.get_size(1);
+        const size_t E2 = data.get_size(2);
+        const size_t CHA = data.get_size(3);
+        const size_t N = data.get_size(4);
+
+        // 5D array built on data's buffer (presumably a non-owning view - confirm
+        // against the hoNDArray pointer constructor)
+        hoNDArray<T> first5D(RO, E1, E2, CHA, N, const_cast<T*>(data.begin()));
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1E2(first5D, sampledTimes));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver5thDimension(data, ave));
+
+        // [RO E1 E2] weight map holding 1/count, constant along RO
+        hoNDArray<T> weights(RO, E1, E2);
+        T* pWeight = weights.begin();
+        for ( size_t e2=0; e2<E2; e2++ )
+        {
+            for ( size_t e1=0; e1<E1; e1++ )
+            {
+                double count = sampledTimes(e1+e2*E1);
+                if ( count == 0 ) count = 1;
+
+                const T w = T(1.0/count);
+                for ( size_t ro=0; ro<RO; ro++ )
+                {
+                    *pWeight++ = w;
+                }
+            }
+        }
+
+        GADGET_CHECK_RETURN_FALSE(multipleMultiply(weights, ave, ave));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::averageKSpace5D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Counts, for every E1 line, in how many of the N entries along the 4th
+// dimension the line holds non-zero data. The magnitude of data4D is summed over
+// the 3rd and then the 1st dimension; an (e1, n) entry counts as sampled when
+// that sum is > 0.
+// On success sampledTimes has exactly E1 elements; any previous content is
+// discarded, so the vector can safely be reused across calls.
+// Returns false if the input has fewer than 2 dimensions, a helper fails, or an
+// exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledTimesE1(const hoNDArray<T>& data4D, std::vector<size_t>& sampledTimes)
+{
+    try
+    {
+        size_t NDim = data4D.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = data4D.get_size(0);
+        size_t E1 = data4D.get_size(1);
+        size_t N = data4D.get_size(3);
+
+        hoNDArray<typename realType<T>::Type> mag(data4D.get_dimensions());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(data4D, mag));
+
+        hoNDArray<typename realType<T>::Type> mag3D(RO, E1, 1, N);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver3rdDimension(mag, mag3D));
+
+        hoNDArray<typename realType<T>::Type> mag2D(1, E1, 1, N);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver1stDimension(mag3D, mag2D));
+        typename realType<T>::Type* pMag2D = mag2D.begin();
+
+        // assign (not resize) so that counts left over from an earlier call
+        // are reset to zero instead of being accumulated on top of
+        sampledTimes.assign(E1, 0);
+
+        size_t e1, n;
+        for ( e1=0; e1<E1; e1++ )
+        {
+            for ( n=0; n<N; n++ )
+            {
+                if ( pMag2D[e1+n*E1] > 0 )
+                {
+                    sampledTimes[e1]++;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledTimesE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Detects the first and last E1 line with a non-zero count from
+// detectSampledTimesE1. If the detected range ends up inverted
+// (endE1 < startE1), the full range [0, E1-1] is reported.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegionE1(const hoNDArray<T>& data, size_t& startE1, size_t& endE1)
+{
+    try
+    {
+        std::vector<size_t> counts;
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1(data, counts));
+
+        const size_t E1 = counts.size();
+
+        // start from an inverted range; sampled lines pull the two ends apart
+        startE1 = E1-1;
+        endE1 = 0;
+
+        size_t e1 = 0;
+        while ( e1 < E1 )
+        {
+            if ( counts[e1] > 0 )
+            {
+                if ( endE1 < e1 ) endE1 = e1;
+                if ( startE1 > e1 ) startE1 = e1;
+            }
+
+            ++e1;
+        }
+
+        // still inverted: nothing was detected, report the whole range
+        if ( startE1 > endE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegionE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Counts, for every (E1, E2) position, in how many of the N entries along the
+// 5th dimension the first channel holds non-zero data. For each n, the magnitude
+// of the first [RO E1 E2] block of that entry is summed over RO; the position
+// counts as sampled when that sum is > 0.
+// sampledTimes is re-created as an [E1 E2] array and zeroed before counting.
+// Returns false if the input has fewer than 3 dimensions, a helper fails, or an
+// exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledTimesE1E2(const hoNDArray<T>& data5D, hoNDArray<size_t>& sampledTimes)
+{
+    try
+    {
+        size_t NDim = data5D.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        size_t RO = data5D.get_size(0);
+        size_t E1 = data5D.get_size(1);
+        size_t E2 = data5D.get_size(2);
+        size_t CHA = data5D.get_size(3);
+        size_t N = data5D.get_size(4);
+
+        sampledTimes.create(E1, E2);
+        Gadgetron::clear(sampledTimes);
+        size_t* pTimes = sampledTimes.get_data_ptr();
+
+        hoNDArray<typename realType<T>::Type> mag(RO, E1, E2);
+        hoNDArray<typename realType<T>::Type> mag3D(1, E1, E2);
+
+        // number of samples between two consecutive entries of the 5th dimension
+        const size_t frameSize = RO*E1*E2*CHA;
+        const size_t numE1E2 = E1*E2;
+
+        // The magnitude profile is computed separately for every n, so the
+        // counting loop never indexes past the E1*E2 elements of mag3D.
+        for ( size_t n=0; n<N; n++ )
+        {
+            hoNDArray<T> dataFirstChannel(RO, E1, E2, const_cast<T*>(data5D.begin())+n*frameSize);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(dataFirstChannel, mag));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver1stDimension(mag, mag3D));
+
+            const typename realType<T>::Type* pMag3D = mag3D.begin();
+
+            for ( size_t ii=0; ii<numE1E2; ii++ )
+            {
+                if ( pMag3D[ii] > 0 )
+                {
+                    pTimes[ii]++;
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledTimesE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Detects the bounding range of (E1, E2) positions with a non-zero count from
+// detectSampledTimesE1E2. If the detected range along a dimension ends up
+// inverted (end < start), the full range [0, size-1] is reported for it.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+detectSampledRegionE1E2(const hoNDArray<T>& data, size_t& startE1, size_t& endE1, size_t& startE2, size_t& endE2)
+{
+    try
+    {
+        hoNDArray<size_t> counts;
+        GADGET_CHECK_RETURN_FALSE(detectSampledTimesE1E2(data, counts));
+
+        const size_t E1 = counts.get_size(0);
+        const size_t E2 = counts.get_size(1);
+
+        // start from inverted ranges; sampled positions pull the ends apart
+        startE1 = E1-1;
+        endE1 = 0;
+        startE2 = E2-1;
+        endE2 = 0;
+
+        for ( size_t e2=0; e2<E2; e2++ )
+        {
+            for ( size_t e1=0; e1<E1; e1++ )
+            {
+                if ( !(counts(e1+e2*E1) > 0) ) continue;
+
+                if ( endE1 < e1 ) endE1 = e1;
+                if ( startE1 > e1 ) startE1 = e1;
+
+                if ( endE2 < e2 ) endE2 = e2;
+                if ( startE2 > e2 ) startE2 = e2;
+            }
+        }
+
+        // still inverted: nothing was detected, report the whole range
+        if ( startE1 > endE1 )
+        {
+            startE1 = 0;
+            endE1 = E1-1;
+        }
+
+        if ( startE2 > endE2 )
+        {
+            startE2 = 0;
+            endE2 = E2-1;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::detectSampledRegionE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Copies the E1 lines [startE1, endE1] of every [RO E1] plane from src into dst;
+// all other samples of dst are left untouched.
+// src and dst must agree in RO, E1 and total element count. When the requested
+// E1 range is invalid, the whole of src is assigned to dst and a warning is issued.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startE1, size_t endE1)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_dimensions()>=2);
+
+        const size_t RO = dst.get_size(0);
+        const size_t E1 = dst.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(RO==src.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(E1==src.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        const bool validRange = (startE1<E1) && (endE1<E1) && (startE1<=endE1);
+        if ( !validRange )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongE1(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        // number of [RO E1] planes stacked in the array
+        const size_t numPlanes = dst.get_number_of_elements()/(RO*E1);
+
+        for ( size_t plane=0; plane<numPlanes; plane++ )
+        {
+            size_t line = startE1;
+            while ( line <= endE1 )
+            {
+                // one full readout line per copy
+                const size_t offset = plane*RO*E1 + line*RO;
+                memcpy(dst.begin()+offset, src.begin()+offset, sizeof(T)*RO);
+                ++line;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Copies the rectangle [startRO, endRO] x [startE1, endE1] of every [RO E1]
+// plane from src into dst; all other samples of dst are left untouched.
+// src and dst must agree in RO, E1 and total element count. When the requested
+// RO or E1 range is invalid, the whole of src is assigned to dst and a warning
+// is issued (the function still returns true).
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        // invalid RO range: fall back to a full copy
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        // invalid E1 range: fall back to a full copy
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        // number of [RO E1] planes stacked in the array
+        size_t N = dst.get_number_of_elements()/(RO*E1);
+        const T* pSrc = src.begin();
+        T* pDst = dst.begin();
+
+        // signed loop index for the OpenMP loop below
+        // NOTE(review): presumably because the targeted OpenMP version requires a signed index - confirm
+        long long n;
+
+        // each iteration handles one plane; default(none) forces every shared variable to be listed
+        #pragma omp parallel for default(none) private(n) shared(N, pSrc, pDst, RO, E1, startRO, endRO, startE1, endE1)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            for ( size_t e1=startE1; e1<=endE1; e1++ )
+            {
+                // copy the RO segment [startRO, endRO] of line e1 in plane n
+                size_t offset = n*RO*E1+e1*RO+startRO;
+                memcpy(pDst+offset, pSrc+offset, sizeof(T)*(endRO-startRO+1));
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Copies the box [startRO, endRO] x [startE1, endE1] x [startE2, endE2] of every
+// [RO E1 E2] volume from src into dst; all other samples of dst are left untouched.
+// src and dst must agree in RO, E1, E2 and total element count. When any of the
+// requested ranges is invalid, the whole of src is assigned to dst and a warning
+// is issued (the function still returns true).
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1E2(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, size_t startE1, size_t endE1, size_t startE2, size_t endE2)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        // NOTE(review): only 2 dimensions are required here although dimension 2 (E2) is read below - confirm intent
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+        size_t E2 = dst.get_size(2);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+        size_t E2_src = src.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(E2==E2_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        // invalid RO range: fall back to a full copy
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1E2(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        // invalid E1 range: fall back to a full copy
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1E2(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        // invalid E2 range: fall back to a full copy
+        if ( (startE2>=E2) || (endE2>=E2) || (startE2>endE2) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1E2(...) : (startE2>=E2) || (endE2>=E2) || (startE2>endE2) ... ");
+            return true;
+        }
+
+        // number of [RO E1 E2] volumes stacked in the array
+        size_t N = dst.get_number_of_elements()/(RO*E1*E2);
+        const T* pSrc = src.begin();
+        T* pDst = dst.begin();
+
+        // signed loop index for the OpenMP loop below
+        // NOTE(review): presumably because the targeted OpenMP version requires a signed index - confirm
+        long long n;
+
+        // each iteration handles one volume; default(none) forces every shared variable to be listed
+        #pragma omp parallel for default(none) private(n) shared(N, pSrc, pDst, RO, E1, E2, startRO, endRO, startE1, endE1, startE2, endE2)
+        for ( n=0; n<(long long)N; n++ )
+        {
+            for ( size_t e2=startE2; e2<=endE2; e2++ )
+            {
+                for ( size_t e1=startE1; e1<=endE1; e1++ )
+                {
+                    // copy the RO segment [startRO, endRO] of line (e1, e2) in volume n
+                    size_t offset = n*RO*E1*E2+e2*E1*RO+e1*RO+startRO;
+                    memcpy(pDst+offset, pSrc+offset, sizeof(T)*(endRO-startRO+1));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1E2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Blends src into dst over the region [startRO, endRO] x [startE1, endE1] using
+// filter weights instead of a hard copy: dst = filter(src) + complementFilter(dst).
+// The source filters are built per dimension with generateAsymmetricFilter
+// (tapered Hanning, or no filtering when the dimension is fully covered), scaled
+// so that the value at the middle index is 1, and the destination weights are
+// 1 minus the source weights.
+// transBandRO / transBandE1 are the requested transition band widths; they are
+// reduced below when they would reach across the middle of the dimension.
+// src and dst must agree in RO, E1 and total element count. When the requested
+// RO or E1 range is invalid, the whole of src is assigned to dst and a warning
+// is issued (the function still returns true).
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, 
+        size_t startE1, size_t endE1, size_t transBandRO, size_t transBandE1)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=2);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        // invalid RO range: fall back to a full copy
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1TransitionBand(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        // invalid E1 range: fall back to a full copy
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1TransitionBand(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        // shrink the transition bands (never below 1) until they no longer cross the middle index
+        // NOTE(review): endRO-transBandRO and endE1-transBandE1 are unsigned and wrap when the band exceeds the end index - confirm callers
+        while ( transBandRO>1 && startRO+transBandRO > RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandRO>1 && endRO-transBandRO < RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandE1>1 && startE1+transBandE1 > E1/2 )
+        {
+             transBandE1--;
+        }
+
+        while ( transBandE1>1 && endE1-transBandE1 < E1/2 )
+        {
+             transBandE1--;
+        }
+
+        ISMRMRDKSPACEFILTER filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+        bool densityComp = false;
+
+        hoNDArray<T> filter_src_RO, filter_src_E1;
+
+        // a fully covered dimension gets ISMRMRD_FILTER_NONE, otherwise the tapered Hanning filter
+        if ( startRO==0 && endRO==RO-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, ISMRMRD_FILTER_NONE, transBandRO, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, filterType, transBandRO, densityComp));
+        }
+
+        if ( startE1==0 && endE1==E1-1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, ISMRMRD_FILTER_NONE, transBandE1, densityComp));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, filterType, transBandE1, densityComp));
+        }
+
+        // in this way, the SNR unit scale property is preserved
+        // (each source filter is divided by its value at the middle index)
+        T midValue = filter_src_RO(RO/2);
+        T scalFactor = T(1.0)/midValue;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scalFactor, filter_src_RO));
+
+        midValue = filter_src_E1(E1/2);
+        scalFactor = T(1.0)/midValue;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scalFactor, filter_src_E1));
+
+        // destination weights are the complement of the source weights
+        hoNDArray<T> filter_dst_RO(RO), filter_dst_E1(E1);
+
+        size_t ii;
+        for ( ii=0; ii<RO; ii++ )
+        {
+            filter_dst_RO(ii) = T(1.0) - filter_src_RO(ii);
+        }
+
+        for ( ii=0; ii<E1; ii++ )
+        {
+            filter_dst_E1(ii) = T(1.0) - filter_src_E1(ii);
+        }
+
+        hoNDArray<T> srcFiltered(src), dstFiltered(dst);
+        if ( startRO==0 && endRO==RO-1 )
+        {
+            // RO fully covered: blend along E1 only
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(src, filter_src_E1, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(dst, filter_dst_E1, dstFiltered));
+        }
+        else if ( startE1==0 && endE1==E1-1 )
+        {
+            // E1 fully covered: blend along RO only
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(src, filter_src_RO, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(dst, filter_dst_RO, dstFiltered));
+        }
+        else
+        {
+            // general case: the destination weight is 1 minus the combined 2D source
+            // filter, not the product of the two 1D complements
+            GADGET_CHECK_RETURN_FALSE(kspacefilterROE1(src, filter_src_RO, filter_src_E1, srcFiltered));
+
+            hoNDArray<T> fxy;
+            GADGET_CHECK_RETURN_FALSE(compute2DFilterFromTwo1D(filter_src_RO, filter_src_E1, fxy));
+
+            size_t Nxy = RO*E1;
+            for ( ii=0; ii<Nxy; ii++ )
+            {
+                fxy(ii) = T(1.0) - fxy(ii);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(kspacefilterROE1(dst, fxy, dstFiltered));
+        }
+
+        // dst = filtered source + complementary-filtered destination
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(srcFiltered, dstFiltered, dst));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1TransitionBand(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Blends src into dst over the box [startRO, endRO] x [startE1, endE1] x
+// [startE2, endE2] using filter weights instead of a hard copy:
+// dst = filter(src) + complementFilter(dst).
+// The source filters are built per dimension with generateAsymmetricFilter
+// (tapered Hanning, or no filtering when the dimension is fully covered), scaled
+// so that the value at the middle index is 1, and the destination weights are
+// 1 minus the source weights.
+// transBandRO / transBandE1 / transBandE2 are the requested transition band
+// widths; they are reduced below when they would reach across the middle of the
+// dimension.
+// src and dst must agree in RO, E1, E2 and total element count. When any of the
+// requested ranges is invalid, the whole of src is assigned to dst and a warning
+// is issued (the function still returns true).
+// Returns false if a check or helper fails, or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+copyAlongROE1E2TransitionBand(const hoNDArray<T>& src, hoNDArray<T>& dst, size_t startRO, size_t endRO, 
+                        size_t startE1, size_t endE1, size_t startE2, size_t endE2, 
+                        size_t transBandRO, size_t transBandE1, size_t transBandE2)
+{
+    try
+    {
+        size_t NDim = src.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        size_t RO = dst.get_size(0);
+        size_t E1 = dst.get_size(1);
+        size_t E2 = dst.get_size(2);
+
+        size_t RO_src = src.get_size(0);
+        size_t E1_src = src.get_size(1);
+        size_t E2_src = src.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(RO==RO_src);
+        GADGET_CHECK_RETURN_FALSE(E1==E1_src);
+        GADGET_CHECK_RETURN_FALSE(E2==E2_src);
+        GADGET_CHECK_RETURN_FALSE(src.get_number_of_elements()==dst.get_number_of_elements());
+
+        // invalid RO range: fall back to a full copy
+        if ( (startRO>=RO) || (endRO>=RO) || (startRO>endRO) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1E2TransitionBand(...) : (startRO>=RO) || (endRO>=RO) || (startRO>endRO) ... ");
+            return true;
+        }
+
+        // invalid E1 range: fall back to a full copy
+        if ( (startE1>=E1) || (endE1>=E1) || (startE1>endE1) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1E2TransitionBand(...) : (startE1>=E1) || (endE1>=E1) || (startE1>endE1) ... ");
+            return true;
+        }
+
+        // invalid E2 range: fall back to a full copy
+        if ( (startE2>=E2) || (endE2>=E2) || (startE2>endE2) )
+        {
+            dst = src;
+            GADGET_WARN_MSG("copyAlongROE1E2TransitionBand(...) : (startE2>=E2) || (endE2>=E2) || (startE2>endE2) ... ");
+            return true;
+        }
+
+        // shrink the transition bands (never below 1) until they no longer cross the middle index
+        while ( transBandRO>1 && startRO+transBandRO > RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandRO>1 && endRO-transBandRO < RO/2 )
+        {
+             transBandRO--;
+        }
+
+        while ( transBandE1>1 && startE1+transBandE1 > E1/2 )
+        {
+             transBandE1--;
+        }
+
+        while ( transBandE1>1 && endE1-transBandE1 < E1/2 )
+        {
+             transBandE1--;
+        }
+
+        while ( transBandE2>1 && startE2+transBandE2 > E2/2 )
+        {
+             transBandE2--;
+        }
+
+        while ( transBandE2>1 && endE2-transBandE2 < E2/2 )
+        {
+             transBandE2--;
+        }
+
+        ISMRMRDKSPACEFILTER filterType = ISMRMRD_FILTER_TAPERED_HANNING;
+        bool densityComp = false;
+
+        // a dimension is "full" when the requested range spans all of it
+        const bool fullRO = ( startRO==0 && endRO==RO-1 );
+        const bool fullE1 = ( startE1==0 && endE1==E1-1 );
+        const bool fullE2 = ( startE2==0 && endE2==E2-1 );
+
+        hoNDArray<T> filter_src_RO, filter_src_E1, filter_src_E2;
+
+        // a fully covered dimension gets ISMRMRD_FILTER_NONE, otherwise the tapered Hanning filter
+        GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(RO, startRO, endRO, filter_src_RO, fullRO ? ISMRMRD_FILTER_NONE : filterType, transBandRO, densityComp));
+        GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E1, startE1, endE1, filter_src_E1, fullE1 ? ISMRMRD_FILTER_NONE : filterType, transBandE1, densityComp));
+        GADGET_CHECK_RETURN_FALSE(generateAsymmetricFilter(E2, startE2, endE2, filter_src_E2, fullE2 ? ISMRMRD_FILTER_NONE : filterType, transBandE2, densityComp));
+
+        // in this way, the SNR unit scale property is preserved
+        // (each source filter is divided by its value at the middle index)
+        T midValue = filter_src_RO(RO/2);
+        T scalFactor = T(1.0)/midValue;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scalFactor, filter_src_RO));
+
+        midValue = filter_src_E1(E1/2);
+        scalFactor = T(1.0)/midValue;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scalFactor, filter_src_E1));
+
+        midValue = filter_src_E2(E2/2);
+        scalFactor = T(1.0)/midValue;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scalFactor, filter_src_E2));
+
+        // destination weights are the complement of the source weights
+        hoNDArray<T> filter_dst_RO(RO), filter_dst_E1(E1), filter_dst_E2(E2);
+
+        size_t ii;
+        for ( ii=0; ii<RO; ii++ )
+        {
+            filter_dst_RO(ii) = T(1.0) - filter_src_RO(ii);
+        }
+
+        for ( ii=0; ii<E1; ii++ )
+        {
+            filter_dst_E1(ii) = T(1.0) - filter_src_E1(ii);
+        }
+
+        for ( ii=0; ii<E2; ii++ )
+        {
+            filter_dst_E2(ii) = T(1.0) - filter_src_E2(ii);
+        }
+
+        hoNDArray<T> srcFiltered(src), dstFiltered(dst);
+        if ( fullE1 && fullE2 )
+        {
+            // only RO can be partial: blend along RO with the RO filters
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(src, filter_src_RO, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterRO(dst, filter_dst_RO, dstFiltered));
+        }
+        else if ( fullRO && fullE2 )
+        {
+            // only E1 is partial: blend along E1 with the E1 filters
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(src, filter_src_E1, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspacefilterE1(dst, filter_dst_E1, dstFiltered));
+        }
+        else if ( fullRO && fullE1 )
+        {
+            // only E2 is partial: blend along E2 with the E2 filters
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterE2(src, filter_src_E2, srcFiltered));
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterE2(dst, filter_dst_E2, dstFiltered));
+        }
+        else
+        {
+            // general case: the destination weight is 1 minus the combined 3D source
+            // filter, not the product of the three 1D complements
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(src, filter_src_RO, filter_src_E1, filter_src_E2, srcFiltered));
+
+            hoNDArray<T> fxyz;
+            GADGET_CHECK_RETURN_FALSE(compute3DFilterFromThree1D(filter_src_RO, filter_src_E1, filter_src_E2, fxyz));
+
+            size_t Nxyz = RO*E1*E2;
+            for ( ii=0; ii<Nxyz; ii++ )
+            {
+                fxyz(ii) = T(1.0) - fxyz(ii);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(kspace3DfilterROE1E2(dst, fxyz, dstFiltered));
+        }
+
+        // dst = filtered source + complementary-filtered destination
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(srcFiltered, dstFiltered, dst));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::copyAlongROE1E2TransitionBand(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Returns the textual label of an ISMRMRD dimension enumerator.
+// Any enumerator without an explicit case below yields "DIM_NONE".
+template <typename T> 
+std::string gtPlusISMRMRDReconUtil<T>::getISMRMRDDimName(const ISMRMRDDIM& dim)
+{
+    // start from the fallback label and overwrite it for every known dimension
+    const char* label = "DIM_NONE";
+
+    switch (dim)
+    {
+        case DIM_ReadOut:    label = "DIM_ReadOut";    break;
+        case DIM_Encoding1:  label = "DIM_Encoding1";  break;
+        case DIM_Channel:    label = "DIM_Channel";    break;
+        case DIM_Slice:      label = "DIM_Slice";      break;
+        case DIM_Encoding2:  label = "DIM_Encoding2";  break;
+        case DIM_Contrast:   label = "DIM_Contrast";   break;
+        case DIM_Phase:      label = "DIM_Phase";      break;
+        case DIM_Repetition: label = "DIM_Repetition"; break;
+        case DIM_Set:        label = "DIM_Set";        break;
+        case DIM_Segment:    label = "DIM_Segment";    break;
+        case DIM_Average:    label = "DIM_Average";    break;
+        case DIM_other1:     label = "DIM_other1";     break;
+        case DIM_other2:     label = "DIM_other2";     break;
+        case DIM_other3:     label = "DIM_other3";     break;
+        default:                                       break;
+    }
+
+    return std::string(label);
+}
+
+// Maps a dimension label (e.g. "DIM_ReadOut") to its ISMRMRDDIM enumerator.
+// The comparison is exact; any label not listed below yields DIM_NONE.
+template <typename T> 
+ISMRMRDDIM gtPlusISMRMRDReconUtil<T>::getISMRMRDDimFromName(const std::string& name)
+{
+    static const struct { const char* label; ISMRMRDDIM dim; } knownDims[] =
+    {
+        { "DIM_ReadOut",    DIM_ReadOut    },
+        { "DIM_Encoding1",  DIM_Encoding1  },
+        { "DIM_Channel",    DIM_Channel    },
+        { "DIM_Slice",      DIM_Slice      },
+        { "DIM_Encoding2",  DIM_Encoding2  },
+        { "DIM_Contrast",   DIM_Contrast   },
+        { "DIM_Phase",      DIM_Phase      },
+        { "DIM_Repetition", DIM_Repetition },
+        { "DIM_Set",        DIM_Set        },
+        { "DIM_Segment",    DIM_Segment    },
+        { "DIM_Average",    DIM_Average    },
+        { "DIM_other1",     DIM_other1     },
+        { "DIM_other2",     DIM_other2     },
+        { "DIM_other3",     DIM_other3     }
+    };
+
+    const size_t numOfKnownDims = sizeof(knownDims)/sizeof(knownDims[0]);
+    for ( size_t ii=0; ii<numOfKnownDims; ii++ )
+    {
+        if ( name == knownDims[ii].label ) return knownDims[ii].dim;
+    }
+
+    return DIM_NONE;
+}
+
+// Writes into ind the array axis (0..9) associated with an ISMRMRD dimension.
+// Dimensions without a case below (e.g. DIM_Average, DIM_other1..3) produce ind = -1.
+// The function always returns true, so callers must inspect ind to detect the -1 case.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::getISMRMRDDimIndex(const ISMRMRDDIM& dim, long long& ind)
+{
+    // preset the "no axis" marker; known dimensions overwrite it
+    ind = -1;
+
+    switch (dim)
+    {
+        case Gadgetron::gtPlus::DIM_ReadOut:    ind = 0; break;
+        case Gadgetron::gtPlus::DIM_Encoding1:  ind = 1; break;
+        case Gadgetron::gtPlus::DIM_Channel:    ind = 2; break;
+        case Gadgetron::gtPlus::DIM_Slice:      ind = 3; break;
+        case Gadgetron::gtPlus::DIM_Encoding2:  ind = 4; break;
+        case Gadgetron::gtPlus::DIM_Contrast:   ind = 5; break;
+        case Gadgetron::gtPlus::DIM_Phase:      ind = 6; break;
+        case Gadgetron::gtPlus::DIM_Repetition: ind = 7; break;
+        case Gadgetron::gtPlus::DIM_Set:        ind = 8; break;
+        case Gadgetron::gtPlus::DIM_Segment:    ind = 9; break;
+        default:                                         break;
+    }
+
+    return true;
+}
+
+// Searches dimStartingIndexes for the first record whose .first equals dim.
+// Returns true when such a record exists, false otherwise.
+// NOTE(review): ind is taken by value, so the assignment below is not visible to the
+// caller; only the boolean result is observable. Confirm against the declaration
+// whether a reference was intended.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::findDimIndex(const std::vector<DimensionRecordType>& dimStartingIndexes, ISMRMRDDIM dim, size_t ind)
+{
+    for ( size_t n=0; n<dimStartingIndexes.size(); n++ )
+    {
+        const DimensionRecordType& record = dimStartingIndexes[n];
+        if ( record.first != dim ) continue;
+
+        ind = record.second;
+        return true;
+    }
+
+    return false;
+}
+
+// Maps a reconstruction algorithm label to its ISMRMRDALGO enumerator.
+// The comparison is exact; any label not listed below yields ISMRMRD_NONE.
+template <typename T> 
+ISMRMRDALGO gtPlusISMRMRDReconUtil<T>::getISMRMRDReconAlgoFromName(const std::string& name)
+{
+    static const struct { const char* label; ISMRMRDALGO algo; } knownAlgos[] =
+    {
+        { "ISMRMRD_GRAPPA",      ISMRMRD_GRAPPA      },
+        { "ISMRMRD_SENSE",       ISMRMRD_SENSE       },
+        { "ISMRMRD_SPIRIT",      ISMRMRD_SPIRIT      },
+        { "ISMRMRD_L1SPIRIT",    ISMRMRD_L1SPIRIT    },
+        { "ISMRMRD_SOFTSENSE",   ISMRMRD_SOFTSENSE   },
+        { "ISMRMRD_L1SOFTSENSE", ISMRMRD_L1SOFTSENSE }
+    };
+
+    const size_t numOfKnownAlgos = sizeof(knownAlgos)/sizeof(knownAlgos[0]);
+    for ( size_t ii=0; ii<numOfKnownAlgos; ii++ )
+    {
+        if ( name == knownAlgos[ii].label ) return knownAlgos[ii].algo;
+    }
+
+    return ISMRMRD_NONE;
+}
+
+// Maps a coil map algorithm label to its ISMRMRDCOILMAPALGO enumerator.
+// Only "ISMRMRD_SOUHEIL_ITER" selects the iterative variant; every other
+// label (including unknown ones) yields ISMRMRD_SOUHEIL.
+template <typename T> 
+ISMRMRDCOILMAPALGO gtPlusISMRMRDReconUtil<T>::getISMRMRDCoilMapAlgoFromName(const std::string& name)
+{
+    ISMRMRDCOILMAPALGO algo = ISMRMRD_SOUHEIL;
+
+    if ( name == "ISMRMRD_SOUHEIL_ITER" )
+    {
+        algo = ISMRMRD_SOUHEIL_ITER;
+    }
+
+    return algo;
+}
+
+// Maps a partial Fourier algorithm label to its ISMRMRDPFALGO enumerator.
+// Labels that match none of the names below yield ISMRMRD_PF_ZEROFILLING.
+template <typename T> 
+ISMRMRDPFALGO gtPlusISMRMRDReconUtil<T>::getISMRMRDPartialFourierReconAlgoFromName(const std::string& name)
+{
+    ISMRMRDPFALGO algo = ISMRMRD_PF_ZEROFILLING;
+
+    if ( name == "ISMRMRD_PF_HOMODYNE" )
+    {
+        algo = ISMRMRD_PF_HOMODYNE;
+    }
+    else if ( name == "ISMRMRD_PF_FENGHUANG" )
+    {
+        algo = ISMRMRD_PF_FENGHUANG;
+    }
+    else if ( name == "ISMRMRD_PF_ZEROFILLING_FILTER" )
+    {
+        algo = ISMRMRD_PF_ZEROFILLING_FILTER;
+    }
+    else if ( name == "ISMRMRD_PF_POCS" )
+    {
+        algo = ISMRMRD_PF_POCS;
+    }
+
+    return algo;
+}
+
+// Maps a k-space filter label to its ISMRMRDKSPACEFILTER enumerator.
+// "ISMRMRD_FILTER_NONE" and every unrecognised label yield ISMRMRD_FILTER_NONE.
+template <typename T> 
+ISMRMRDKSPACEFILTER gtPlusISMRMRDReconUtil<T>::
+getISMRMRDKSpaceFilterFromName(const std::string& name)
+{
+    ISMRMRDKSPACEFILTER filter = ISMRMRD_FILTER_NONE;
+
+    if ( name == "ISMRMRD_FILTER_GAUSSIAN" )
+    {
+        filter = ISMRMRD_FILTER_GAUSSIAN;
+    }
+    else if ( name == "ISMRMRD_FILTER_HANNING" )
+    {
+        filter = ISMRMRD_FILTER_HANNING;
+    }
+    else if ( name == "ISMRMRD_FILTER_TUKEY" )
+    {
+        filter = ISMRMRD_FILTER_TUKEY;
+    }
+    else if ( name == "ISMRMRD_FILTER_TAPERED_HANNING" )
+    {
+        filter = ISMRMRD_FILTER_TAPERED_HANNING;
+    }
+
+    return filter;
+}
+
+// Crops x along a single ISMRMRD dimension and stores the result in r.
+//   dim       : dimension to crop along; must map to an array axis of x
+//   value     : index along that dimension; must be smaller than the size of the axis
+//   lessEqual : true  -> keep indices [0, value] along dim
+//               false -> keep only index value along dim
+// All other axes are kept in full. Returns false when a check fails, when the
+// crop itself fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim, size_t value, bool lessEqual)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        long long dimInd;
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim, dimInd));
+
+        // getISMRMRDDimIndex reports dimensions without an axis as -1 while still
+        // returning true; reject those (and axes beyond the rank of x) before the
+        // index is used to address dimX / crop_offset / crop_size
+        GADGET_CHECK_RETURN_FALSE(dimInd>=0 && (size_t)dimInd<dimX->size());
+
+        GADGET_CHECK_RETURN_FALSE(value<(*dimX)[dimInd]);
+
+        // offsets start at zero for all ten axes
+        std::vector<size_t> crop_offset(10, 0);
+
+        // copy the sizes of the axes x actually has; axes beyond the rank of x are
+        // treated as singleton (size 1) instead of reading past the end of dimX
+        // NOTE(review): assumes cropUpTo10DArray accepts size 1 for absent axes - confirm
+        std::vector<size_t> crop_size(10, 1);
+        const size_t numOfDims = dimX->size();
+        for ( size_t d=0; d<10 && d<numOfDims; d++ )
+        {
+            crop_size[d] = (*dimX)[d];
+        }
+
+        if ( lessEqual )
+        {
+            crop_size[dimInd] = value+1;
+        }
+        else
+        {
+            crop_offset[dimInd] = value;
+            crop_size[dimInd] = 1;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo10DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForDim(dim, value) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Crops x along two ISMRMRD dimensions and stores the result in r.
+//   dim1/value1, dim2/value2 : dimensions to crop along and the index along each;
+//                              each dimension must map to an array axis of x and
+//                              each value must be smaller than the size of its axis
+//   lessEqual : true  -> keep indices [0, value1] along dim1 and [0, value2] along dim2
+//               false -> keep only index value1 along dim1 and value2 along dim2
+// All other axes are kept in full. Returns false when a check fails, when the
+// crop itself fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForDim(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2, bool lessEqual)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        long long dimInd1, dimInd2;
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim1, dimInd1));
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim2, dimInd2));
+
+        // getISMRMRDDimIndex reports dimensions without an axis as -1 while still
+        // returning true; reject those (and axes beyond the rank of x) before the
+        // indexes are used to address dimX / crop_offset / crop_size
+        GADGET_CHECK_RETURN_FALSE(dimInd1>=0 && (size_t)dimInd1<dimX->size());
+        GADGET_CHECK_RETURN_FALSE(dimInd2>=0 && (size_t)dimInd2<dimX->size());
+
+        GADGET_CHECK_RETURN_FALSE(value1<(*dimX)[dimInd1]);
+        GADGET_CHECK_RETURN_FALSE(value2<(*dimX)[dimInd2]);
+
+        // offsets start at zero for all ten axes
+        std::vector<size_t> crop_offset(10, 0);
+
+        // copy the sizes of the axes x actually has; axes beyond the rank of x are
+        // treated as singleton (size 1) instead of reading past the end of dimX
+        // NOTE(review): assumes cropUpTo10DArray accepts size 1 for absent axes - confirm
+        std::vector<size_t> crop_size(10, 1);
+        const size_t numOfDims = dimX->size();
+        for ( size_t d=0; d<10 && d<numOfDims; d++ )
+        {
+            crop_size[d] = (*dimX)[d];
+        }
+
+        if ( lessEqual )
+        {
+            crop_size[dimInd1] = value1+1;
+            crop_size[dimInd2] = value2+1;
+        }
+        else
+        {
+            crop_offset[dimInd1] = value1;
+            crop_size[dimInd1] = 1;
+
+            crop_offset[dimInd2] = value2;
+            crop_size[dimInd2] = 1;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo10DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForDim(dim1, value1, dim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Crops x so that indices [0, value1] are kept along dim1 and only index value2
+// is kept along dim2; the result is stored in r. All other axes are kept in full.
+// Each dimension must map to an array axis of x and each value must be smaller
+// than the size of its axis. Returns false when a check fails, when the crop
+// itself fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForDim1LessEqualDim2Equal(const hoNDArray<T>& x, hoNDArray<T>& r, ISMRMRDDIM& dim1, size_t value1, ISMRMRDDIM& dim2, size_t value2)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        long long dimInd1, dimInd2;
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim1, dimInd1));
+        GADGET_CHECK_RETURN_FALSE(getISMRMRDDimIndex(dim2, dimInd2));
+
+        // getISMRMRDDimIndex reports dimensions without an axis as -1 while still
+        // returning true; reject those (and axes beyond the rank of x) before the
+        // indexes are used to address dimX / crop_offset / crop_size
+        GADGET_CHECK_RETURN_FALSE(dimInd1>=0 && (size_t)dimInd1<dimX->size());
+        GADGET_CHECK_RETURN_FALSE(dimInd2>=0 && (size_t)dimInd2<dimX->size());
+
+        GADGET_CHECK_RETURN_FALSE(value1<(*dimX)[dimInd1]);
+        GADGET_CHECK_RETURN_FALSE(value2<(*dimX)[dimInd2]);
+
+        // offsets start at zero for all ten axes
+        std::vector<size_t> crop_offset(10, 0);
+
+        // copy the sizes of the axes x actually has; axes beyond the rank of x are
+        // treated as singleton (size 1) instead of reading past the end of dimX
+        // NOTE(review): assumes cropUpTo10DArray accepts size 1 for absent axes - confirm
+        std::vector<size_t> crop_size(10, 1);
+        const size_t numOfDims = dimX->size();
+        for ( size_t d=0; d<10 && d<numOfDims; d++ )
+        {
+            crop_size[d] = (*dimX)[d];
+        }
+
+        // dim1 : everything up to and including value1
+        crop_size[dimInd1] = value1+1;
+
+        // dim2 : exactly value2
+        crop_offset[dimInd2] = value2;
+        crop_size[dimInd2] = 1;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo10DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForDim1LessEqualDim2Equal(dim1, value1, dim2, value2) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Crops x so that no axis extends beyond the maximal encoding counters in maxIdx;
+// the result is stored in r. An axis is shrunk to (counter+1) only when that is
+// smaller than its current size; the readout (0) and channel (2) axes are never cropped.
+// Returns false when the crop fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+extractSubArrayForMaxEncodingCounters(const hoNDArray<T>& x, hoNDArray<T>& r, const ISMRMRD::EncodingCounters& maxIdx)
+{
+    try
+    {
+        boost::shared_ptr< std::vector<size_t> > dimX = x.get_dimensions();
+
+        // offsets start at zero for all ten axes
+        std::vector<size_t> crop_offset(10, 0);
+
+        // [RO E1 Cha Slice E2 Contrast Phase Rep Set Seg]
+        // copy the sizes of the axes x actually has; axes beyond the rank of x are
+        // treated as singleton (size 1) instead of reading past the end of dimX
+        // NOTE(review): assumes cropUpTo10DArray accepts size 1 for absent axes - confirm
+        std::vector<size_t> crop_size(10, 1);
+        const size_t numOfDims = dimX->size();
+        for ( size_t d=0; d<10 && d<numOfDims; d++ )
+        {
+            crop_size[d] = (*dimX)[d];
+        }
+
+        // compare (counter+1) against the size rather than counter against (size-1):
+        // the latter wraps around for a zero-sized axis and would enlarge it
+        size_t limit;
+        limit = (size_t)maxIdx.kspace_encode_step_1 + 1; if ( limit < crop_size[1] ) crop_size[1] = limit;
+        limit = (size_t)maxIdx.slice                + 1; if ( limit < crop_size[3] ) crop_size[3] = limit;
+        limit = (size_t)maxIdx.kspace_encode_step_2 + 1; if ( limit < crop_size[4] ) crop_size[4] = limit;
+        limit = (size_t)maxIdx.contrast             + 1; if ( limit < crop_size[5] ) crop_size[5] = limit;
+        limit = (size_t)maxIdx.phase                + 1; if ( limit < crop_size[6] ) crop_size[6] = limit;
+        limit = (size_t)maxIdx.repetition           + 1; if ( limit < crop_size[7] ) crop_size[7] = limit;
+        limit = (size_t)maxIdx.set                  + 1; if ( limit < crop_size[8] ) crop_size[8] = limit;
+        limit = (size_t)maxIdx.segment              + 1; if ( limit < crop_size[9] ) crop_size[9] = limit;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo10DArray(x, r, crop_offset, crop_size));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::extractSubArrayForMaxEncodingCounters(const hoNDArray<T>& x, hoNDArray<T>& r, const ISMRMRD::EncodingCounters& maxIdx) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Resets every byte of the given acquisition header to zero.
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::clearAcquisitionHeaderISMRMRD(ISMRMRD::AcquisitionHeader& acqHeader)
+{
+    // sizeof on the reference yields the size of the referenced header object
+    memset(&acqHeader, 0, sizeof(acqHeader));
+}
+
+// Compares the geometry fields of two acquisition headers.
+// Returns false as soon as any component of position, patient_table_position,
+// read_dir, phase_dir or slice_dir differs by more than GT_IMAGING_GEOMETRY_DELTA
+// in absolute value; returns true otherwise.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::hasIdenticalGeometryISMRMRD(const ISMRMRD::AcquisitionHeader& acqHeader1, const ISMRMRD::AcquisitionHeader& acqHeader2)
+{
+    // positions
+    for ( long long p=0; p<ISMRMRD_POSITION_LENGTH; p++ )
+    {
+        if ( std::abs(acqHeader1.position[p]-acqHeader2.position[p]) > GT_IMAGING_GEOMETRY_DELTA
+            || std::abs(acqHeader1.patient_table_position[p]-acqHeader2.patient_table_position[p]) > GT_IMAGING_GEOMETRY_DELTA )
+        {
+            return false;
+        }
+    }
+
+    // orientation vectors
+    for ( long long v=0; v<ISMRMRD_DIRECTION_LENGTH; v++ )
+    {
+        if ( std::abs(acqHeader1.read_dir[v]-acqHeader2.read_dir[v]) > GT_IMAGING_GEOMETRY_DELTA
+            || std::abs(acqHeader1.phase_dir[v]-acqHeader2.phase_dir[v]) > GT_IMAGING_GEOMETRY_DELTA
+            || std::abs(acqHeader1.slice_dir[v]-acqHeader2.slice_dir[v]) > GT_IMAGING_GEOMETRY_DELTA )
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Classifies where zeros have to be added for a readout line, by comparing
+// twice the centre column against the number of samples:
+//   0 : no zeros  (2*centre_column == samples)
+//   1 : pre zeros (2*centre_column <  samples)
+//   2 : post zeros (2*centre_column >  samples)
+template <typename T> 
+long long gtPlusISMRMRDReconUtil<T>::addPrePostZeros(size_t centre_column, size_t samples)
+{
+    const size_t doubledCentre = 2*centre_column;
+
+    if ( doubledCentre < samples ) return 1;
+    if ( doubledCentre > samples ) return 2;
+
+    return 0;
+}
+
+// Computes the first and last readout index occupied by the acquired samples,
+// based on the flag returned by addPrePostZeros(centre_column, samples):
+//   flag 1 (pre zeros)        : endRO = 2*(samples-centre_column)-1, startRO = endRO-samples+1
+//   flag 0 or 2 (none / post) : startRO = 0, endRO = samples-1
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::findStartEndRO(size_t centre_column, size_t samples, long long& startRO, long long& endRO)
+{
+    switch ( addPrePostZeros(centre_column, samples) )
+    {
+        case 1:
+            endRO = (long long)2*(samples-centre_column)-1;
+            startRO = (long long)endRO-samples+1;
+            break;
+
+        case 0:
+        case 2:
+            startRO = 0;
+            endRO = (long long)samples-1;
+            break;
+
+        default:
+            break;
+    }
+}
+
+// Computes the first and last readout index of the acquired samples inside a
+// zero-filled line of samples_zerofilled points. With num = samples_zerofilled/2:
+//   centre_column+num <  samples_zerofilled : pre zeros, endRO is the last index and
+//                                             startRO = endRO-(centre_column+num)+1
+//   centre_column+num >  samples_zerofilled : post zeros, the full range [0, last] is used
+//   centre_column == num (and the sum equals samples_zerofilled) : full range [0, last]
+// When centre_column+num == samples_zerofilled but centre_column != num, neither
+// output is written.
+template <typename T> 
+void gtPlusISMRMRDReconUtil<T>::findStartEndROAfterZeroFilling(size_t centre_column, size_t samples_zerofilled, int& startRO, int& endRO)
+{
+    const size_t half = samples_zerofilled/2;
+    const size_t span = centre_column+half;
+    const int lastRO = (int)samples_zerofilled-1;
+
+    if ( span < samples_zerofilled ) // pre zeros
+    {
+        endRO = lastRO;
+        startRO = lastRO-(int)span+1;
+    }
+    else if ( span > samples_zerofilled || centre_column == half ) // post zeros, or centred
+    {
+        startRO = 0;
+        endRO = lastRO;
+    }
+}
+
+#ifdef USE_CUDA
+
+// Distributes the jobs in jobIDs over the available CUDA devices.
+//   jobIDs                      : identifiers of the jobs to schedule
+//   jobSize                     : memory needed by one job; must be > 0
+//   minimalMemoryForValidDevice : devices with less total global memory are ignored
+//   jobSchedule                 : output; one entry per used device, holding the device
+//                                 id and the list of job packages assigned to it
+// Devices are visited round-robin; every visit hands out one package of at most
+// (total device memory / jobSize) consecutive jobs. Devices that cannot hold a single
+// job are not scheduled, so every package contains at least one job.
+// Returns true without touching jobSchedule when there are no jobs or no devices.
+// Returns false when jobSize is zero, the device count cannot be queried, no device
+// qualifies, or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+cudaJobSplitter(const std::vector<unsigned int>& jobIDs, size_t jobSize, size_t minimalMemoryForValidDevice, 
+                std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > >& jobSchedule)
+{
+    try
+    {
+        unsigned int numOfJobs = jobIDs.size();
+        if ( numOfJobs == 0 )
+        {
+            GADGET_WARN_MSG("numOfJobs == 0");
+            return true;
+        }
+
+        // jobSize is used as a divisor below
+        GADGET_CHECK_RETURN_FALSE(jobSize>0);
+
+        // find valid device
+        int numOfDevices(0);
+        GADGET_CHECK_RETURN_FALSE(cudaGetDeviceCount( &numOfDevices )==cudaSuccess);
+
+        if ( numOfDevices == 0 )
+        {
+            GADGET_WARN_MSG("numOfDevices == 0");
+            return true;
+        }
+
+        std::vector<unsigned int> validDevices;
+        std::vector<size_t> validDeviceMem;
+        for ( int d=0; d<numOfDevices; d++ )
+        {
+            size_t totalMem = cudaDeviceManager::Instance()->total_global_mem(d);
+            if ( totalMem >= minimalMemoryForValidDevice )
+            {
+                validDevices.push_back(d);
+                validDeviceMem.push_back(totalMem);
+            }
+        }
+
+        if ( validDevices.empty() )
+        {
+            GADGET_ERROR_MSG("No valid device can be found : " << minimalMemoryForValidDevice);
+            return false;
+        }
+
+        // keep only the devices that can hold at least one job; a capacity of zero
+        // would otherwise produce empty packages or stall the scheduling loop
+        std::vector<unsigned int> usableDevices;
+        std::vector<size_t> maxJobN;
+        for ( size_t v=0; v<validDevices.size(); v++ )
+        {
+            size_t capacity = validDeviceMem[v]/jobSize;
+            if ( capacity == 0 ) continue;
+
+            // a package never needs to be larger than the total number of jobs
+            if ( capacity > numOfJobs ) capacity = numOfJobs;
+
+            usableDevices.push_back(validDevices[v]);
+            maxJobN.push_back(capacity);
+        }
+
+        if ( usableDevices.empty() )
+        {
+            GADGET_ERROR_MSG("No valid device can hold a single job of size : " << jobSize);
+            return false;
+        }
+
+        jobSchedule.clear();
+
+        size_t job = 0;
+        size_t slot = 0;
+        while ( job < numOfJobs )
+        {
+            // maxJobN[slot] >= 1, hence end >= start
+            size_t start = job;
+            size_t end = job + maxJobN[slot] - 1;
+
+            if ( end >= numOfJobs ) end = numOfJobs - 1;
+
+            std::vector<unsigned int> jobPackage(jobIDs.begin()+start, jobIDs.begin()+end+1);
+
+            unsigned int deviceID = usableDevices[slot];
+
+            // locate the schedule entry of this device, creating it on first use
+            size_t loc;
+            for ( loc=0; loc<jobSchedule.size(); loc++ )
+            {
+                if ( jobSchedule[loc].first == deviceID ) break;
+            }
+
+            if ( loc == jobSchedule.size() )
+            {
+                std::pair<unsigned int, std::vector<std::vector<unsigned int> > > jobItem;
+                jobItem.first = deviceID;
+                jobSchedule.push_back(jobItem);
+            }
+
+            jobSchedule[loc].second.push_back(jobPackage);
+
+            job = end+1;
+
+            // move on to the next device, wrapping around
+            slot++;
+            if ( slot >= usableDevices.size() )
+            {
+                slot = 0;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtil<T>::cudaJobSplitter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Convenience overload: schedules jobs numbered 0 .. numOfJobs-1 by forwarding
+// to the overload taking an explicit list of job ids.
+// Returns true (after a warning) without touching jobSchedule when numOfJobs is zero.
+template <typename T> 
+bool gtPlusISMRMRDReconUtil<T>::
+cudaJobSplitter(unsigned int numOfJobs, size_t jobSize, size_t minimalMemoryForValidDevice, 
+            std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > >& jobSchedule)
+{
+    if ( numOfJobs == 0 )
+    {
+        GADGET_WARN_MSG("numOfJobs == 0");
+        return true;
+    }
+
+    // job ids are simply the consecutive numbers 0, 1, ..., numOfJobs-1
+    std::vector<unsigned int> jobIDs;
+    jobIDs.reserve(numOfJobs);
+    for ( unsigned int id=0; id<numOfJobs; id++ )
+    {
+        jobIDs.push_back(id);
+    }
+
+    return cudaJobSplitter(jobIDs, jobSize, minimalMemoryForValidDevice, jobSchedule);
+}
+
+#endif // USE_CUDA
+
+// ========================================================================================== //
+
+// Default constructor; the body is empty.
+template <typename T> 
+gtPlusISMRMRDReconUtilComplex<T>::gtPlusISMRMRDReconUtilComplex() {}
+
+// Destructor; the body is empty.
+template <typename T> 
+gtPlusISMRMRDReconUtilComplex<T>::~gtPlusISMRMRDReconUtilComplex() {}
+
+// Writes a three-line banner describing this utility class to os.
+// Each line is terminated with std::endl, i.e. the stream is flushed after every line.
+template <typename T> 
+void gtPlusISMRMRDReconUtilComplex<T>::printInfo(std::ostream& os)
+{
+    os << "-------------- GTPlus ISMRMRD Recon Util Complex -------------" << std::endl
+       << "Implementation of recon utilities for ISMRMRD complex data type" << std::endl
+       << "--------------------------------------------------------------" << std::endl;
+}
+
+// ------------------------------------------------------------------------
+// noise prewhitening
+// ------------------------------------------------------------------------
+// Computes a CHA x CHA noise prewhitening matrix from noise samples.
+//   noise                   : noise data; its first three sizes are read as RO, E1 and CHA
+//                             and its buffer is viewed as a (RO*E1) x CHA matrix
+//   noiseBandWidth          : used as 1/(noiseBandWidth*RO) to get the noise sampling time
+//   receiverBWRatio         : divides the scaling factor
+//   ADCSamplingTimeinSecond : divides the scaling factor
+//   prewhiteningMatrix      : output, (re)created as CHA x CHA
+// Steps: scaled covariance R'*R, symmetrisation 0.5*(C+C'), Cholesky factorisation
+// ('L'), triangular inverse ('L'), final scaling by sqrt(2).
+// At least two noise samples per channel (RO*E1 > 1) are required, because the
+// covariance is normalised by (RO*E1-1).
+// Returns false when a check or a numerical step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+computeNoisePrewhiteningMatrix(const hoNDArray<T>& noise, double noiseBandWidth, double receiverBWRatio, double ADCSamplingTimeinSecond, hoMatrix<T>& prewhiteningMatrix)
+{
+    try
+    {
+        size_t RO = noise.get_size(0);
+        size_t E1 = noise.get_size(1);
+        size_t CHA = noise.get_size(2);
+
+        // (RO*E1-1) is used as a divisor below; with one sample it is zero and with
+        // no samples the unsigned subtraction wraps around
+        GADGET_CHECK_RETURN_FALSE(RO*E1>1);
+
+        GADGET_CHECK_RETURN_FALSE(prewhiteningMatrix.createMatrix(CHA, CHA));
+
+        typedef typename realType<T>::Type ValueType;
+
+        // noise sampling time in second
+        ValueType noiseSamplingTimeinSecond = (ValueType)(1.0/(noiseBandWidth*RO));
+
+        // scaling factor
+        ValueType scaling = (ValueType)(noiseSamplingTimeinSecond/ADCSamplingTimeinSecond/receiverBWRatio);
+        scaling /= (RO*E1-1);
+
+        // compute the noise covariance matrix
+        // R wraps the noise buffer directly, hence the const_cast
+        hoMatrix<T> R(RO*E1, CHA, const_cast<T*>(noise.begin()));
+
+        // R'*R --> CHA by CHA covariance matrix
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::GeneralMatrixProduct_gemm(prewhiteningMatrix, R, true, R, false));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scaling, prewhiteningMatrix));
+
+        // 0.5*(R+R')
+        hoMatrix<T> RH(prewhiteningMatrix);
+        GADGET_CHECK_RETURN_FALSE(conjugatetrans(prewhiteningMatrix, RH));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::add(prewhiteningMatrix, RH, prewhiteningMatrix));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(0.5, prewhiteningMatrix));
+
+        // Cholesky factor, its inverse, and the final sqrt(2) scaling
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::CholeskyHermitianPositiveDefinite_potrf(prewhiteningMatrix, 'L'));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::TriangularInverse_trtri(prewhiteningMatrix, 'L'));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(std::sqrt((double)2.0), prewhiteningMatrix));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::computeNoisePrewhiteningMatrix(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Applies a prewhitening matrix to data in place.
+//   data               : its first three sizes are read as RO, E1 and CHA; the buffer is
+//                        processed as N consecutive (RO*E1) x CHA matrices
+//   prewhiteningMatrix : must be CHA x CHA
+// Every (RO*E1) x CHA matrix D is replaced by the product D * prewhiteningMatrix.
+// Returns false when the matrix size does not match CHA or an exception is caught.
+// NOTE(review): the result of GeneralMatrixProduct_gemm inside the loop is not checked.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+performNoisePrewhitening(hoNDArray<T>& data, const hoMatrix<T>& prewhiteningMatrix)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t CHA = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(prewhiteningMatrix.rows()==CHA);
+        GADGET_CHECK_RETURN_FALSE(prewhiteningMatrix.cols()==CHA);
+
+        // number of (RO*E1) x CHA matrices stored in data
+        size_t N = data.get_number_of_elements()/(RO*E1*CHA);
+
+        long long n;
+        // the shared() list differs depending on GCC_OLD_FLAG
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(n) shared(RO, E1, CHA, N)
+        #else
+            #pragma omp parallel default(none) private(n) shared(RO, E1, CHA, N, data, prewhiteningMatrix)
+        #endif // GCC_OLD_FLAG
+        {
+            // declared inside the parallel region, so each thread has its own buffer
+            hoMatrix<T> tmp(RO*E1, CHA);
+
+            #pragma omp for
+            for ( n=0; n<(long long)N; n++ )
+            {
+                // D wraps the n-th matrix of data without copying
+                hoMatrix<T> D(RO*E1, CHA, data.begin()+n*RO*E1*CHA);
+                Gadgetron::GeneralMatrixProduct_gemm(tmp, D, false, prewhiteningMatrix, false);
+                // write the product back over the n-th matrix
+                memcpy(data.begin()+n*RO*E1*CHA, tmp.begin(), sizeof(T)*RO*E1*CHA);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::performNoisePrewhitening(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Resizes data to sizeX x sizeY by zero padding after a 2D transform.
+//   data        : input; its first two sizes are read as RO and E1
+//   sizeX/sizeY : target sizes; must be >= RO and >= E1
+//   dataResized : output
+// When the sizes already match, data is copied to dataResized unchanged. Otherwise
+// fft2c is applied to data and the result is handed to zpadResize2DOnKSpace.
+// Returns false when a check or a step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize2D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        // nothing to resize
+        if ( RO==sizeX && E1==sizeY )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        // allocate the output only when its first two sizes differ from the target
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY )
+        {
+            dataResized.create(sizeX, sizeY);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        // kspace starts as a copy of data and receives the fft2c output
+        hoNDArray<T> kspace(data);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(data, kspace));
+        GADGET_CHECK_RETURN_FALSE(zpadResize2DOnKSpace(kspace, sizeX, sizeY, dataResized));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize2D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Zero pads kspace to sizeX x sizeY, applies ifft2c and rescales the result.
+//   kspace      : input; its first two sizes are read as RO and E1
+//   sizeX/sizeY : target sizes; must be >= RO and >= E1
+//   dataResized : output
+// When the sizes already match, kspace is copied and only ifft2c is applied (no
+// scaling). Otherwise the output is scaled by sqrt(sizeX*sizeY)/sqrt(RO*E1).
+// Returns false when a check or a step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize2DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        // no padding needed : just transform a copy of kspace
+        if ( RO==sizeX && E1==sizeY )
+        {
+            dataResized = kspace;
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataResized));
+            return true;
+        }
+
+        // allocate the output only when its first two sizes differ from the target
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY )
+        {
+            dataResized.create(sizeX, sizeY);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        GADGET_CHECK_RETURN_FALSE(this->zeropad2D(kspace, sizeX, sizeY, dataResized));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataResized));
+
+        // scale by the square root of the ratio between padded and original sample counts
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY)/std::sqrt((double)RO*E1));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scaling, dataResized));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize2DOnKSpace(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Resizes data to sizeX x sizeY x sizeZ by zero padding after a 3D transform.
+//   data              : input; its first three sizes are read as RO, E1 and E2
+//   sizeX/sizeY/sizeZ : target sizes; must be >= RO, >= E1 and >= E2
+//   dataResized       : output
+// When the sizes already match, data is copied to dataResized unchanged. Otherwise
+// fft3c is applied to data and the result is handed to zpadResize3DOnKSpace.
+// Returns false when a check or a step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize3D(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        // nothing to resize
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        // allocate the output only when its first three sizes differ from the target
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY || dataResized.get_size(2)!=sizeZ )
+        {
+            dataResized.create(sizeX, sizeY, sizeZ);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        // kspace starts as a copy of data and receives the fft3c output
+        hoNDArray<T> kspace(data);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, kspace));
+        GADGET_CHECK_RETURN_FALSE(zpadResize3DOnKSpace(kspace, sizeX, sizeY, sizeZ, dataResized));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Zero pads kspace to sizeX x sizeY x sizeZ, applies ifft3c and rescales the result.
+//   kspace            : input; its first three sizes are read as RO, E1 and E2
+//   sizeX/sizeY/sizeZ : target sizes; must be >= RO, >= E1 and >= E2
+//   dataResized       : output
+// When the sizes already match, kspace is copied and only ifft3c is applied (no
+// scaling). Otherwise the output is scaled by sqrt(sizeX*sizeY*sizeZ)/sqrt(RO*E1*E2).
+// Returns false when a check or a step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize3DOnKSpace(const hoNDArray<T>& kspace, size_t sizeX, size_t sizeY, size_t sizeZ, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        // no padding needed : just transform a copy of kspace
+        if ( RO==sizeX && E1==sizeY && E2==sizeZ )
+        {
+            dataResized = kspace;
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataResized));
+            return true;
+        }
+
+        // allocate the output only when its first three sizes differ from the target
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY || dataResized.get_size(2)!=sizeZ )
+        {
+            dataResized.create(sizeX, sizeY, sizeZ);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        GADGET_CHECK_RETURN_FALSE(this->zeropad3D(kspace, sizeX, sizeY, sizeZ, dataResized));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataResized));
+
+        // scale by the square root of the ratio between padded and original sample counts
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY*sizeZ)/std::sqrt((double)RO*E1*E2));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scaling, dataResized));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize3DOnKSpace(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Resizes data to sizeX x sizeY by zero padding, multiplying by a filter before
+// transforming back.
+//   data        : input; its first two sizes are read as RO and E1
+//   sizeX/sizeY : target sizes; must be >= RO and >= E1
+//   filter2D    : filter whose first two sizes must equal sizeX and sizeY
+//   dataResized : output
+// When the sizes already match, data is copied to dataResized unchanged (the filter
+// is not applied). Otherwise: fft2c, zeropad2D, multiplication with filter2D, ifft2c,
+// and scaling by sqrt(sizeX*sizeY)/sqrt(RO*E1).
+// Returns false when a check or a step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize2DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, const hoNDArray<T>& filter2D, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+
+        // the filter is applied on the padded grid, so it must have the target size
+        GADGET_CHECK_RETURN_FALSE(filter2D.get_size(0)==sizeX);
+        GADGET_CHECK_RETURN_FALSE(filter2D.get_size(1)==sizeY);
+
+        // nothing to resize
+        if ( RO==sizeX && E1==sizeY )
+        {
+            dataResized = data;
+            return true;
+        }
+
+        // allocate the output only when its first two sizes differ from the target
+        if ( dataResized.get_size(0)!=sizeX || dataResized.get_size(1)!=sizeY )
+        {
+            dataResized.create(sizeX, sizeY);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        // kspace starts as a copy of data and receives the fft2c output
+        hoNDArray<T> kspace(data);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(data, kspace));
+        GADGET_CHECK_RETURN_FALSE(this->zeropad2D(kspace, sizeX, sizeY, dataResized));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(filter2D, dataResized, dataResized));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataResized));
+
+        // scale by the square root of the ratio between padded and original sample counts
+        typename realType<T>::Type scaling = (typename realType<T>::Type)(std::sqrt((double)sizeX*sizeY)/std::sqrt((double)RO*E1));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scaling, dataResized));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize2DFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Resize an array to [sizeX sizeY sizeZ] along its first three dimensions by
+// zero-padding its transform and weighting the padded result with a filter.
+//
+// data              : input array; its first three sizes are read as RO, E1 and E2
+// sizeX/sizeY/sizeZ : requested output sizes, which must not be smaller than RO/E1/E2
+// filter3D          : filter whose first three sizes must equal sizeX, sizeY and sizeZ
+// dataResized       : output; it receives a plain copy of data when no resizing is needed
+//
+// Sequence: fft3c(data) -> zeropad3D -> multiply with filter3D -> ifft3c ->
+// scale by sqrt(sizeX*sizeY*sizeZ)/sqrt(RO*E1*E2).
+// Returns false when a check or a processing step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+zpadResize3DFilter(const hoNDArray<T>& data, size_t sizeX, size_t sizeY, size_t sizeZ, const hoNDArray<T>& filter3D, hoNDArray<T>& dataResized)
+{
+    try
+    {
+        typedef typename realType<T>::Type real_type;
+
+        const size_t RO = data.get_size(0);
+        const size_t E1 = data.get_size(1);
+        const size_t E2 = data.get_size(2);
+
+        // the output may only grow with respect to the input
+        GADGET_CHECK_RETURN_FALSE(sizeX>=RO);
+        GADGET_CHECK_RETURN_FALSE(sizeY>=E1);
+        GADGET_CHECK_RETURN_FALSE(sizeZ>=E2);
+
+        // the filter has to be defined on the target grid
+        GADGET_CHECK_RETURN_FALSE(filter3D.get_size(0)==sizeX);
+        GADGET_CHECK_RETURN_FALSE(filter3D.get_size(1)==sizeY);
+        GADGET_CHECK_RETURN_FALSE(filter3D.get_size(2)==sizeZ);
+
+        const bool alreadyTargetSize = (sizeX==RO) && (sizeY==E1) && (sizeZ==E2);
+        if ( alreadyTargetSize )
+        {
+            // nothing to pad; note that the filter is not applied on this path
+            dataResized = data;
+            return true;
+        }
+
+        // only (re)allocate the output when its first three sizes do not fit yet
+        const bool outputFits = (dataResized.get_size(0)==sizeX)
+                             && (dataResized.get_size(1)==sizeY)
+                             && (dataResized.get_size(2)==sizeZ);
+        if ( !outputFits )
+        {
+            dataResized.create(sizeX, sizeY, sizeZ);
+        }
+
+        Gadgetron::clear(&dataResized);
+
+        // kspace starts as a copy of data and is then overwritten by the transform
+        hoNDArray<T> kspace(data);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, kspace));
+        GADGET_CHECK_RETURN_FALSE(this->zeropad3D(kspace, sizeX, sizeY, sizeZ, dataResized));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(filter3D, dataResized, dataResized));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataResized));
+
+        // compensate for the change in the number of samples
+        const double numPadded = static_cast<double>(sizeX)*sizeY*sizeZ;
+        const double numOriginal = static_cast<double>(RO)*E1*E2;
+        real_type scaling = static_cast<real_type>(std::sqrt(numPadded)/std::sqrt(numOriginal));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scaling, dataResized));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::zpadResize3DFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Filter `data` in place along its first dimension: fft1c -> kspacefilterRO -> ifft1c.
+//
+// data : array to filter; overwritten with the result
+//        (presumably image-domain data, as the "Image" suffix suggests - TODO confirm)
+// fRO  : filter; its number of elements must equal data.get_size(0)
+//
+// Returns false when the size check or any step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterROImage(hoNDArray<T>& data, const hoNDArray<T>& fRO)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(data));
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterRO(data, fRO));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(data));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterROImage(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Out-of-place variant of kspacefilterROImage: fft1c(data) is written to
+// dataFiltered, which is then filtered with kspacefilterRO and transformed
+// back with ifft1c. `data` itself is not modified.
+//
+// data         : input array (presumably image-domain data - TODO confirm)
+// fRO          : filter; its number of elements must equal data.get_size(0)
+// dataFiltered : output array holding the filtered result
+//
+// Returns false when the size check or any step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterROImage(const hoNDArray<T>& data, const hoNDArray<T>& fRO, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(data, dataFiltered));
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterRO(dataFiltered, fRO));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterROImage(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Filter `data` along its second dimension (E1): fft2c -> kspacefilterE1 -> ifft2c.
+//
+// data         : input array (presumably image-domain data - TODO confirm); not modified
+// fE1          : filter; its number of elements must equal data.get_size(1)
+// dataFiltered : output array holding the filtered result
+//
+// Returns false when the size check or any step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterE1Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(data, dataFiltered));
+
+        // fE1 is sized for the E1 dimension (see the check above), so it has to go
+        // through the E1 filter; the RO filter would apply it along dimension 0.
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterE1(dataFiltered, fE1, dataFiltered));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterE1Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Filter `data` along its third dimension (E2): fft3c -> kspacefilterE2 -> ifft3c.
+//
+// data         : input array (presumably image-domain data - TODO confirm); not modified
+// fE2          : filter; its number of elements must equal data.get_size(2)
+// dataFiltered : output array holding the filtered result
+//
+// Returns false when the size check or any step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterE2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, dataFiltered));
+
+        // fE2 is sized for the E2 dimension (see the check above), so it has to go
+        // through the E2 filter; the RO filter would apply it along dimension 0.
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterE2(dataFiltered, fE2, dataFiltered));
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterE2Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Filter `data` along its second and third dimensions (E1 and E2):
+// fft3c -> kspacefilterE1E2 -> ifft3c.
+//
+// data         : input array (presumably image-domain data - TODO confirm); not modified
+// fE1          : filter; its number of elements must equal data.get_size(1)
+// fE2          : filter; its number of elements must equal data.get_size(2)
+// dataFiltered : output array holding the filtered result
+//
+// Returns false when a size check or any step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, dataFiltered));
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterE1E2(dataFiltered, fE1, fE2, dataFiltered));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterE1E2Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Filter `data` along its first three dimensions (RO, E1 and E2):
+// fft3c -> kspacefilterROE1E2 -> ifft3c.
+//
+// data         : input array (presumably image-domain data - TODO confirm); not modified
+// fRO          : filter; its number of elements must equal data.get_size(0)
+// fE1          : filter; its number of elements must equal data.get_size(1)
+// fE2          : filter; its number of elements must equal data.get_size(2)
+// dataFiltered : output array holding the filtered result
+//
+// Returns false when a size check or any step fails, or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+kspacefilterROE1E2Image(const hoNDArray<T>& data, const hoNDArray<T>& fRO, const hoNDArray<T>& fE1, const hoNDArray<T>& fE2, hoNDArray<T>& dataFiltered)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==fRO.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==fE1.get_number_of_elements());
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==fE2.get_number_of_elements());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(data, dataFiltered));
+        GADGET_CHECK_RETURN_FALSE(this->kspacefilterROE1E2(dataFiltered, fRO, fE1, fE2, dataFiltered));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(dataFiltered));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::kspacefilterROE1E2Image(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Estimate a coil map for ONE 2D multi-channel image, pixel by pixel.
+//
+// data    : input array indexed as [RO E1 CHA]; the total element count must equal
+//           RO*E1*CHA (checked via N==1)
+// coilMap : output; replaced by a copy of data when its dimensions differ, then every
+//           element is overwritten with the estimate
+// ks      : side length of the square neighbourhood; incremented by one when even
+// power   : number of multiply-and-normalise iterations applied to V1
+//
+// For every pixel the ks*ks neighbourhood of each channel is gathered into D
+// (ks*ks rows, CHA columns). V1 starts as D.sumOverCol(V1), normalised, and is then
+// repeatedly replaced by DH_D*V1, normalised. U1 = D*V1 supplies a unit-magnitude
+// phase factor, and phase*conj(V1) is stored as the coil map value of the pixel.
+// The E1 loop is distributed over OpenMP threads; all matrices are per-thread.
+//
+// Returns false when the N==1 check fails or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        // only a single [RO E1 CHA] image is accepted here
+        long long N = data.get_number_of_elements()/(RO*E1*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        // copying data is used to give coilMap the right dimensions
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        // force an odd kernel size so that the neighbourhood is centred on the pixel
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks;
+        long long halfKs = (long long)ks/2;
+
+        // loop index of the OpenMP work-sharing loop (note: int, E1 is cast to int below)
+        int e1;
+
+        #pragma omp parallel default(none) private(e1) shared(ks, RO, E1, CHA, pSen, pData, halfKs, power, kss)
+        {
+            // per-thread work buffers
+            // NOTE: pD, pV1 and u1Norm are only referenced by commented-out code below
+            hoMatrix<T> D(ks*ks, CHA);
+            T* pD = D.begin();
+
+            hoMatrix<T> DH_D(CHA, CHA);
+
+            hoMatrix<T> U1(ks*ks, 1);
+            T* pU1 = U1.begin();
+
+            hoMatrix<T> V1(CHA, 1);
+            T* pV1 = V1.begin();
+
+            hoMatrix<T> V(CHA, 1);
+
+            T phaseU1;
+
+            value_type v1Norm(1), u1Norm(1);
+
+            long long cha, ro, kro, ke1, de1, dro;
+            size_t po;
+
+            #pragma omp for
+            for ( e1=0; e1<(int)E1; e1++ )
+            {
+                for ( ro=0; ro<(long long)RO; ro++ )
+                {
+                    // fill the data matrix D
+                    // interior pixels: the whole neighbourhood lies inside the image,
+                    // so no index wrapping is needed
+                    if ( e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+                    {
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            const T* pDataCurr = pData + cha*RO*E1;
+                            int ind=0;
+                            for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                            {
+                                de1 = e1 + ke1;
+                                for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                {
+                                    D(ind++, cha) = pDataCurr[de1*RO+ro+kro];
+                                    //pD[ind+cha*kss] = pDataCurr[de1*RO+ro+kro];
+                                    //ind++;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // border pixels: neighbour indices wrap around the image edges;
+                        // a single +/- correction is applied, so halfKs must not exceed
+                        // the image size along that dimension
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            const T* pDataCurr = pData + cha*RO*E1;
+                            int ind=0;
+                            for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                            {
+                                de1 = e1 + ke1;
+                                if ( de1 < 0 ) de1 += E1;
+                                if ( de1 >= E1 ) de1 -= E1;
+
+                                for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                {
+                                    dro = ro + kro;
+                                    if ( dro < 0 ) dro += RO;
+                                    if ( dro >= RO ) dro -= RO;
+
+                                    D(ind++, cha) = pDataCurr[de1*RO+dro];
+                                    //pD[ind+cha*kss] = pDataCurr[de1*RO+ro+kro];
+                                    //ind++;
+                                }
+                            }
+                        }
+                    }
+
+                    // compute V1
+                    // initial guess from D.sumOverCol (presumably one sum per channel
+                    // column - TODO confirm), scaled by 1/v1Norm
+                    D.sumOverCol(V1);
+                    norm2(V1, v1Norm);
+                    scal(1.0/v1Norm, V1);
+
+                    // presumably DH_D = D^H * D (the 'true' flag on the first operand) - TODO confirm
+                    GeneralMatrixProduct_gemm(DH_D, D, true, D, false);
+
+                    // multiply-and-normalise iterations: V1 <- DH_D*V1, rescaled by 1/v1Norm
+                    for ( po=0; po<power; po++ )
+                    {
+                        GeneralMatrixProduct_gemm(V, DH_D, false, V1, false);
+                        memcpy(V1.begin(), V.begin(), V.get_number_of_bytes());
+                        norm2(V1, v1Norm);
+                        scal(1.0/v1Norm, V1);
+                    }
+
+                    // compute U1
+                    GeneralMatrixProduct_gemm(U1, D, false, V1, false);
+
+                    // sum all entries of U1 and reduce the sum to unit magnitude,
+                    // keeping only its phase
+                    phaseU1 = pU1[0];
+                    for ( po=1; po<kss; po++ )
+                    {
+                        // phaseU1 += U1(po, 0);
+                        phaseU1 += pU1[po];
+                    }
+                    phaseU1 /= std::abs(phaseU1);
+
+                    // put the mean object phase to coil map
+                    conjugate(V1, V1);
+                    scal(phaseU1, V1);
+
+                    // scatter the per-channel result to this pixel of the output
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        // pSen[cha*RO*E1+e1*RO+ro] = pV1[cha];
+                        pSen[cha*RO*E1+e1*RO+ro] = V1(cha, 0);
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIHInner(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Serial variant of the per-pixel 2D coil map estimation that first materialises
+// the neighbourhood matrices of ALL pixels and then processes them one by one.
+//
+// data    : input array indexed as [RO E1 CHA]; the total element count must equal
+//           RO*E1*CHA (checked via N==1)
+// coilMap : output; replaced by a copy of data when its dimensions differ, then every
+//           element is overwritten with the estimate
+// ks      : side length of the square neighbourhood; incremented by one when even
+// power   : number of multiply-and-normalise iterations applied to V1
+//
+// Memory note: D holds kss*CHA*RO*E1 elements, i.e. ks*ks times the input size.
+//
+// Returns false when the N==1 check fails or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIHInner_2(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        // only a single [RO E1 CHA] image is accepted here
+        long long N = data.get_number_of_elements()/(RO*E1*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        // copying data is used to give coilMap the right dimensions
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        // force an odd kernel size so that the neighbourhood is centred on the pixel
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks;
+        long long halfKs = (long long)ks/2;
+
+        long long e1, ro, cha;
+        long long kro, ke1, de1, dro;
+
+        // compute the D matrix
+        // layout: [kss CHA RO*E1], i.e. one (kss x CHA) matrix per pixel
+        hoNDArray<T> D(kss, CHA, RO*E1);
+        T* pD = D.begin();
+
+        for ( e1=0; e1<(long long)E1; e1++ )
+        {
+            for ( ro=0; ro<(long long)RO; ro++ )
+            {
+                long long idx2D = ro + e1*RO;
+
+                // fill the data matrix D
+                // interior pixels: the whole neighbourhood lies inside the image
+                if ( e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+                {
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        const T* pDataCurr = pData + cha*RO*E1;
+                        long long ind=0;
+                        for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                        {
+                            de1 = e1 + ke1;
+                            for ( kro=-halfKs; kro<=halfKs; kro++ )
+                            {
+                                long long idxD = idx2D*CHA*kss + cha*kss + ind;
+                                D(idxD) = pDataCurr[de1*RO+ro+kro];
+                                ind++;
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    // border pixels: neighbour indices wrap around the image edges;
+                    // a single +/- correction is applied, so halfKs must not exceed
+                    // the image size along that dimension
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        const T* pDataCurr = pData + cha*RO*E1;
+                        long long ind=0;
+                        for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                        {
+                            de1 = e1 + ke1;
+                            if ( de1 < 0 ) de1 += E1;
+                            if ( de1 >= E1 ) de1 -= E1;
+
+                            for ( kro=-halfKs; kro<=halfKs; kro++ )
+                            {
+                                dro = ro + kro;
+                                if ( dro < 0 ) dro += RO;
+                                if ( dro >= RO ) dro -= RO;
+
+                                long long idxD = idx2D*CHA*kss + cha*kss + ind;
+                                D(idxD) = pDataCurr[de1*RO+dro];
+                                ind++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // compute DH_D and V1
+        // every buffer below stores one small matrix/vector per pixel
+        hoNDArray<T> DH_D(CHA, CHA, RO*E1);
+        T* pDH_D = DH_D.begin();
+
+        hoNDArray<T> V1(CHA, RO*E1);
+        T* pV1 = V1.begin();
+
+        hoNDArray<T> V(CHA, RO*E1);
+        T* pV = V.begin();
+
+        hoNDArray<T> U1(kss, RO*E1);
+        T* pU1 = U1.begin();
+
+        for ( e1=0; e1<(long long)E1; e1++ )
+        {
+            for ( ro=0; ro<(long long)RO; ro++ )
+            {
+                long long idx2D = ro + e1*RO;
+
+                // arrays constructed on the per-pixel slices of the big buffers
+                hoNDArray<T> currD(kss, CHA, pD+idx2D*CHA*kss);
+                T* pCurrD = currD.begin();
+
+                hoNDArray<T> currDH_D(CHA, CHA, pDH_D+idx2D*CHA*CHA);
+
+                // presumably currDH_D = currD^H * currD (the 'true' flag on the first operand) - TODO confirm
+                GeneralMatrixProduct(currDH_D, currD, true, currD, false);
+
+                hoNDArray<T> currV1(CHA, 1, pV1+idx2D*CHA);
+
+                // initial guess: per-channel sum over the neighbourhood
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    currV1(cha) = 0;
+                    for ( size_t ii=0; ii<kss; ii++ )
+                    {
+                        currV1(cha) += pCurrD[ii+cha*kss];
+                    }
+                }
+
+                // 2-norm of currV1: sqrt(sum |v|^2).
+                // The accumulator must start at zero; starting at 1 (as the code
+                // did before) yields sqrt(1 + sum |v|^2) instead of the norm.
+                value_type v1Norm(0);
+
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    T v = currV1(cha) * std::conj(currV1(cha));
+                    v1Norm += v.real();
+                }
+                v1Norm = std::sqrt(v1Norm);
+
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    currV1(cha) /= v1Norm;
+                }
+
+                size_t po;
+
+                // multiply-and-normalise iterations: currV1 <- currDH_D*currV1, normalised
+                hoNDArray<T> currV(CHA, 1, pV+idx2D*CHA);
+                for ( po=0; po<power; po++ )
+                {
+                    GeneralMatrixProduct(currV, currDH_D, false, currV1, false);
+                    currV1 = currV;
+
+                    // restart the sum of squares; otherwise the norm of the previous
+                    // iteration leaks into the current one
+                    v1Norm = 0;
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        T v = currV1(cha) * std::conj(currV1(cha));
+                        v1Norm += v.real();
+                    }
+                    v1Norm = std::sqrt(v1Norm);
+
+                    for ( cha=0; cha<CHA; cha++ )
+                    {
+                        currV1(cha) /= v1Norm;
+                    }
+                }
+
+                // compute U1
+                hoNDArray<T> currU1(kss, 1, pU1+idx2D*kss);
+                GeneralMatrixProduct(currU1, currD, false, currV1, false);
+
+                // sum all entries of U1 and reduce the sum to unit magnitude,
+                // keeping only its phase
+                T phaseU1 = currU1(0);
+                for ( po=1; po<kss; po++ )
+                {
+                    phaseU1 += currU1(po);
+                }
+                phaseU1 /= std::abs(phaseU1);
+
+                // put the mean object phase to coil map
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    currV1(cha) = phaseU1 * std::conj(currV1(cha));
+                }
+
+                // scatter the per-channel result to this pixel of the output
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    pSen[cha*RO*E1+idx2D] = currV1(cha);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIHInner_2(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Estimate a coil map for ONE 3D multi-channel volume, voxel by voxel.
+//
+// data    : input array indexed as [RO E1 E2 CHA]; the total element count must
+//           equal RO*E1*E2*CHA (checked via N==1)
+// coilMap : output; replaced by a copy of data when its dimensions differ, then every
+//           element is overwritten with the estimate
+// ks      : side length of the cubic neighbourhood; incremented by one when even
+// power   : number of multiply-and-normalise iterations applied to V1
+//
+// For every voxel the ks*ks*ks neighbourhood of each channel is gathered into D
+// (kss rows, CHA columns). V1 starts as D.sumOverCol(V1), normalised, and is then
+// repeatedly replaced by DH_D*V1, normalised. U1 = D*V1 supplies a unit-magnitude
+// phase factor, and phase*conj(V1) is stored as the coil map value of the voxel.
+// The E2 loop is distributed over OpenMP threads; all matrices are per-thread.
+//
+// Returns false when the N==1 check fails or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap3DNIHInner(const hoNDArray<T>& data, hoNDArray<T>& coilMap, size_t ks, size_t power)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long E2 = data.get_size(2);
+        long long CHA = data.get_size(3);
+
+        // only a single [RO E1 E2 CHA] volume is accepted here
+        long long N = data.get_number_of_elements()/(RO*E1*E2*CHA);
+        GADGET_CHECK_RETURN_FALSE(N==1);
+
+        const T* pData = data.begin();
+
+        // copying data is used to give coilMap the right dimensions
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+        T* pSen = coilMap.begin();
+
+        // force an odd kernel size so that the neighbourhood is centred on the voxel
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t kss = ks*ks*ks;
+        long long halfKs = (long long)ks/2;
+
+        // loop index of the OpenMP work-sharing loop
+        long long e2;
+
+        #pragma omp parallel default(none) private(e2) shared(ks, RO, E1, E2, CHA, pSen, pData, halfKs, power, kss)
+        {
+            // per-thread work buffers
+            hoMatrix<T> D(kss, CHA);
+            hoMatrix<T> DH_D(CHA, CHA);
+
+            hoMatrix<T> U1(kss, 1);
+            hoMatrix<T> V1(CHA, 1);
+            hoMatrix<T> V(CHA, 1);
+
+            T phaseU1;
+
+            value_type v1Norm(1);
+
+            long long cha, ro, e1, kro, dro, ke1, de1, ke2, de2;
+            size_t po;
+
+            #pragma omp for
+            for ( e2=0; e2<(long long)E2; e2++ )
+            {
+                for ( e1=0; e1<(long long)E1; e1++ )
+                {
+                    for ( ro=0; ro<(long long)RO; ro++ )
+                    {
+                        // fill the data matrix D
+                        // interior voxels: the whole neighbourhood lies inside the
+                        // volume, so no index wrapping is needed
+                        if ( e2>=halfKs && e2<E2-halfKs && e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+                        {
+                            for ( cha=0; cha<CHA; cha++ )
+                            {
+                                const T* pDataCurr = pData + cha*RO*E1*E2;
+                                long long ind=0;
+                                for ( ke2=-halfKs; ke2<=halfKs; ke2++ )
+                                {
+                                    de2 = e2 + ke2;
+                                    for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                                    {
+                                        de1 = e1 + ke1;
+                                        for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                        {
+                                            D(ind++, cha) = pDataCurr[de2*RO*E1+de1*RO+ro+kro];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            // border voxels: neighbour indices wrap around the volume
+                            // edges; a single +/- correction is applied, so halfKs must
+                            // not exceed the size along that dimension
+                            for ( cha=0; cha<CHA; cha++ )
+                            {
+                                const T* pDataCurr = pData + cha*RO*E1*E2;
+                                long long ind=0;
+                                for ( ke2=-halfKs; ke2<=halfKs; ke2++ )
+                                {
+                                    de2 = e2 + ke2;
+                                    if ( de2 < 0 ) de2 += E2;
+                                    if ( de2 >= E2 ) de2 -= E2;
+
+                                    for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                                    {
+                                        de1 = e1 + ke1;
+                                        if ( de1 < 0 ) de1 += E1;
+                                        if ( de1 >= E1 ) de1 -= E1;
+
+                                        for ( kro=-halfKs; kro<=halfKs; kro++ )
+                                        {
+                                            dro = ro + kro;
+                                            if ( dro < 0 ) dro += RO;
+                                            if ( dro >= RO ) dro -= RO;
+
+                                            D(ind++, cha) = pDataCurr[de2*RO*E1+de1*RO+dro];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        // compute V1
+                        // initial guess from D.sumOverCol (presumably one sum per channel
+                        // column - TODO confirm), scaled by 1/v1Norm
+                        D.sumOverCol(V1);
+                        norm2(V1, v1Norm);
+                        scal(1.0/v1Norm, V1);
+
+                        // presumably DH_D = D^H * D (the 'true' flag on the first operand) - TODO confirm
+                        GeneralMatrixProduct_gemm(DH_D, D, true, D, false);
+
+                        // multiply-and-normalise iterations: V1 <- DH_D*V1, rescaled by 1/v1Norm
+                        for ( po=0; po<power; po++ )
+                        {
+                            GeneralMatrixProduct_gemm(V, DH_D, false, V1, false);
+                            V1 = V;
+                            norm2(V1, v1Norm);
+                            scal(1.0/v1Norm, V1);
+                        }
+
+                        // compute U1
+                        GeneralMatrixProduct_gemm(U1, D, false, V1, false);
+
+                        // sum all entries of U1 and reduce the sum to unit magnitude,
+                        // keeping only its phase
+                        phaseU1 = U1(0, 0);
+                        for ( po=1; po<kss; po++ )
+                        {
+                            phaseU1 += U1(po, 0);
+                        }
+                        phaseU1 /= std::abs(phaseU1);
+
+                        // put the mean object phase to coil map
+                        conjugate(V1, V1);
+                        scal(phaseU1, V1);
+
+                        // scatter the per-channel result to this voxel of the output
+                        for ( cha=0; cha<CHA; cha++ )
+                        {
+                            pSen[cha*RO*E1*E2+e2*RO*E1+e1*RO+ro] = V1(cha, 0);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap3DNIHInner(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Estimate coil maps for one or more 2D multi-channel images.
+//
+// data    : input array indexed as [RO E1 CHA ...]; everything beyond the third
+//           dimension is treated as N independent [RO E1 CHA] images
+// coilMap : output; replaced by a copy of data when its dimensions differ
+// algo    : coil map algorithm selector; on the CPU path both values currently
+//           end up in coilMap2DNIHInner
+// ks      : side length of the square neighbourhood; incremented by one when even
+// power   : number of iterations forwarded to coilMap2DNIHInner
+// iterNum, thres : only forwarded to coilMap2DNIHGPU
+// useGPU  : request the GPU implementation; forced to false when USE_CUDA is not
+//           defined, when no device is reported, or when CHA > 32
+//
+// CPU dispatch: N >= 8 distributes the images over OpenMP threads, N == 1 calls
+// the per-image routine directly, anything else loops serially.
+//
+// Returns false when any per-image estimation fails or when an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks, size_t power, size_t iterNum, typename realType<T>::Type thres, bool useGPU)
+{
+    try
+    {
+        typedef typename realType<T>::Type value_type;
+
+        long long RO = data.get_size(0);
+        long long E1 = data.get_size(1);
+        long long CHA = data.get_size(2);
+
+        #ifdef USE_CUDA
+            int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+            int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+            int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+
+            int numOfDevices = cudaDeviceManager::Instance()->getTotalNumberOfDevice();
+
+            // fall back to the CPU without a device or with too many channels
+            if ( (numOfDevices==0) || (CHA>32) )
+            {
+                useGPU = false;
+            }
+
+        #else
+            useGPU = false;
+        #endif // USE_CUDA
+
+        if ( useGPU )
+        {
+            return coilMap2DNIHGPU(data, coilMap, algo, ks, power, iterNum, thres);
+        }
+        else
+        {
+            // N: number of [RO E1 CHA] images, num: elements per image
+            size_t N = data.get_number_of_elements()/(RO*E1*CHA);
+            size_t num = RO*E1*CHA;
+
+            if ( !data.dimensions_equal(&coilMap) )
+            {
+                coilMap = data;
+            }
+
+            if ( ks%2 != 1 )
+            {
+                ks++;
+            }
+
+            long long n;
+
+            if ( N >= 8 )
+            {
+                // Number of images whose estimation failed. A 'return' is not allowed
+                // inside the parallel region, so failures are counted with a reduction
+                // and reported once the region has finished.
+                long long numFailed = 0;
+
+                #ifdef GCC_OLD_FLAG
+                    #pragma omp parallel default(none) private(n) shared(ks, RO, E1, CHA, num, algo, N, power, iterNum, thres, numFailed)
+                #else
+                    #pragma omp parallel default(none) private(n) shared(ks, RO, E1, CHA, num, algo, N, data, coilMap, power, iterNum, thres, numFailed)
+                #endif 
+                {
+                    #pragma omp for reduction(+:numFailed)
+                    for ( n=0; n<(long long)N; n++ )
+                    {
+                        // arrays constructed on the n-th image of the input and output buffers
+                        hoNDArray<T> dataCurr(RO, E1, CHA, const_cast<T*>(data.begin()+n*num));
+                        hoNDArray<T> coilMapCurr(RO, E1, CHA, coilMap.begin()+n*num);
+
+                        bool succeeded = false;
+                        if ( algo == ISMRMRD_SOUHEIL_ITER )
+                        {
+                            succeeded = coilMap2DNIHInner(dataCurr, coilMapCurr, ks, power);
+                        }
+                        else
+                        {
+                            succeeded = coilMap2DNIHInner(dataCurr, coilMapCurr, ks, power);
+                            //succeeded = coilMap2DNIHInner_2(dataCurr, coilMapCurr, ks, power);
+                        }
+
+                        if ( !succeeded )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                // same contract as the serial branches: any failure makes the call fail
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+            else if ( N == 1 )
+            {
+                if ( algo == ISMRMRD_SOUHEIL_ITER )
+                {
+                    GADGET_CHECK_RETURN_FALSE(coilMap2DNIHInner(data, coilMap, ks, power));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(coilMap2DNIHInner(data, coilMap, ks, power));
+                    //GADGET_CHECK_RETURN_FALSE(coilMap2DNIHInner_2(data, coilMap, ks, power));
+                }
+            }
+            else
+            {
+                // few images: process them one after another
+                for ( n=0; n<(long long)N; n++ )
+                {
+                    hoNDArray<T> dataCurr(RO, E1, CHA, const_cast<T*>(data.begin()+n*num));
+                    hoNDArray<T> coilMapCurr(RO, E1, CHA, coilMap.begin()+n*num);
+                    if ( algo == ISMRMRD_SOUHEIL_ITER )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(coilMap2DNIHInner(dataCurr, coilMapCurr, ks, power));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(coilMap2DNIHInner(dataCurr, coilMapCurr, ks, power));
+                        //GADGET_CHECK_RETURN_FALSE(coilMap2DNIHInner_2(dataCurr, coilMapCurr, ks, power));
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIH(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// GPU implementation of the 2D NIH (Souheil) coil sensitivity map estimation.
+///
+/// \param data     complex input; the first three dimensions are read as RO, E1 and CHA and
+///                 all remaining elements are treated as N independent 2D frames
+/// \param coilMap  output coil map; assigned from data first if the dimensions differ
+/// \param algo     only forwarded to the CPU fallback coilMap2DNIH()
+/// \param ks       kernel size; incremented by one if it is even
+/// \param power    forwarded to estimate_b1_map_2D_NIH_Souheil()
+/// \param iterNum  only forwarded to the CPU fallback coilMap2DNIH()
+/// \param thres    only forwarded to the CPU fallback coilMap2DNIH()
+/// \return false if an exception was caught; otherwise true, or the result of the CPU fallback
+///
+/// Without USE_CUDA, or when cudaJobSplitter() fails, the call is delegated to coilMap2DNIH().
+/// NOTE(review): the device path reinterprets T as float_complext -- presumably T has to be a
+/// single precision complex type; confirm against the instantiations.
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap2DNIHGPU(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks, size_t power, size_t iterNum, typename realType<T>::Type thres)
+{
+    try
+    {
+        #ifdef USE_CUDA
+            GADGET_MSG("call gpu version of coilMap2DNIH ... ");
+
+            long long RO = data.get_size(0);
+            long long E1 = data.get_size(1);
+            long long CHA = data.get_size(2);
+            long long N = data.get_number_of_elements()/(RO*E1*CHA);
+
+            if ( !data.dimensions_equal(&coilMap) )
+            {
+                coilMap = data;
+            }
+
+            // the kernel size is forced to be odd
+            if ( ks%2 != 1 )
+            {
+                ks++;
+            }
+
+            Gadgetron::GadgetronTimer gt_timer1_(false), gt_timer3_(false);
+
+            size_t kss = ks * ks;
+
+            // flip to true to time the individual steps below
+            bool gt3_timing = false;
+
+            if ( N == 1 )
+            {
+                // single frame: run on the currently selected device
+                Gadgetron::GadgetronTimer gt_timer1_(false), gt_timer3_(false);
+
+                cuNDArray<float_complext> device_data;
+                cuNDArray<float_complext> csm(data.get_dimensions());
+                Gadgetron::clear(&csm);
+
+                // work buffers handed to the estimation routine
+                cuNDArray<float_complext > D(RO, E1, kss, CHA);
+                cuNDArray<float_complext > DH_D(RO, E1, CHA, CHA);
+                cuNDArray<float_complext > V1(RO, E1, CHA);
+                cuNDArray<float_complext > U1(RO, E1, kss);
+
+                // wrap the host data as float_complext without copying it
+                const float_complext* pData = reinterpret_cast<const float_complext*>(data.begin());
+                hoNDArray<float_complext> data_tmp(RO, E1, CHA, const_cast<float_complext*>(pData));
+
+                GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("copy data to device ... "));
+                {
+                    device_data = data_tmp;
+                }
+                GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("estimate_b1_map_2D_NIH_Souheil ... "));
+                {
+                    Gadgetron::estimate_b1_map_2D_NIH_Souheil( &device_data, &csm, ks, power,
+                                                                D, DH_D, V1, U1 );
+                }
+                GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("coil map to host ... "));
+                {
+                    // copy the device result straight into the caller's array
+                    csm.to_host(reinterpret_cast<hoNDArray<float_complext>* >(&coilMap));
+                }
+                GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+            }
+            else
+            {
+                // per-frame memory estimate handed to the job splitter
+                size_t jobSize = sizeof(T)*RO*E1*(kss+CHA)*CHA*4.0;
+                size_t minimalMemoryForValidDevice = (size_t)(2.0*1024.0*1024*1024); // 2GB
+
+                // per device: (device id, list of packages, each package a list of frame indices)
+                std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > > jobSchedule;
+                if ( !this->cudaJobSplitter(N, jobSize, minimalMemoryForValidDevice, jobSchedule) )
+                {
+                    // no usable schedule: fall back to the CPU implementation
+                    GADGET_ERROR_MSG("cudaJobSplitter failed, call the cpu coil map estimatoin ... ");
+                    return this->coilMap2DNIH(data, coilMap, algo, ks, power, iterNum, thres);
+                }
+
+                int device;
+                int numOfValidDevices = (int)jobSchedule.size();
+
+                for ( device=0; device<(int)numOfValidDevices; device++ )
+                {
+                    GADGET_MSG("GPU device " << jobSchedule[device].first << " has " << jobSchedule[device].second.size() << " jobs ... ");
+                    GADGET_MSG("Every job has " << jobSchedule[device].second[0].size() << " slics ... ");
+                }
+
+                #pragma omp parallel default(none) private(device) shared(numOfValidDevices, jobSchedule, RO, E1, CHA, kss, ks, power, data, coilMap, gt3_timing) num_threads(numOfValidDevices) if ( numOfValidDevices > 1 )
+                {
+                    Gadgetron::GadgetronTimer gt_timer1_(false), gt_timer3_(false);
+
+                    #pragma omp for
+                    for ( device=0; device<numOfValidDevices; device++ )
+                    {
+                        // Select the GPU that belongs to THIS schedule entry. Keying the selection on
+                        // the loop index (instead of the thread id) keeps jobs and devices matched even
+                        // when the code is built without OpenMP or fewer threads are granted.
+                        cudaSetDevice(jobSchedule[device].first);
+
+                        unsigned int totalJobPackage = jobSchedule[device].second.size();
+                        unsigned int usedN = jobSchedule[device].second[0].size();
+
+                        cuNDArray<float_complext> device_data;
+
+                        cuNDArray<float_complext> csm(RO, E1, usedN, CHA);
+                        Gadgetron::clear(&csm);
+
+                        // work buffers handed to the estimation routine
+                        cuNDArray<float_complext > D(RO*E1*usedN, kss, CHA);
+                        cuNDArray<float_complext > DH_D(RO*E1*usedN, CHA, CHA);
+                        cuNDArray<float_complext > V1(RO*E1*usedN, CHA);
+                        cuNDArray<float_complext > U1(RO*E1*usedN, kss);
+
+                        hoNDArray<T> dataCurr;
+                        hoNDArray<T> coilMapCurr;
+
+                        hoNDArray<T> dataTmp, coilMapTmp;
+
+                        unsigned int package;
+                        for ( package=0; package<totalJobPackage; package++ )
+                        {
+                            unsigned int packageSize = jobSchedule[device].second[package].size();
+                            size_t start = jobSchedule[device].second[package][0];
+                            size_t end = jobSchedule[device].second[package][packageSize-1];
+
+                            size_t usedNPackage = end-start+1;
+
+                            // re-allocate the device buffers only when the package size changes
+                            if ( usedNPackage != usedN )
+                            {
+                                usedN = usedNPackage;
+
+                                device_data.create(RO*E1*usedN, CHA);
+                                csm.create(RO*E1*usedN, CHA);
+                                D.create(RO*E1*usedN, kss, CHA);
+                                DH_D.create(RO*E1*usedN, CHA, CHA);
+                                V1.create(RO*E1*usedN, CHA);
+                                U1.create(RO*E1*usedN, kss);
+                            }
+
+                            // gather the frames [start, end] of this package as [RO E1 CHA usedN]
+                            dataTmp.create(RO, E1, CHA, usedN);
+                            memcpy(dataTmp.begin(), data.begin()+start*RO*E1*CHA, sizeof(T)*RO*E1*CHA*usedN);
+
+                            dataCurr.create(RO, E1, usedN, CHA);
+                            coilMapCurr.create(RO, E1, usedN, CHA);
+
+                            // reorder to [RO E1 usedN CHA] before the upload
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("permute the data ... "));
+                            Gadgetron::permuteLastTwoDimensions(dataTmp, dataCurr);
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            // wrap the permuted host data as float_complext without copying it
+                            hoNDArray<float_complext> data_tmp(dataCurr.get_dimensions(), reinterpret_cast<float_complext*>(dataCurr.begin()));
+
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("copy data to device ... "));
+                            {
+                                device_data = data_tmp;
+                            }
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("estimate_b1_map_2D_NIH_Souheil ... "));
+                            {
+                                Gadgetron::estimate_b1_map_2D_NIH_Souheil( &device_data, &csm, ks, power,
+                                                                            D, DH_D, V1, U1 );
+                            }
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("coil map to host ... "));
+                            {
+                                boost::shared_ptr< hoNDArray<float_complext> > csm_host = csm.to_host();
+                                memcpy(coilMapCurr.begin(), csm_host->begin(), csm_host->get_number_of_bytes());
+                            }
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            coilMapTmp.create(RO, E1, CHA, usedN);
+
+                            // back to [RO E1 CHA usedN], then scatter into the caller's coil map
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("permute the coil map ... "));
+                            Gadgetron::permuteLastTwoDimensions(coilMapCurr, coilMapTmp);
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            memcpy(coilMap.begin()+start*RO*E1*CHA, coilMapTmp.begin(), sizeof(T)*RO*E1*CHA*usedN);
+                        }
+                    }
+                }
+            }
+        #else
+            return this->coilMap2DNIH(data, coilMap, algo, ks, power, iterNum, thres);
+        #endif // USE_CUDA
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap2DNIHGPU(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// CPU coil sensitivity map estimation for 3D data.
+///
+/// \param data     complex input; the first four dimensions are read as RO, E1, E2 and CHA and
+///                 all remaining elements are treated as N independent volumes
+/// \param coilMap  output coil map; assigned from data first if the dimensions differ
+/// \param algo     ISMRMRD_SOUHEIL together with true3D and E2>5*ks selects the 3D kernel
+/// \param ks       kernel size; incremented by one if it is even
+/// \param power    forwarded to the inner estimation routines
+/// \param iterNum  not used by the code visible in this function
+/// \param thres    not used by the code visible in this function
+/// \param true3D   request the 3D kernel (see algo); otherwise every E2 partition is processed
+///                 as an independent 2D problem
+/// \return false if the 3D kernel reports a failure or an exception was caught, true otherwise
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap3DNIH(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks, size_t power, size_t iterNum, typename realType<T>::Type thres, bool true3D)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t E2 = data.get_size(2);
+        size_t CHA = data.get_size(3);
+
+        size_t N = data.get_number_of_elements()/(RO*E1*E2*CHA);
+
+        if ( !data.dimensions_equal(&coilMap) )
+        {
+            coilMap = data;
+        }
+
+        // the kernel size is forced to be odd
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        // the choice of kernel does not depend on the volume index
+        const bool useTrue3D = ( algo==ISMRMRD_SOUHEIL && E2>5*ks && true3D );
+
+        long long n;
+        int e2;
+        for ( n=0; n<(long long)N; n++ )
+        {
+            // non-owning views on the n-th volume of the input and of the output
+            hoNDArray<T> dataCurr(RO, E1, E2, CHA, const_cast<T*>(data.begin()+n*RO*E1*E2*CHA));
+            hoNDArray<T> coilMapCurr(RO, E1, E2, CHA, coilMap.begin()+n*RO*E1*E2*CHA);
+
+            if ( useTrue3D )
+            {
+                GADGET_MSG("calling 3D version of Souhiel coil map estimation ... ");
+                // work on the current volume; handing over the complete arrays here would
+                // process the same data on every iteration of the loop over n
+                GADGET_CHECK_RETURN_FALSE(this->coilMap3DNIHInner(dataCurr, coilMapCurr, ks, power));
+            }
+            else
+            {
+                #pragma omp parallel default(none) private(e2) shared(dataCurr, coilMapCurr, RO, E1, E2, CHA, algo, ks, power, iterNum, thres) if (E2>12)
+                {
+                    // per-thread scratch buffers holding one [RO E1 CHA] partition
+                    hoNDArray<T> data2D(RO, E1, CHA);
+                    hoNDArray<T> coilMap2D(RO, E1, CHA);
+
+                    Gadgetron::GadgetronTimer gt_timer3_(false);
+                    bool timing = false;
+
+                    #pragma omp for
+                    for ( e2=0; e2<(int)E2; e2++ )
+                    {
+                        long long cha;
+
+                        // gather partition e2 of every channel into a contiguous 2D data set
+                        GADGET_CHECK_PERFORM(timing, gt_timer3_.start("memcpy 1 ... "));
+                        for ( cha=0; cha<(long long)CHA; cha++ )
+                        {
+                            memcpy(data2D.begin()+cha*RO*E1, dataCurr.begin()+cha*RO*E1*E2+e2*RO*E1, sizeof(T)*RO*E1);
+                        }
+                        GADGET_CHECK_PERFORM(timing, gt_timer3_.stop());
+
+                        // both algorithm choices end up in the same 2D kernel; its return value is
+                        // not checked because a parallel region must not be left with 'return'
+                        GADGET_CHECK_PERFORM(timing, gt_timer3_.start("coilMap2DNIHInner"));
+                        coilMap2DNIHInner(data2D, coilMap2D, ks, power);
+                        GADGET_CHECK_PERFORM(timing, gt_timer3_.stop());
+
+                        // scatter the 2D result back into partition e2 of the output volume
+                        GADGET_CHECK_PERFORM(timing, gt_timer3_.start("memcpy 2 ... "));
+                        for ( cha=0; cha<(long long)CHA; cha++ )
+                        {
+                            memcpy(coilMapCurr.begin()+cha*RO*E1*E2+e2*RO*E1, coilMap2D.begin()+cha*RO*E1, sizeof(T)*RO*E1);
+                        }
+                        GADGET_CHECK_PERFORM(timing, gt_timer3_.stop());
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap3DNIH(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// GPU computation of a full resolution coil map for 3D data, processed as packages of E2
+/// partitions that are distributed over the available devices.
+///
+/// \param data     complex input; the first four dimensions are read as RO, E1, E2 and CHA and
+///                 all remaining elements are treated as N independent volumes
+/// \param coilMap  output coil map; assigned from data first if the dimensions differ
+/// \param algo     only forwarded to the CPU fallback coilMap3DNIH()
+/// \param ks       kernel size; incremented by one if it is even
+/// \param power    forwarded to estimate_b1_map_2D_NIH_Souheil()
+/// \param iterNum  only forwarded to the CPU fallback coilMap3DNIH()
+/// \param thres    only forwarded to the CPU fallback coilMap3DNIH()
+/// \param true3D   not used by this function and not forwarded to the fallback, which therefore
+///                 runs with coilMap3DNIH()'s own default for that argument
+/// \return false if an exception was caught; otherwise true, or the result of the CPU fallback
+///
+/// The CPU implementation is used when USE_CUDA is not defined, no device is reported, CHA
+/// exceeds 32, or cudaJobSplitter() fails.
+/// NOTE(review): the device path reinterprets T as float_complext -- presumably T has to be a
+/// single precision complex type; confirm against the instantiations.
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilMap3DNIHGPU_FullResMap(const hoNDArray<T>& data, hoNDArray<T>& coilMap, ISMRMRDCOILMAPALGO algo, size_t ks, size_t power, size_t iterNum, typename realType<T>::Type thres, bool true3D)
+{
+    try
+    {
+        #ifdef USE_CUDA
+            GADGET_MSG("compute full resolution coil map using gpu ... ");
+
+            size_t RO = data.get_size(0);
+            size_t E1 = data.get_size(1);
+            size_t E2 = data.get_size(2);
+            size_t CHA = data.get_size(3);
+
+            if ( !data.dimensions_equal(&coilMap) )
+            {
+                coilMap = data;
+            }
+
+            // the kernel size is forced to be odd
+            if ( ks%2 != 1 )
+            {
+                ks++;
+            }
+
+            size_t kss = ks*ks;
+
+            int numOfDevices = cudaDeviceManager::Instance()->getTotalNumberOfDevice();
+            if ( (numOfDevices==0) || (CHA>32) )
+            {
+                return this->coilMap3DNIH(data, coilMap, algo, ks, power, iterNum, thres);
+            }
+
+            // per-partition memory estimate handed to the job splitter
+            size_t jobSize = sizeof(T)*RO*E1*(kss+CHA)*CHA*2.0;
+            size_t minimalMemoryForValidDevice = (size_t)(2.0*1024.0*1024*1024); // 2GB
+
+            // per device: (device id, list of packages, each package a list of E2 indices)
+            std::vector< std::pair<unsigned int, std::vector<std::vector<unsigned int> > > > jobSchedule;
+            if ( !this->cudaJobSplitter(E2, jobSize, minimalMemoryForValidDevice, jobSchedule) )
+            {
+                // no usable schedule: fall back to the CPU implementation
+                GADGET_ERROR_MSG("cudaJobSplitter failed, call the cpu coil map estimatoin ... ");
+                return this->coilMap3DNIH(data, coilMap, algo, ks, power, iterNum, thres);
+            }
+
+            unsigned int numOfValidDevices = jobSchedule.size();
+
+            int device;
+            for ( device=0; device<(int)numOfValidDevices; device++ )
+            {
+                GADGET_MSG("GPU device " << jobSchedule[device].first << " has " << jobSchedule[device].second.size() << " jobs ... ");
+                GADGET_MSG("Every job has " << jobSchedule[device].second[0].size() << " slics ... ");
+            }
+
+            size_t N = data.get_number_of_elements()/(RO*E1*E2*CHA);
+
+            // flip to true to time the individual steps below
+            bool gt3_timing = false;
+
+            long long n;
+            for ( n=0; n<(long long)N; n++ )
+            {
+                // non-owning views on the n-th volume of the input and of the output
+                hoNDArray<T> dataCurr(RO, E1, E2, CHA, const_cast<T*>(data.begin()+n*RO*E1*E2*CHA));
+                hoNDArray<T> coilMapCurr(RO, E1, E2, CHA, coilMap.begin()+n*RO*E1*E2*CHA);
+
+                #pragma omp parallel default(none) private(device) shared(jobSchedule, dataCurr, coilMapCurr, RO, E1, E2, CHA, algo, ks, kss, power, iterNum, thres, numOfValidDevices, gt3_timing) num_threads(numOfValidDevices) if ( numOfValidDevices > 1 )
+                {
+                    Gadgetron::GadgetronTimer gt_timer1_(false), gt_timer3_(false);
+
+                    #pragma omp for
+                    for ( device=0; device<(int)numOfValidDevices; device++ )
+                    {
+                        // Everything below is keyed on the loop index, not on the thread id: the
+                        // schedule entry and the selected GPU then always belong together, also when
+                        // the code is built without OpenMP or fewer threads are granted.
+                        cudaSetDevice(jobSchedule[device].first);
+
+                        int totalJobPackage = jobSchedule[device].second.size();
+
+                        unsigned int usedN = jobSchedule[device].second[0].size();
+
+                        // device buffers for usedN partitions at a time
+                        cuNDArray<float_complext> device_data(RO, E1, usedN, CHA);
+                        cuNDArray<float_complext> csm(RO, E1, usedN, CHA);
+                        cuNDArray<float_complext > D(RO, E1, usedN, kss, CHA);
+                        cuNDArray<float_complext > DH_D(RO, E1, usedN, CHA, CHA);
+                        cuNDArray<float_complext > V1(RO, E1, usedN, CHA);
+                        cuNDArray<float_complext > U1(RO, E1, usedN, kss);
+
+                        hoNDArray<T> dataCurrN;
+                        hoNDArray<T> coilMapCurrN;
+
+                        int ii;
+                        for ( ii=0; ii<totalJobPackage; ii++ )
+                        {
+                            unsigned int packageSize = jobSchedule[device].second[ii].size();
+
+                            size_t start = jobSchedule[device].second[ii][0];
+                            size_t end = jobSchedule[device].second[ii][packageSize-1];
+
+                            size_t usedNCurr = end-start+1;
+
+                            // re-allocate the device buffers only when the package size changes
+                            if ( usedNCurr != usedN )
+                            {
+                                usedN = usedNCurr;
+
+                                device_data.create(RO, E1, usedN, CHA);
+                                csm.create(RO, E1, usedN, CHA);
+                                D.create(RO, E1, usedN, kss, CHA);
+                                DH_D.create(RO, E1, usedN, CHA, CHA);
+                                V1.create(RO, E1, usedN, CHA);
+                                U1.create(RO, E1, usedN, kss);
+                            }
+
+                            dataCurrN.create(RO, E1, usedN, CHA);
+                            coilMapCurrN.create(RO, E1, usedN, CHA);
+
+                            // extract the partitions [start, end] along the third dimension
+                            Gadgetron::cropOver3rdDimension(dataCurr, dataCurrN, start, end);
+
+                            // wrap the cropped host data as float_complext without copying it
+                            hoNDArray<float_complext> data_tmp(dataCurrN.get_dimensions(), reinterpret_cast<float_complext*>(dataCurrN.begin()));
+
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("copy data to device ... "));
+                            {
+                                device_data = data_tmp;
+                            }
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("estimate_b1_map_2D_NIH_Souheil ... "));
+                            {
+                                Gadgetron::estimate_b1_map_2D_NIH_Souheil( &device_data, &csm, ks, power,
+                                                                            D, DH_D, V1, U1 );
+                            }
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.start("coil map to host ... "));
+                            {
+                                csm.to_host(reinterpret_cast<hoNDArray<float_complext>* >(&coilMapCurrN));
+                            }
+                            GADGET_CHECK_PERFORM(gt3_timing, gt_timer3_.stop());
+
+                            // write the package result back into partitions [start, end] of the volume
+                            Gadgetron::setSubArrayOver3rdDimension(coilMapCurrN, coilMapCurr, start, end);
+                        }
+                    }
+                }
+            }
+        #else
+            return this->coilMap3DNIH(data, coilMap, algo, ks, power, iterNum, thres);
+        #endif // USE_CUDA
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilMap3DNIHGPU_FullResMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Root-sum-of-squares combination: multiplies data with its own conjugate, sums the result
+/// over one dimension and takes the square root.
+///
+/// \param data  input array with at least three dimensions
+/// \param sos   receives the combined result
+/// \return false if a check or helper fails or an exception was caught, true otherwise
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+sumOfSquare(const hoNDArray<T>& data, hoNDArray<T>& sos)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+
+        // tmp = data .* conj(data)
+        hoNDArray<T> tmp(data);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(data, data, tmp));
+
+        // the helper used for the summation depends on the rank of the input
+        switch ( NDim )
+        {
+            case 3:
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(tmp, sos));
+                break;
+
+            case 4:
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverSecondLastDimension(tmp, sos));
+                break;
+
+            default:
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver3rdDimension(tmp, sos));
+                break;
+        }
+
+        // in-place square root of the summed magnitudes
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sqrt(sos, sos));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::sumOfSquare(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Coil combination: every coilMap-sized chunk of data is multiplied with the conjugate of
+/// coilMap and summed over the third dimension.
+///
+/// \param data      input array with at least three dimensions; must hold at least as many
+///                  elements as coilMap and match it along every coilMap dimension of size > 1
+/// \param coilMap   non-empty coil map with at least three dimensions
+/// \param combined  output; created with the dimensions of data minus its third dimension
+/// \return false if an argument check or a helper fails or an exception was caught
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilCombine(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        size_t NDimCoil = coilMap.get_number_of_dimensions();
+
+        // the third dimension is erased/overwritten below, so it has to exist in both arrays
+        GADGET_CHECK_RETURN_FALSE(NDim>=3);
+        GADGET_CHECK_RETURN_FALSE(NDimCoil>=3);
+
+        // the element count of the coil map is used as a divisor below
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_number_of_elements()>0);
+
+        GADGET_CHECK_RETURN_FALSE(data.get_number_of_elements()>=coilMap.get_number_of_elements());
+
+        size_t n;
+        for ( n=0; n<NDimCoil; n++ )
+        {
+            if ( n<NDim && coilMap.get_size(n)>1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(data.get_size(n)==coilMap.get_size(n));
+            }
+        }
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        boost::shared_ptr< std::vector<size_t> > dimCoil = coilMap.get_dimensions();
+
+        // N: elements per chunk; num: number of coilMap-sized chunks in data
+        size_t N = coilMap.get_number_of_elements();
+        size_t num = data.get_number_of_elements()/coilMap.get_number_of_elements();
+
+        // output keeps the dimensions of data without the third one
+        std::vector<size_t> dimCombined(*dim);
+        dimCombined.erase(dimCombined.begin()+2);
+        combined.create(&dimCombined);
+
+        // shape of one combined chunk: the coil map shape with a singleton third dimension
+        std::vector<size_t> dimCombinedCurr(*dimCoil);
+        dimCombinedCurr[2] = 1;
+
+        size_t NCombined = combined.get_number_of_elements()/num;
+
+        // the chunks are processed serially, which allows a failing helper to be reported
+        hoNDArray<T> dataTmp(coilMap);
+        hoNDArray<T> dataCurr;
+        hoNDArray<T> dataCombinedCurr;
+
+        long long nn;
+        for ( nn=0; nn<(long long)num; nn++ )
+        {
+            // view on the nn-th chunk of data, multiplied with conj(coilMap) into dataTmp
+            dataCurr.create(dimCoil.get(), const_cast<T*>(data.begin()+nn*N));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(dataCurr, coilMap, dataTmp));
+
+            // view on the matching part of the output, filled with the sum over the third dimension
+            dataCombinedCurr.create(&dimCombinedCurr, const_cast<T*>(combined.begin()+nn*NCombined));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver3rdDimension(dataTmp, dataCombinedCurr));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilCombine(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Coil combination for 3D data: every coilMap-sized chunk of data is multiplied with the
+/// conjugate of coilMap and summed over the fourth dimension.
+///
+/// \param data      input array with at least four dimensions whose first four sizes equal
+///                  those of coilMap
+/// \param coilMap   non-empty coil map with at least four dimensions
+/// \param combined  output; created with the dimensions of data minus its fourth dimension
+/// \return false if an argument check fails or an exception was caught, true otherwise
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+coilCombine3D(const hoNDArray<T>& data, const hoNDArray<T>& coilMap, hoNDArray<T>& combined)
+{
+    try
+    {
+        size_t NDim = data.get_number_of_dimensions();
+        size_t NDimCoil = coilMap.get_number_of_dimensions();
+
+        // the fourth dimension is erased/overwritten below, so it has to exist in both arrays
+        GADGET_CHECK_RETURN_FALSE(NDim>=4);
+        GADGET_CHECK_RETURN_FALSE(NDimCoil>=4);
+
+        // the element count of the coil map is used as a divisor below
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_number_of_elements()>0);
+
+        GADGET_CHECK_RETURN_FALSE(data.get_number_of_elements()>=coilMap.get_number_of_elements());
+
+        GADGET_CHECK_RETURN_FALSE(data.get_size(0)==coilMap.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(1)==coilMap.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(2)==coilMap.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(data.get_size(3)==coilMap.get_size(3));
+
+        boost::shared_ptr< std::vector<size_t> > dim = data.get_dimensions();
+        boost::shared_ptr< std::vector<size_t> > dimCoil = coilMap.get_dimensions();
+
+        // N: elements per chunk; num: number of coilMap-sized chunks in data
+        size_t N = coilMap.get_number_of_elements();
+        size_t num = data.get_number_of_elements()/coilMap.get_number_of_elements();
+
+        // output keeps the dimensions of data without the fourth one
+        std::vector<size_t> dimCombined(*dim);
+        dimCombined.erase(dimCombined.begin()+3);
+        combined.create(&dimCombined);
+
+        // shape of one combined chunk: the coil map shape with a singleton fourth dimension
+        std::vector<size_t> dimCombinedCurr(*dimCoil);
+        dimCombinedCurr[3] = 1;
+
+        size_t NCombined = combined.get_number_of_elements()/num;
+
+        long long nn;
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(nn) shared(num, dimCoil, dimCombinedCurr, N, NCombined) if (num>=6)
+        #else
+            #pragma omp parallel default(none) private(nn) shared(data, coilMap, num, dimCoil, dimCombinedCurr, combined, N, NCombined) if (num>=6)
+        #endif
+        {
+            // one scratch buffer per thread
+            hoNDArray<T> dataTmp(coilMap);
+
+            #pragma omp for
+            for ( nn=0; nn<(long long)num; nn++ )
+            {
+                // view on the nn-th chunk of data, multiplied with conj(coilMap) into dataTmp;
+                // helper results are not checked because the parallel region cannot be left early
+                hoNDArray<T> dataCurr(dimCoil.get(), const_cast<T*>(data.begin()+nn*N));
+                Gadgetron::multiplyConj(dataCurr, coilMap, dataTmp);
+
+                // view on the matching part of the output, filled with the sum over the fourth dimension
+                hoNDArray<T> dataCombinedCurr(&dimCombinedCurr, const_cast<T*>(combined.begin()+nn*NCombined));
+                Gadgetron::sumOver4thDimension(dataTmp, dataCombinedCurr);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::coilCombine3D(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Builds the conjugate-symmetric counterpart of every 2D [RO E1] plane of kspace:
+/// dst(ro, e1) = conj(src(cro, ce1)), where cro/ce1 are the indices mirrored about RO/2 and
+/// E1/2 and wrapped back into range.
+///
+/// \param kspace      input array; the first two dimensions are read as RO and E1
+/// \param kspaceConj  output; (re)created with the dimensions of kspace if they differ
+/// \return false if an exception was caught, true otherwise
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+conjugateSymmetry2D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj)
+{
+    try
+    {
+        if ( !kspaceConj.dimensions_equal(&kspace) )
+        {
+            kspaceConj.create(kspace.get_dimensions());
+        }
+
+        long long RO = kspace.get_size(0);
+        long long E1 = kspace.get_size(1);
+        long long num = kspace.get_number_of_elements()/(RO*E1);
+
+        long long centerRO = RO/2;
+        long long centerE1 = E1/2;
+
+        long long ii;
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(ii) shared(RO, E1, num, centerRO, centerE1)
+        #else
+            #pragma omp parallel for default(none) private(ii) shared(RO, E1, num, centerRO, centerE1, kspace, kspaceConj)
+        #endif
+        for ( ii=0; ii<num; ii++ )
+        {
+            // non-owning 2D views on the ii-th plane of the input and of the output
+            ho2DArray<T> planeIn(RO, E1, const_cast<T*>(kspace.begin()+ii*RO*E1));
+            ho2DArray<T> planeOut(RO, E1, const_cast<T*>(kspaceConj.begin()+ii*RO*E1));
+
+            for ( long long row=0; row<E1; row++ )
+            {
+                // mirrored E1 index, wrapped around once it runs past the last line
+                const long long flippedE1 = 2*centerE1-row;
+                const long long mirrorE1 = ( flippedE1 > E1-1 ) ? flippedE1-E1 : flippedE1;
+
+                for ( long long col=0; col<RO; col++ )
+                {
+                    // mirrored RO index, wrapped the same way
+                    const long long flippedRO = 2*centerRO-col;
+                    const long long mirrorRO = ( flippedRO > RO-1 ) ? flippedRO-RO : flippedRO;
+
+                    planeOut(col, row) = std::conj(planeIn(mirrorRO, mirrorE1));
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::conjugateSymmetry2D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Builds the conjugate-symmetric counterpart of every 3D [RO E1 E2] volume of kspace:
+/// dst(ro, e1, e2) = conj(src(cro, ce1, ce2)), where cro/ce1/ce2 are the indices mirrored about
+/// RO/2, E1/2 and E2/2 and wrapped back into range.
+///
+/// \param kspace      input array; the first three dimensions are read as RO, E1 and E2
+/// \param kspaceConj  output; (re)created with the dimensions of kspace if they differ
+/// \return false if an exception was caught, true otherwise
+template <typename T>
+bool gtPlusISMRMRDReconUtilComplex<T>::
+conjugateSymmetry3D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj)
+{
+    try
+    {
+        if ( !kspaceConj.dimensions_equal(&kspace) )
+        {
+            kspaceConj.create(kspace.get_dimensions());
+        }
+
+        long long RO = kspace.get_size(0);
+        long long E1 = kspace.get_size(1);
+        long long E2 = kspace.get_size(2);
+        long long num = kspace.get_number_of_elements()/(RO*E1*E2);
+
+        long long centerRO = RO/2;
+        long long centerE1 = E1/2;
+        long long centerE2 = E2/2;
+
+        long long ii;
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel for default(none) private(ii) shared(RO, E1, E2, num, centerRO, centerE1, centerE2)
+        #else
+            #pragma omp parallel for default(none) private(ii) shared(RO, E1, E2, num, centerRO, centerE1, centerE2, kspace, kspaceConj)
+        #endif
+        for ( ii=0; ii<num; ii++ )
+        {
+            // non-owning 3D views on the ii-th volume of the input and of the output
+            ho3DArray<T> volIn(RO, E1, E2, const_cast<T*>(kspace.begin()+ii*RO*E1*E2));
+            ho3DArray<T> volOut(RO, E1, E2, const_cast<T*>(kspaceConj.begin()+ii*RO*E1*E2));
+
+            for ( long long par=0; par<E2; par++ )
+            {
+                // mirrored E2 index, wrapped around once it runs past the last partition
+                const long long flippedE2 = 2*centerE2-par;
+                const long long mirrorE2 = ( flippedE2 > E2-1 ) ? flippedE2-E2 : flippedE2;
+
+                for ( long long row=0; row<E1; row++ )
+                {
+                    // mirrored E1 index, wrapped the same way
+                    const long long flippedE1 = 2*centerE1-row;
+                    const long long mirrorE1 = ( flippedE1 > E1-1 ) ? flippedE1-E1 : flippedE1;
+
+                    for ( long long col=0; col<RO; col++ )
+                    {
+                        // mirrored RO index, wrapped the same way
+                        const long long flippedRO = 2*centerRO-col;
+                        const long long mirrorRO = ( flippedRO > RO-1 ) ? flippedRO-RO : flippedRO;
+
+                        volOut(col, row, par) = std::conj(volIn(mirrorRO, mirrorE1, mirrorE2));
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconUtilComplex<T>::conjugateSymmetry3D(const hoNDArray<T>& kspace, hoNDArray<T>& kspaceConj) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h
new file mode 100644
index 0000000..9564866
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlow.h
@@ -0,0 +1,525 @@
+/** \file   gtPlusISMRMRDReconWorkFlow.h
+    \brief  Define the base class for the GtPlus reconstruction workflow
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+
+#include "util/gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// Comparison functor for (dimension, size) records.
+/// A record is ordered before another one when its size (the pair's second
+/// member) is strictly larger, i.e. sorting with it yields descending sizes.
+struct DimensionRecordCompare
+{
+    DimensionRecordCompare() {}
+    ~DimensionRecordCompare() {}
+
+    bool operator()(const std::pair<ISMRMRDDIM, size_t>& lhs, const std::pair<ISMRMRDDIM, size_t>& rhs) const
+    {
+        const bool lhsIsLarger = rhs.second < lhs.second;
+        return lhsIsLarger;
+    }
+};
+
+/// Abstract base class of the GtPlus reconstruction workflow.
+/// A workflow holds non-owning pointers to a worker, a work order, the input
+/// data/reference arrays and an optional noise scan, and splits the
+/// reconstruction into three stages (preProcessing, recon, postProcessing)
+/// that derived classes must implement.
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlow
+{
+public:
+
+    // a (dimension tag, value) record; the value is a size or a starting index depending on the member
+    typedef std::pair<ISMRMRDDIM, size_t> DimensionRecordType;
+
+    // default constructor: no worker and no work order attached (both pointers are NULL)
+    gtPlusISMRMRDReconWorkFlow();
+    // attach an existing worker and work order; only their addresses are stored
+    gtPlusISMRMRDReconWorkFlow(gtPlusReconWorker<T>& worker, gtPlusReconWorkOrder<T>& workOrder);
+    virtual ~gtPlusISMRMRDReconWorkFlow();
+
+    // write a short description of the workflow to os
+    void printInfo(std::ostream& os);
+
+    // stage 1 of the workflow, implemented by derived classes
+    virtual bool preProcessing() = 0;
+
+    // stage 2 of the workflow, implemented by derived classes
+    virtual bool recon() = 0;
+
+    // stage 3 of the workflow, implemented by derived classes
+    virtual bool postProcessing() = 0;
+
+    // assemble the ISMRMRD dimension index
+    // ind must have 9 elements
+    // stores value at position (dim-DIM_ReadOut); returns false if ind is too short for that position
+    bool ismrmrdDimIndex9D(std::vector<size_t>& ind, const ISMRMRDDIM& dim, size_t value);
+
+    // find the permute order for ISMRMRD
+    // order[dst] receives the position in dimsSrc of the dimension dimsDst[dst];
+    // both vectors must have the same length
+    bool findISMRMRDPermuteOrder(const std::vector<ISMRMRDDIM>& dimsSrc, const std::vector<ISMRMRDDIM>& dimsDst, std::vector<size_t>& order);
+
+    // print the dimension names
+    std::string printISMRMRDDimensions(const std::vector<ISMRMRDDIM>& dims);
+
+    // print the dimension size
+    std::string printISMRMRDDimensionSize(const std::vector<size_t>& sizes);
+
+    // store the address of the data array and record the sizes of its dimensions 0..9
+    bool setDataArray(hoNDArray<T>& data);
+    // store the address of the reference array and record the sizes of its dimensions 0..9
+    bool setRefArray(hoNDArray<T>& ref);
+
+    // -------- these member variables are made as public ------------- //
+
+    // recon worker to do the computation
+    gtPlusReconWorker<T>* worker_;
+
+    // recon work order
+    gtPlusReconWorkOrder<T>* workOrder_;
+
+    // ----------------------------------
+    // noise prewhitening
+    // ----------------------------------
+    // noise scan, 3D array [RO E1 CHA]
+    hoNDArray<T>* noise_;
+
+    // noise bandwidth (Hz/pixel)
+    double noiseBW_;
+
+    // noise equivalent bandwidth ratio for receiver
+    double receriverBWRatio_;
+
+    // ADC sampling time in second
+    double ADCSamplingTimeinSecond_;
+
+    // RO oversampling ratio
+    double overSamplingRatioRO_;
+
+    // ----------------------------------
+    // final image sizes for RO/E1/E2
+    // ----------------------------------
+    // NOTE: the recon sizes and the FOV values below are not initialized by the
+    // constructors of this class; they must be assigned before they are read
+    size_t reconSizeRO_;
+    size_t reconSizeE1_;
+    size_t reconSizeE2_;
+
+    // field of view of the encoded space along RO/E1/E2
+    float encodingFOV_RO_;
+    float encodingFOV_E1_;
+    float encodingFOV_E2_;
+
+    // field of view of the reconstructed space along RO/E1/E2
+    float reconFOV_RO_;
+    float reconFOV_E1_;
+    float reconFOV_E2_;
+
+    // ----------------------------------
+    // dimension and starting indexes for this data_
+    // in case this data_ is a portion of a larger dataset
+    // ----------------------------------
+    std::vector< DimensionRecordType > dataDimStartingIndexes_;
+
+    // ----------------------------------
+    // reconstruction results, complex images, 8D array [RO E1 SLC E2 CON PHS REP SET]
+    // ----------------------------------
+    hoNDArray<T> res_;
+
+    // ----------------------------------
+    // debug and timing
+    // ----------------------------------
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    // timing switch; set to false by the constructors
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+protected:
+
+    // ----------------------------------
+    // input data array
+    // ----------------------------------
+    // image data, [RO E1 CHA SLC E2 CON PHS REP SET SEG]
+    // (non-owning pointer, set by setDataArray)
+    hoNDArray<T>* data_;
+
+    // reference calibration, [RO E1 CHA SLC E2 CON PHS REP SET SEG]
+    // (non-owning pointer, set by setRefArray)
+    hoNDArray<T>* ref_;
+
+    // internal helper memory allocated for computation
+    hoNDArray<T> dataCurr_;
+    hoNDArray<T> refCurr_;
+
+    // size of dimensions for image data
+    DimensionRecordType RO_;
+    DimensionRecordType E1_;
+    DimensionRecordType CHA_;
+    DimensionRecordType SLC_;
+    DimensionRecordType E2_;
+    DimensionRecordType CON_;
+    DimensionRecordType PHS_;
+    DimensionRecordType REP_;
+    DimensionRecordType SET_;
+    DimensionRecordType SEG_;
+
+    // size of dimensions for ref data
+    DimensionRecordType RO_ref_;
+    DimensionRecordType E1_ref_;
+    DimensionRecordType CHA_ref_;
+    DimensionRecordType SLC_ref_;
+    DimensionRecordType E2_ref_;
+    DimensionRecordType CON_ref_;
+    DimensionRecordType PHS_ref_;
+    DimensionRecordType REP_ref_;
+    DimensionRecordType SET_ref_;
+    DimensionRecordType SEG_ref_;
+
+    // expected dimensions for results
+    std::vector<ISMRMRDDIM> dimsRes_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+};
+
+/// Default constructor.
+/// Leaves the workflow without worker, work order, data, reference or noise
+/// (all pointers NULL), sets the noise/sampling parameters to 1.0, disables
+/// timing, gives every data and reference dimension an extent of 1 and fills
+/// the result-dimension list and the per-dimension starting indexes (all 0).
+template <typename T> 
+gtPlusISMRMRDReconWorkFlow<T>::gtPlusISMRMRDReconWorkFlow() 
+: worker_(NULL), workOrder_(NULL), noise_(NULL),
+  noiseBW_(1.0), receriverBWRatio_(1.0), ADCSamplingTimeinSecond_(1.0), overSamplingRatioRO_(1.0),
+  performTiming_(false), data_(NULL), ref_(NULL)
+{
+    // image data: dimension tag plus an extent of 1
+    RO_  = DimensionRecordType(DIM_ReadOut, 1);
+    E1_  = DimensionRecordType(DIM_Encoding1, 1);
+    CHA_ = DimensionRecordType(DIM_Channel, 1);
+    SLC_ = DimensionRecordType(DIM_Slice, 1);
+    E2_  = DimensionRecordType(DIM_Encoding2, 1);
+    CON_ = DimensionRecordType(DIM_Contrast, 1);
+    PHS_ = DimensionRecordType(DIM_Phase, 1);
+    REP_ = DimensionRecordType(DIM_Repetition, 1);
+    SET_ = DimensionRecordType(DIM_Set, 1);
+    SEG_ = DimensionRecordType(DIM_Segment, 1);
+
+    // reference data: same tags, same initial extent
+    RO_ref_  = DimensionRecordType(DIM_ReadOut, 1);
+    E1_ref_  = DimensionRecordType(DIM_Encoding1, 1);
+    CHA_ref_ = DimensionRecordType(DIM_Channel, 1);
+    SLC_ref_ = DimensionRecordType(DIM_Slice, 1);
+    E2_ref_  = DimensionRecordType(DIM_Encoding2, 1);
+    CON_ref_ = DimensionRecordType(DIM_Contrast, 1);
+    PHS_ref_ = DimensionRecordType(DIM_Phase, 1);
+    REP_ref_ = DimensionRecordType(DIM_Repetition, 1);
+    SET_ref_ = DimensionRecordType(DIM_Set, 1);
+    SEG_ref_ = DimensionRecordType(DIM_Segment, 1);
+
+    // the ten data dimensions in storage order; the first nine are also the result dimensions
+    const ISMRMRDDIM dimOrder[] =
+    {
+        DIM_ReadOut, DIM_Encoding1, DIM_Channel, DIM_Slice, DIM_Encoding2,
+        DIM_Contrast, DIM_Phase, DIM_Repetition, DIM_Set, DIM_Segment
+    };
+    const size_t numDataDims = sizeof(dimOrder)/sizeof(dimOrder[0]);
+    const size_t numResDims = numDataDims - 1;
+
+    dimsRes_.assign(dimOrder, dimOrder+numResDims);
+
+    // every dimension of the data starts at index 0
+    dataDimStartingIndexes_.reserve(numDataDims);
+    for ( size_t d=0; d<numDataDims; d++ )
+    {
+        dataDimStartingIndexes_.push_back(DimensionRecordType(dimOrder[d], 0));
+    }
+
+    // the timers must not report when they are destroyed
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+/// Construct a workflow attached to an existing worker and work order.
+/// Only the addresses of worker and workOrder are stored, so both objects
+/// must outlive the workflow. Everything else is set up exactly like in the
+/// default constructor: no data/reference/noise, noise and sampling
+/// parameters 1.0, timing disabled, every dimension with extent 1.
+template <typename T> 
+gtPlusISMRMRDReconWorkFlow<T>::gtPlusISMRMRDReconWorkFlow(gtPlusReconWorker<T>& worker, gtPlusReconWorkOrder<T>& workOrder) 
+: data_(NULL), ref_(NULL), worker_(&worker), workOrder_(&workOrder), noise_(NULL),
+  noiseBW_(1.0), receriverBWRatio_(1.0), overSamplingRatioRO_(1.0), ADCSamplingTimeinSecond_(1.0) , performTiming_(false)
+{
+    // Set the dimension tag together with the extent. Assigning only .second
+    // would leave every tag value-initialized instead of naming its dimension,
+    // which is what the default constructor guarantees.
+    RO_  = DimensionRecordType(DIM_ReadOut, 1);
+    E1_  = DimensionRecordType(DIM_Encoding1, 1);
+    CHA_ = DimensionRecordType(DIM_Channel, 1);
+    SLC_ = DimensionRecordType(DIM_Slice, 1);
+    E2_  = DimensionRecordType(DIM_Encoding2, 1);
+    CON_ = DimensionRecordType(DIM_Contrast, 1);
+    PHS_ = DimensionRecordType(DIM_Phase, 1);
+    REP_ = DimensionRecordType(DIM_Repetition, 1);
+    SET_ = DimensionRecordType(DIM_Set, 1);
+    SEG_ = DimensionRecordType(DIM_Segment, 1);
+
+    RO_ref_  = DimensionRecordType(DIM_ReadOut, 1);
+    E1_ref_  = DimensionRecordType(DIM_Encoding1, 1);
+    CHA_ref_ = DimensionRecordType(DIM_Channel, 1);
+    SLC_ref_ = DimensionRecordType(DIM_Slice, 1);
+    E2_ref_  = DimensionRecordType(DIM_Encoding2, 1);
+    CON_ref_ = DimensionRecordType(DIM_Contrast, 1);
+    PHS_ref_ = DimensionRecordType(DIM_Phase, 1);
+    REP_ref_ = DimensionRecordType(DIM_Repetition, 1);
+    SET_ref_ = DimensionRecordType(DIM_Set, 1);
+    SEG_ref_ = DimensionRecordType(DIM_Segment, 1);
+
+    // expected dimensions of the result array
+    dimsRes_.resize(9);
+    dimsRes_[0] = DIM_ReadOut;
+    dimsRes_[1] = DIM_Encoding1;
+    dimsRes_[2] = DIM_Channel;
+    dimsRes_[3] = DIM_Slice;
+    dimsRes_[4] = DIM_Encoding2;
+    dimsRes_[5] = DIM_Contrast;
+    dimsRes_[6] = DIM_Phase;
+    dimsRes_[7] = DIM_Repetition;
+    dimsRes_[8] = DIM_Set;
+
+    // every dimension of the data starts at index 0
+    dataDimStartingIndexes_.resize(10);
+    dataDimStartingIndexes_[0] = DimensionRecordType(DIM_ReadOut, 0);
+    dataDimStartingIndexes_[1] = DimensionRecordType(DIM_Encoding1, 0);
+    dataDimStartingIndexes_[2] = DimensionRecordType(DIM_Channel, 0);
+    dataDimStartingIndexes_[3] = DimensionRecordType(DIM_Slice, 0);
+    dataDimStartingIndexes_[4] = DimensionRecordType(DIM_Encoding2, 0);
+    dataDimStartingIndexes_[5] = DimensionRecordType(DIM_Contrast, 0);
+    dataDimStartingIndexes_[6] = DimensionRecordType(DIM_Phase, 0);
+    dataDimStartingIndexes_[7] = DimensionRecordType(DIM_Repetition, 0);
+    dataDimStartingIndexes_[8] = DimensionRecordType(DIM_Set, 0);
+    dataDimStartingIndexes_[9] = DimensionRecordType(DIM_Segment, 0);
+
+    // the timers must not report when they are destroyed
+    gt_timer1_.set_timing_in_destruction(false);
+    gt_timer2_.set_timing_in_destruction(false);
+    gt_timer3_.set_timing_in_destruction(false);
+}
+
+/// Destructor. The body is empty: this class performs no explicit cleanup.
+template <typename T> 
+gtPlusISMRMRDReconWorkFlow<T>::~gtPlusISMRMRDReconWorkFlow() 
+{
+    // intentionally empty
+}
+
+/// Write a fixed, human-readable description of the three-stage workflow to os.
+/// Every line is terminated with std::endl, so the stream is flushed per line.
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlow<T>::printInfo(std::ostream& os)
+{
+    os << "-------------- GTPlus ISMRMRD Recon workflow -------------" << std::endl
+       << "Implementation of general reconstruction workflow for ISMRMRD convention" << std::endl
+       << "the gtPlusISMRMRDReconWorkFlow defines and implements the reconstruction workflow for the ISMRMRD definition" << std::endl
+       << "the reconstruction is split into three stages:" << std::endl
+       << "1) PreProcessing" << std::endl
+       << "2) Reconstruction" << std::endl
+       << "3) PostProcessing" << std::endl
+       << std::endl
+       << "These three steps can have different operations for different sampling patterns or imaging applications" << std::endl
+       << "----------------------------------------------------------" << std::endl;
+}
+
+/// Store value in the slot of ind that belongs to dimension dim.
+/// The slot is the offset of dim from DIM_ReadOut.
+/// \param ind    index vector to update; must be long enough to hold the slot
+/// \param dim    dimension whose index is set
+/// \param value  index value to store
+/// \return true on success; the size check fails when ind is too short
+template <typename T> 
+inline bool gtPlusISMRMRDReconWorkFlow<T>::
+ismrmrdDimIndex9D(std::vector<size_t>& ind, const ISMRMRDDIM& dim, size_t value)
+{
+    GADGET_CHECK_RETURN_FALSE(ind.size()>(size_t)(dim-DIM_ReadOut));
+
+    size_t& slot = ind[dim-DIM_ReadOut];
+    slot = value;
+
+    return true;
+}
+
+/// Compute the permutation that reorders dimsSrc into dimsDst.
+/// For every destination position dst, order[dst] is the position of the
+/// first entry of dimsSrc that equals dimsDst[dst].
+/// \param dimsSrc  dimension order of the source array
+/// \param dimsDst  requested dimension order; must have the same length as dimsSrc
+/// \param order    output, resized to the number of dimensions
+/// \return false if the two lists differ in length or if a destination
+///         dimension does not occur in dimsSrc; true otherwise
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::
+findISMRMRDPermuteOrder(const std::vector<ISMRMRDDIM>& dimsSrc, const std::vector<ISMRMRDDIM>& dimsDst, std::vector<size_t>& order)
+{
+    GADGET_CHECK_RETURN_FALSE(dimsSrc.size()==dimsDst.size());
+    order.resize(dimsSrc.size());
+
+    size_t NDim = dimsSrc.size();
+    size_t src, dst;
+
+    for ( dst=0; dst<NDim; dst++ )
+    {
+        for ( src=0; src<NDim; src++ )
+        {
+            if ( dimsSrc[src] == dimsDst[dst] )
+                break;
+        }
+
+        // when the search falls through, src equals NDim, which is not a valid
+        // position; report the failure instead of handing out an out-of-range index
+        GADGET_CHECK_RETURN_FALSE(src<NDim);
+
+        order[dst] = src;
+    }
+
+    return true;
+}
+
+/// Build a printable list of dimension names.
+/// \param dims  dimensions to name
+/// \return "[ ]" (no line break) for an empty list; otherwise "[ " followed by
+///         one space-terminated label per dimension, then "]" and a line break.
+///         Dimensions without a dedicated label are printed as "Other".
+template <typename T> 
+std::string gtPlusISMRMRDReconWorkFlow<T>::
+printISMRMRDDimensions(const std::vector<ISMRMRDDIM>& dims)
+{
+    using namespace std;
+
+    if ( dims.empty() ) return std::string("[ ]");
+
+    size_t NDim = dims.size();
+
+    size_t ii;
+
+    std::ostringstream os;
+
+    os << "[ ";
+    for ( ii=0; ii<NDim; ii++ )
+    {
+        ISMRMRDDIM dim = dims[ii];
+        switch (dim)
+        {
+            case DIM_ReadOut:
+                os << "DIM_ReadOut ";
+            break;
+
+            case DIM_Encoding1:
+                os << "Encoding1 ";
+            break;
+
+            case DIM_Channel:
+                os << "Channel ";
+            break;
+
+            case DIM_Slice:
+                os << "Slice ";
+            break;
+
+            case DIM_Encoding2:
+                os << "Encoding2 ";
+            break;
+
+            case DIM_Contrast:
+                os << "Contrast ";
+            break;
+
+            case DIM_Phase:
+                os << "Phase ";
+            break;
+
+            case DIM_Repetition:
+                // label spelled correctly (was "Repitition")
+                os << "Repetition ";
+            break;
+
+            case DIM_Set:
+                os << "Set ";
+            break;
+
+            case DIM_Segment:
+                os << "Segment ";
+            break;
+
+            default:
+                // trailing separator like every other label, so that neighbouring
+                // labels and the closing bracket stay separated by one space
+                os << "Other ";
+            break;
+        }
+    }
+    os << "]" << endl;
+
+    std::string dimStr(os.str());
+    return dimStr;
+}
+
+/// Build a printable list of dimension sizes.
+/// \param sizes  sizes to print
+/// \return "[ ]" (no line break) for an empty list; otherwise "[ " followed by
+///         every size and a space, then "]" and a line break
+template <typename T> 
+std::string gtPlusISMRMRDReconWorkFlow<T>::
+printISMRMRDDimensionSize(const std::vector<size_t>& sizes)
+{
+    if ( sizes.empty() )
+    {
+        return std::string("[ ]");
+    }
+
+    std::ostringstream text;
+    text << "[ ";
+
+    for ( std::vector<size_t>::const_iterator it=sizes.begin(); it!=sizes.end(); ++it )
+    {
+        text << *it << " ";
+    }
+
+    text << "]" << std::endl;
+
+    return text.str();
+}
+
+/// Attach the image data array.
+/// Stores the address of data (the array is not copied) and records the sizes
+/// of its dimensions 0..9 in RO_, E1_, CHA_, SLC_, E2_, CON_, PHS_, REP_, SET_
+/// and SEG_, in that order.
+/// \return false if any exception is thrown while doing so, true otherwise
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::setDataArray(hoNDArray<T>& data)
+{
+    try
+    {
+        data_ = &data;
+
+        // dimension records in the storage order of the data array
+        DimensionRecordType* const records[] =
+        {
+            &RO_, &E1_, &CHA_, &SLC_, &E2_, &CON_, &PHS_, &REP_, &SET_, &SEG_
+        };
+        const size_t numRecords = sizeof(records)/sizeof(records[0]);
+
+        for ( size_t d=0; d<numRecords; d++ )
+        {
+            records[d]->second = data.get_size(d);
+        }
+    }
+    catch(...)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+/// Attach the reference (calibration) array.
+/// Stores the address of ref (the array is not copied) and records the sizes
+/// of its dimensions 0..9 in RO_ref_, E1_ref_, CHA_ref_, SLC_ref_, E2_ref_,
+/// CON_ref_, PHS_ref_, REP_ref_, SET_ref_ and SEG_ref_, in that order.
+/// \return false if any exception is thrown while doing so, true otherwise
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlow<T>::setRefArray(hoNDArray<T>& ref)
+{
+    try
+    {
+        ref_ = &ref;
+
+        // dimension records in the storage order of the reference array
+        DimensionRecordType* const records[] =
+        {
+            &RO_ref_, &E1_ref_, &CHA_ref_, &SLC_ref_, &E2_ref_,
+            &CON_ref_, &PHS_ref_, &REP_ref_, &SET_ref_, &SEG_ref_
+        };
+        const size_t numRecords = sizeof(records)/sizeof(records[0]);
+
+        for ( size_t d=0; d<numRecords; d++ )
+        {
+            records[d]->second = ref.get_size(d);
+        }
+    }
+    catch(...)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
new file mode 100644
index 0000000..0265a5f
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian.h
@@ -0,0 +1,1082 @@
+/** \file   gtPlusISMRMRDReconWorkFlowCartesian.h
+    \brief  Define the base class for the GtPlus reconstruction workflow for cartesian sampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorkFlow.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// Workflow base class for cartesian sampling.
+/// Implements the preProcessing stage and the conversion of results to the
+/// recon space (convertToReconSpace2D/3D) on top of gtPlusISMRMRDReconWorkFlow;
+/// recon() and predictDimensions() remain pure virtual, so the class is still
+/// abstract.
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlowCartesian : public gtPlusISMRMRDReconWorkFlow<T>
+{
+public:
+
+    typedef gtPlusISMRMRDReconWorkFlow<T> BaseClass;
+    typedef typename BaseClass::DimensionRecordType DimensionRecordType;
+
+    gtPlusISMRMRDReconWorkFlowCartesian();
+    virtual ~gtPlusISMRMRDReconWorkFlowCartesian();
+
+    // write a short description of the cartesian workflow to os
+    void printInfo(std::ostream& os);
+
+    // combine the SEG dimension, remove RO oversampling and apply noise prewhitening where applicable
+    virtual bool preProcessing();
+
+    virtual bool postProcessing();
+
+    virtual bool configureWorkOrder(const std::vector<ISMRMRDDIM>& dims);
+
+    // resize or cut the reconstructed images to the recon space
+    // isKSpace tells whether input_ holds kspace data (true) or images (false)
+    // NOTE: input_ is not const; it can be used as scratch memory and be overwritten
+    // res_ [RO E1 CHA SLC E2 ...]
+    virtual bool convertToReconSpace2D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace);
+    // res_ [RO E1 E2 CHA ...]
+    virtual bool convertToReconSpace3D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace);
+
+    // predict the workOrder dimensions
+    virtual bool predictDimensions() = 0;
+
+    // The base class depends on T, so its members are not found by unqualified
+    // lookup inside this template; the using-declarations below make them
+    // nameable without "this->".
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::noise_;
+    using BaseClass::noiseBW_;
+    using BaseClass::receriverBWRatio_;
+    using BaseClass::ADCSamplingTimeinSecond_;
+    using BaseClass::overSamplingRatioRO_;
+    using BaseClass::reconSizeRO_;
+    using BaseClass::reconSizeE1_;
+    using BaseClass::reconSizeE2_;
+    using BaseClass::encodingFOV_RO_;
+    using BaseClass::encodingFOV_E1_;
+    using BaseClass::encodingFOV_E2_;
+    using BaseClass::reconFOV_RO_;
+    using BaseClass::reconFOV_E1_;
+    using BaseClass::reconFOV_E2_;
+
+    using BaseClass::res_;
+
+    using BaseClass::worker_;
+    using BaseClass::workOrder_;
+
+    using BaseClass::dimsRes_;
+
+    using BaseClass::dataDimStartingIndexes_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+
+    // the workOrder can share the kernel computation results
+    // if this WorkOrderShareDim_ is not DIM_NONE, then 
+    // workOrders will share kernel estimation results along this 
+    // dimensions
+    // (the constructor sets it to DIM_NONE)
+    ISMRMRDDIM WorkOrderShareDim_;
+
+    // work flow can buffer the kernel computed from previous work order and apply them to other work orders
+    // work flow looks at the workFlow_BufferKernel_ and workFlow_use_BufferedKernel_ fields of work order
+    // buffered kernels
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_kernel_;
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_kernelIm_;
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_unmixingCoeffIm_;
+    boost::shared_ptr< hoNDArray<T> > workFlowBuffer_coilMap_;
+    boost::shared_ptr< std::vector<hoMatrix<T> > > workFlowBuffer_coilCompressionCoef_;
+
+    // whether to perform oversampling removal for ref data
+    // (the constructor sets it to true)
+    bool ref_remove_oversampling_RO_;
+    // whether to apply noise prewhitening on ref data
+    // (the constructor sets it to true)
+    bool ref_apply_noisePreWhitening_;
+
+protected:
+
+    // protected base members, re-declared for the same lookup reason as above
+    using BaseClass::dataCurr_;
+    using BaseClass::refCurr_;
+
+    using BaseClass::RO_;
+    using BaseClass::E1_;
+    using BaseClass::CHA_;
+    using BaseClass::SLC_;
+    using BaseClass::E2_;
+    using BaseClass::CON_;
+    using BaseClass::PHS_;
+    using BaseClass::REP_;
+    using BaseClass::SET_;
+    using BaseClass::SEG_;
+
+    using BaseClass::RO_ref_;
+    using BaseClass::E1_ref_;
+    using BaseClass::CHA_ref_;
+    using BaseClass::SLC_ref_;
+    using BaseClass::E2_ref_;
+    using BaseClass::CON_ref_;
+    using BaseClass::PHS_ref_;
+    using BaseClass::REP_ref_;
+    using BaseClass::SET_ref_;
+    using BaseClass::SEG_ref_;
+
+    using BaseClass::gtPlus_util_;
+};
+
+/// Default constructor: default-constructs the base workflow, disables
+/// kernel sharing between work orders (DIM_NONE) and enables both RO
+/// oversampling removal and noise prewhitening for the reference data.
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian<T>::
+gtPlusISMRMRDReconWorkFlowCartesian()
+    : BaseClass(),
+      WorkOrderShareDim_(DIM_NONE),
+      ref_remove_oversampling_RO_(true),
+      ref_apply_noisePreWhitening_(true)
+{
+    // nothing else to set up
+}
+
+/// Destructor. The body is empty: this class performs no explicit cleanup.
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian<T>::~gtPlusISMRMRDReconWorkFlowCartesian() 
+{
+    // intentionally empty
+}
+
+/// Write a fixed, human-readable description of the cartesian workflow
+/// (typical pre/post-processing steps and the layout of the data buffers) to os.
+/// Every line is terminated with std::endl, so the stream is flushed per line.
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlowCartesian<T>::printInfo(std::ostream& os)
+{
+    os << "-------------- GTPlus ISMRMRD Recon workflow Cartesian -------------" << std::endl
+       << "Implementation of general reconstruction workflow for cartesian sampling" << std::endl;
+
+    // pre-processing stage
+    os << "Typical PreProcessing includes:" << std::endl
+       << "a) Combine SEG dimension" << std::endl
+       << "b) Remove readout oversampling if any" << std::endl
+       << "c) If input noise scan is available, compute and apply the noise prewhitening matrix" << std::endl
+       << "d) Apply the kspace filter along the RO direction if required" << std::endl
+       << std::endl;
+
+    // post-processing stage
+    os << "Typical PostProcessing includes:" << std::endl
+       << "a) Apply the kspace filter along the E1 and E2 directions if required" << std::endl
+       << "b) Perform the zero-padding resize if required" << std::endl
+       << std::endl;
+
+    // buffer layout
+    os << "Data buffers are named to reflect the typical nature of MR acquisition" << std::endl
+       << "data: image kspace data, 10D array [RO E1 CHA SLC E2 CON PHS REP SET SEG]" << std::endl
+       << "ref: calibration data, 10D array [RO E1 CHA SLC E2 CON PHS REP SET SEG]" << std::endl
+       << "----------------------------------------------------------" << std::endl;
+}
+
+/// Pre-processing stage for cartesian data. Works in place on *data_ and,
+/// when a reference array is attached, on *ref_:
+///   1) if the SEG extent is larger than 1, sum over the last dimension;
+///   2) if overSamplingRatioRO_ > 1, remove the readout oversampling
+///      (ifft1c, cut RO to size/ratio, fft1c) and rescale the RO range of the
+///      work order; the reference is only treated when ref_remove_oversampling_RO_ is set;
+///   3) if a noise scan is attached, compute the prewhitening matrix and apply
+///      it to the data and, when ref_apply_noisePreWhitening_ is set, to the reference.
+/// Intermediate arrays are handed to the exporter with debugFolder_ after every step.
+/// \return false if data_ is NULL, if one of the checked steps fails or if an
+///         exception is caught; true otherwise
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+preProcessing()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *data_, "incomingKSpace");
+
+        // combine the segment dimension
+        if ( SEG_.second > 1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(*data_, dataCurr_));
+            *data_ = dataCurr_;
+            SEG_.second = 1;
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *data_, "incomingKSpace_SEGCombined");
+        }
+
+        if ( (ref_ != NULL) && (ref_->get_number_of_elements()>0) ) { GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *ref_, "incomingRef"); }
+
+        if ( ref_!=NULL && SEG_ref_.second>1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(*ref_, refCurr_));
+            *ref_ = refCurr_;
+            SEG_ref_.second = 1;
+            // export the combined reference itself (the data array was exported here by mistake)
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *ref_, "incomingRef_SEGCombined");
+        }
+
+        // if needed, remove the readout oversampling
+        if ( overSamplingRatioRO_ > 1.0 )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(*data_));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().cutpad2D(*data_, data_->get_size(0)/overSamplingRatioRO_, data_->get_size(1), dataCurr_));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(dataCurr_));
+            *data_ = dataCurr_;
+            RO_.second = data_->get_size(0);
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *data_, "kspace_oversamplingRORemoved");
+
+            if ( ref_ != NULL && ref_remove_oversampling_RO_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(*ref_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().cutpad2D(*ref_, ref_->get_size(0)/overSamplingRatioRO_, ref_->get_size(1), refCurr_));
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(refCurr_));
+                *ref_ = refCurr_;
+                RO_ref_.second = ref_->get_size(0);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *ref_, "ref_oversamplingRORemoved");
+            }
+
+            // the sampled RO range shrinks with the readout; workOrder_ is NULL for a
+            // default-constructed workflow, so it is only touched when one is attached
+            if ( workOrder_!=NULL && workOrder_->start_RO_>=0 && workOrder_->end_RO_>=0 )
+            {
+                workOrder_->start_RO_ /= overSamplingRatioRO_;
+                workOrder_->end_RO_ /= overSamplingRatioRO_;
+            }
+        }
+
+        // if needed, perform the noise prewhitening
+        if ( noise_ != NULL )
+        {
+            hoMatrix<T> prewhiteningMatrix;
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().computeNoisePrewhiteningMatrix(*noise_, noiseBW_, receriverBWRatio_, ADCSamplingTimeinSecond_, prewhiteningMatrix));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().performNoisePrewhitening(*data_, prewhiteningMatrix));
+
+            // GADGET_CHECK_PERFORM(!debugFolder_.empty(), prewhiteningMatrix.print(std::cout));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *data_, "kspace_noiseprewhitenned");
+
+            // the same matrix is applied to the reference
+            if ( ref_!=NULL && ref_apply_noisePreWhitening_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().performNoisePrewhitening(*ref_, prewhiteningMatrix));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *ref_, "ref_noiseprewhitenned");
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::preProcessing() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Resize or cut 2D results to the recon space (reconSizeRO_ x reconSizeE1_).
+/// The current RO/E1 sizes are read from res_.
+///  - If the encoded and recon FOV agree (RO compared against half the encoded
+///    FOV), the input is zero-pad resized directly to the recon size.
+///  - Otherwise, if the encoded E1 FOV is not smaller than the recon FOV, E1 is
+///    first brought to the number of lines that covers the encoded FOV at the
+///    recon pixel spacing, RO is brought to reconSizeRO_, and the result is cut
+///    to the recon size.
+///  - In every other case output_ is a plain copy of input_.
+/// \param input_    array to convert; NOT const: input_ and output_ are used
+///                  alternately as source and destination, so input_ can be overwritten
+/// \param output_   receives the converted array
+/// \param isKSpace  true if input_ holds kspace data, false if it holds images
+/// \return false if one of the checked steps fails or an exception is caught
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+convertToReconSpace2D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace)
+{
+    try
+    {
+        size_t RO = res_.get_size(0);
+        size_t E1 = res_.get_size(1);
+
+        output_ = input_;
+
+        // if encoded FOV are the same as recon FOV
+        if ( (GT_ABS(encodingFOV_RO_/2 - reconFOV_RO_)<0.1) && (GT_ABS(encodingFOV_E1_-reconFOV_E1_)<0.1) )
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2DOnKSpace(input_, reconSizeRO_, reconSizeE1_, output_));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2D(input_, reconSizeRO_, reconSizeE1_, output_));
+            }
+        }
+        else if (encodingFOV_E1_>=reconFOV_E1_)
+        {
+            // number of E1 lines that covers the encoded FOV at the recon pixel spacing
+            size_t encodingE1 = reconSizeE1_;
+            if ( encodingFOV_E1_ > reconFOV_E1_ )
+            {
+                float spacingE1 = reconFOV_E1_/reconSizeE1_;
+                // round to the nearest integer, as convertToReconSpace3D does; a plain
+                // float-to-integer conversion truncates and loses one line whenever the
+                // quotient falls just below an integer
+                encodingE1 = (size_t)std::floor(encodingFOV_E1_/spacingE1+0.5);
+            }
+
+            // source and destination are swapped after every step
+            hoNDArray<T>* pSrc = &input_;
+            hoNDArray<T>* pDst = &output_;
+            hoNDArray<T>* pTmp;
+
+            hoNDArray<T> buffer2D;
+
+            // adjust E1
+            if ( encodingE1 > E1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2DOnKSpace(*pSrc, RO, encodingE1, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2D(*pSrc, RO, encodingE1, *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( encodingE1 < E1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad2D(*pSrc, RO, encodingE1, *pDst));
+                }
+                else
+                {
+                    // images are transformed first, so that the cut is applied to the fft2c output
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(*pSrc, buffer2D));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad2D(buffer2D, RO, encodingE1, *pDst));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(*pDst));
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            //adjust RO
+            if ( RO < reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2DOnKSpace(*pSrc, reconSizeRO_, pSrc->get_size(1), *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize2D(*pSrc, reconSizeRO_, pSrc->get_size(1), *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( RO > reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad2D(*pSrc, reconSizeRO_, pSrc->get_size(1), *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(*pSrc, buffer2D));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad2D(buffer2D, reconSizeRO_, pSrc->get_size(1), *pDst));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(*pDst));
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            // final cut
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(*pSrc, buffer2D));
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad2D(buffer2D, reconSizeRO_, reconSizeE1_, *pDst));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(*pDst));
+
+            // after an odd number of swaps the result sits in input_; hand it over
+            if ( pDst != &output_ )
+            {
+                output_ = *pDst;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::convertToReconSpace2D(const hoNDArray& input_, hoNDArray& output_, bool isKSpace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+convertToReconSpace3D(hoNDArray<T>& input_, hoNDArray<T>& output_, bool isKSpace)
+{
+    try
+    {
+        size_t RO = res_.get_size(0);
+        size_t E1 = res_.get_size(1);
+        size_t E2 = res_.get_size(2);
+
+        output_ = input_;
+
+        // if encoded FOV are the same as recon FOV
+        if ( (GT_ABS(encodingFOV_RO_/2 - reconFOV_RO_)<0.1) && (GT_ABS(encodingFOV_E1_-reconFOV_E1_)<0.1) && (GT_ABS(encodingFOV_E2_-reconFOV_E2_)<0.1) )
+        {
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(input_, reconSizeRO_, reconSizeE1_, reconSizeE2_, output_));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(input_, reconSizeRO_, reconSizeE1_, reconSizeE2_, output_));
+            }
+        }
+        else if ( (encodingFOV_E1_>=reconFOV_E1_) && (encodingFOV_E2_>=reconFOV_E2_) )
+        {
+            size_t encodingE1 = reconSizeE1_;
+            if ( encodingFOV_E1_ > reconFOV_E1_ )
+            {
+                float spacingE1 = reconFOV_E1_/reconSizeE1_;
+                encodingE1 = std::floor(encodingFOV_E1_/spacingE1+0.5);
+            }
+
+            size_t encodingE2 = reconSizeE2_;
+            if ( encodingFOV_E2_ > reconFOV_E2_ )
+            {
+                float spacingE2 = reconFOV_E2_/reconSizeE2_;
+                encodingE2 = std::floor(encodingFOV_E2_/spacingE2+0.5);
+            }
+
+            hoNDArray<T>* pSrc = &input_;
+            hoNDArray<T>* pDst = &output_;
+            hoNDArray<T>* pTmp;
+
+            hoNDArray<T> buffer3D;
+
+            // adjust E1
+            if ( encodingE1 >= E1+1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(*pSrc, RO, encodingE1, E2, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(*pSrc, RO, encodingE1, E2, *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( encodingE1 <= E1-1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(*pSrc, RO, encodingE1, E2, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pSrc, buffer3D));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(buffer3D, RO, encodingE1, E2, *pDst));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst));
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            // adjust E2
+            if ( encodingE2 >= E2+1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(*pSrc, RO, pSrc->get_size(1), encodingE2, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(*pSrc, RO, pSrc->get_size(1), encodingE2, *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( encodingE2 <= E2-1 )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(*pSrc, RO, pSrc->get_size(1), encodingE2, *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pSrc, buffer3D));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(buffer3D, RO, pSrc->get_size(1), encodingE2, *pDst));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst));
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            //adjust RO
+            if ( RO < reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3DOnKSpace(*pSrc, reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().zpadResize3D(*pSrc, reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), *pDst));
+                }
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+            else if ( RO > reconSizeRO_ )
+            {
+                if ( isKSpace )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(*pSrc, reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), *pDst));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pSrc, buffer3D));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(buffer3D, reconSizeRO_, pSrc->get_size(1), pSrc->get_size(2), *pDst));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst));
+
+                isKSpace = false;
+                pTmp = pSrc; pSrc = pDst; pDst = pTmp;
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *pSrc, "res_beforeCut");
+
+            // final cut on image
+            if ( isKSpace )
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pSrc, buffer3D));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(buffer3D, reconSizeRO_, reconSizeE1_, reconSizeE2_, *pDst));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().cutpad3D(*pSrc, reconSizeRO_, reconSizeE1_, reconSizeE2_, *pDst));
+            }
+            // GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(*pDst));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *pDst, "res_AfterCut");
+
+            if ( pDst != &output_ )
+            {
+                output_ = *pDst;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::convertToReconSpace3D(const hoNDArray& input_, hoNDArray& output_, bool isKSpace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply the k-space filters configured in workOrder_ to the reconstructed images
+/// in res_, then convert the result to the recon space.
+///
+/// If E2_.second > 1 the 3D path is taken: res_ is permuted with
+/// permuteE2To3rdDimension(), filtered after fft3c, passed through
+/// convertToReconSpace3D() and permuted back with permuteE2To5thDimension().
+/// Otherwise the 2D path (fft2c, convertToReconSpace2D()) is taken.
+/// A filter is applied only if its size matches the corresponding array size.
+///
+/// \return true on success; false if a step fails or an exception is caught.
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+postProcessing()
+{
+    try
+    {
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "complexIm_afterRecon");
+
+        // array sizes used to match the filters; E2 is read from the 5th dimension (index 4)
+        size_t RO = res_.get_size(0);
+        size_t E1 = res_.get_size(1);
+        size_t E2 = res_.get_size(4);
+
+        if ( E2_.second > 1 )
+        {
+            // 3D case: bring E2 to the 3rd dimension; dataCurr_ receives the permuted images
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("postProcessing - permute res array ... "));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteE2To3rdDimension(res_, dataCurr_));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, dataCurr_, "data_permuted");
+
+            // res_ is reused below as the output buffer of fft3c
+            res_.reshape(dataCurr_.get_dimensions());
+
+            bool inKSpace = false;
+
+            // first choice: one combined filter whose sizes match RO, E1 and E2
+            if ( workOrder_->filterROE1E2_.get_size(0)==RO 
+                    && workOrder_->filterROE1E2_.get_size(1)==E1 
+                    && workOrder_->filterROE1E2_.get_size(2)==E2 )
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(dataCurr_, res_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterROE1E2(res_, workOrder_->filterROE1E2_, dataCurr_));
+                inKSpace = true;
+            }
+            else if ( (workOrder_->filterRO_.get_number_of_elements() == RO) 
+                        && (workOrder_->filterE1_.get_number_of_elements() == E1) 
+                        && (workOrder_->filterE2_.get_number_of_elements() == E2) )
+            {
+                // second choice: three separable filters, all of matching length
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("postProcessing - fft3c ... "));
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(dataCurr_, res_));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "kspace_beforefiltered");
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("postProcessing - 3D kspace filter ... "));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterROE1E2(res_, workOrder_->filterRO_, workOrder_->filterE1_, workOrder_->filterE2_, dataCurr_));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, dataCurr_, "kspace_afterfiltered");
+                inKSpace = true;
+            }
+            else
+            {
+                // fallback: apply whichever single-dimension filters match; pSrc always points
+                // to the latest k-space, each filter writes into pDst and the two are then swapped
+                hoNDArray<T>* pSrc = &res_;
+                hoNDArray<T>* pDst = &dataCurr_;
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(*pDst, *pSrc));
+
+                bool filterPerformed = false;
+
+                if ( workOrder_->filterRO_.get_number_of_elements() == RO )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterRO(*pSrc, workOrder_->filterRO_, *pDst));
+                    std::swap(pSrc, pDst);
+                    filterPerformed = true;
+                }
+
+                if ( workOrder_->filterE1_.get_number_of_elements() == E1 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterE1(*pSrc, workOrder_->filterE1_, *pDst));
+                    std::swap(pSrc, pDst);
+                    filterPerformed = true;
+                }
+
+                if ( workOrder_->filterE2_.get_number_of_elements() == E2 )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterE2(*pSrc, workOrder_->filterE2_, *pDst));
+                    std::swap(pSrc, pDst);
+                    filterPerformed = true;
+                }
+
+                // because of the swap after every filter, the most recent output is *pSrc (not
+                // *pDst); copying *pDst here would throw away the last filter that was applied
+                if ( filterPerformed )
+                {
+                    if ( pSrc != &dataCurr_ )
+                    {
+                        dataCurr_ = *pSrc;
+                    }
+                }
+                else
+                {
+                    // nothing was filtered: dataCurr_ takes the unfiltered k-space from fft3c
+                    dataCurr_ = res_;
+                }
+
+                inKSpace = true;
+            }
+
+            if ( inKSpace )
+            {
+                if ( !debugFolder_.empty() )
+                {
+                    // debug export of an image-space copy; dataCurr_ itself stays in k-space
+                    hoNDArray<T> Im(dataCurr_);
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(Im));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, Im, "complexIm_filtered");
+                }
+            }
+            else
+            {
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "complexIm_filtered");
+            }
+
+            // inKSpace tells the converter whether dataCurr_ holds k-space or images
+            GADGET_CHECK_RETURN_FALSE(convertToReconSpace3D(dataCurr_, res_, inKSpace));
+
+            // undo the initial permutation: E2 goes back to the 5th dimension
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteE2To5thDimension(res_, dataCurr_));
+
+            // copy the permuted images back into res_
+            res_.reshape(dataCurr_.get_dimensions());
+            memcpy(res_.begin(), dataCurr_.begin(), res_.get_number_of_bytes());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "complexIm_zpadResize3D");
+        }
+        else
+        {
+            // 2D case: only RO and E1 filters are considered
+            dataCurr_ = res_;
+            bool inKSpace = false;
+
+            // a combined RO-E1 filter is tried first, then the pair of separable filters
+            if ( workOrder_->filterROE1_.get_size(0)==RO && workOrder_->filterROE1_.get_size(1)==E1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterROE1(res_, workOrder_->filterROE1_, dataCurr_));
+                inKSpace = true;
+            }
+            else if ( (workOrder_->filterRO_.get_number_of_elements() == RO) && (workOrder_->filterE1_.get_number_of_elements() == E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterROE1(res_, workOrder_->filterRO_, workOrder_->filterE1_, dataCurr_));
+                inKSpace = true;
+            }
+            else
+            {
+                // the two conditions below are mutually exclusive, so at most one filter runs
+                if ( (workOrder_->filterRO_.get_number_of_elements() == RO) && (workOrder_->filterE1_.get_number_of_elements() != E1) )
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res_));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterRO(res_, workOrder_->filterRO_, dataCurr_));
+                    inKSpace = true;
+                }
+
+                if ( (workOrder_->filterRO_.get_number_of_elements() != RO) && (workOrder_->filterE1_.get_number_of_elements() == E1) )
+                {
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(dataCurr_, res_));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterE1(res_, workOrder_->filterE1_, dataCurr_));
+                    inKSpace = true;
+                }
+            }
+
+            if ( inKSpace )
+            {
+                if ( !debugFolder_.empty() )
+                {
+                    // debug export of an image-space copy; dataCurr_ itself stays in k-space
+                    hoNDArray<T> Im(dataCurr_);
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(Im));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, Im, "complexIm_filtered");
+                }
+            }
+            else
+            {
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "complexIm_filtered");
+            }
+
+            // inKSpace tells the converter whether dataCurr_ holds k-space or images
+            GADGET_CHECK_RETURN_FALSE(convertToReconSpace2D(dataCurr_, res_, inKSpace));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "complexIm_zpadResize2D");
+        }
+    }
+    catch(...)
+    {
+        // any exception thrown by the calls above is logged and reported as a failure
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::postProcessing() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian<T>::
+configureWorkOrder(const std::vector<ISMRMRDDIM>& dims)
+{ // Cuts data_/ref_ into work orders over the first five entries of dims, runs worker_->performRecon() on each and gathers complexIm_ into res_; returns false on any failure.
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(worker_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(workOrder_!=NULL);
+        // without a separate reference array, the data itself serves as the reference
+        if ( ref_ == NULL )
+        {
+            ref_ = data_;
+        }
+
+        size_t dd;
+
+        // size of every requested dimension in data_ and ref_; the array axis of dims[dd] is dims[dd]-DIM_ReadOut
+        std::vector<size_t> dimSize(dims.size());
+        std::vector<size_t> dimSizeRef(dims.size(), 1);
+        size_t indChannelDim = 2; // position of DIM_Channel within dims; stays 2 if dims holds no DIM_Channel
+        for ( dd=0; dd<dims.size(); dd++ )
+        {
+            dimSize[dd] = data_->get_size(dims[dd]-DIM_ReadOut);
+            if ( ref_ != NULL ) // always true at this point: ref_ was replaced by data_ above when it was NULL
+            {
+                dimSizeRef[dd] = ref_->get_size(dims[dd]-DIM_ReadOut);
+            }
+
+            if ( dims[dd] == DIM_Channel )
+            {
+                indChannelDim = dd;
+            }
+        }
+
+        GADGET_CONDITION_MSG(!debugFolder_.empty(), "Recon dimensions : " << this->printISMRMRDDimensions(dims));
+        GADGET_CONDITION_MSG(!debugFolder_.empty(), "Recon size       : " << this->printISMRMRDDimensionSize(dimSize));
+        GADGET_CONDITION_MSG(!debugFolder_.empty(), "Recon ref size   : " << this->printISMRMRDDimensionSize(dimSizeRef));
+        // NOTE(review): dims/dimSize/dimSizeRef are indexed up to [8] below without checking dims.size() - confirm callers always pass at least 9 dimensions
+        // a work order covers the first five dimensions of dims
+        std::vector<size_t> dimReconSize(5);
+        dimReconSize[0] = dimSize[0];
+        dimReconSize[1] = dimSize[1];
+        dimReconSize[2] = dimSize[2];
+        dimReconSize[3] = dimSize[3];
+        dimReconSize[4] = dimSize[4];
+
+        std::vector<size_t> dimReconSizeRef(5);
+        dimReconSizeRef[0] = dimSizeRef[0];
+        dimReconSizeRef[1] = dimSizeRef[1];
+        dimReconSizeRef[2] = dimSizeRef[2];
+        dimReconSizeRef[3] = dimSizeRef[3];
+        dimReconSizeRef[4] = dimSizeRef[4];
+
+        // the first two dimensions are always RO and E1; N2D/N3D are the element counts of one 2D/3D chunk
+        size_t N2D = dimReconSize[0]*dimReconSize[1];
+        size_t N2DRef = dimReconSizeRef[0]*dimReconSizeRef[1];
+
+        size_t N3D = N2D*dimReconSize[2];
+        size_t N3DRef = N2DRef*dimReconSizeRef[2];
+
+        // allocate the results: same sizes as the data, except num_channels_res_ at the channel position
+        size_t num_channels_res = workOrder_->num_channels_res_;
+
+        std::vector<size_t> dimResSize(dimSize);
+        dimResSize[indChannelDim] = num_channels_res;
+        res_.create(&dimResSize);
+
+        std::vector<ISMRMRDDIM> dimsRes(dims);
+
+        GADGET_CONDITION_MSG(!debugFolder_.empty(), "Recon res dimensions : " << this->printISMRMRDDimensions(dimsRes));
+        GADGET_CONDITION_MSG(!debugFolder_.empty(), "Recon res size       : " << this->printISMRMRDDimensionSize(dimResSize));
+
+        bool shareAcrossWorkOrders = (WorkOrderShareDim_!=DIM_NONE);
+
+        if ( !debugFolder_.empty() )
+        {
+            gt_exporter_.exportArrayComplex(*data_, debugFolder_ + "data_");
+            gt_exporter_.exportArrayComplex(*ref_, debugFolder_ + "ref_");
+        }
+        // keep the incoming flag value: when sharing, it is overridden per work order and restored after each recon
+        bool workFlow_use_BufferedKernel_ = workOrder_->workFlow_use_BufferedKernel_;
+
+        // call up the recon: every index combination of dimensions 5..8 of dims yields one work order
+        size_t dim8, dim7, dim6, dim5, dim4, dim3, dim2;
+        for ( dim8=0; dim8<dimSize[8]; dim8++ )
+        {
+            for ( dim7=0; dim7<dimSize[7]; dim7++ )
+            {
+                for ( dim6=0; dim6<dimSize[6]; dim6++ )
+                {
+                    for ( dim5=0; dim5<dimSize[5]; dim5++ )
+                    {
+                        std::vector<size_t> ind(10, 0);
+                        this->ismrmrdDimIndex9D(ind, dims[8], dim8);
+                        this->ismrmrdDimIndex9D(ind, dims[7], dim7);
+                        this->ismrmrdDimIndex9D(ind, dims[6], dim6);
+                        this->ismrmrdDimIndex9D(ind, dims[5], dim5);
+                        // (re)allocate the work order data buffer only when its dimensions differ
+                        if ( !workOrder_->data_.dimensions_equal(&dimReconSize) )
+                        {
+                            workOrder_->data_.create(&dimReconSize);
+                        }
+                        // gather the data of this work order: ind addresses data_, indWorkOrder addresses workOrder_->data_
+                        std::vector<size_t> indWorkOrder(5, 0);
+                        for ( dim4=0; dim4<dimSize[4]; dim4++ )
+                        {
+                            this->ismrmrdDimIndex9D(ind, dims[4], dim4);
+                            indWorkOrder[4] = dim4;
+
+                            for ( dim3=0; dim3<dimSize[3]; dim3++ )
+                            {
+                                this->ismrmrdDimIndex9D(ind, dims[3], dim3);
+                                indWorkOrder[3] = dim3;
+                                // channel as 3rd dimension: one memcpy of N3D elements; otherwise one memcpy of N2D elements per index of the 3rd dimension
+                                if ( dims[2] == DIM_Channel )
+                                {
+                                    long long offset = data_->calculate_offset(ind);
+
+                                    long long offsetWorkOrder = workOrder_->data_.calculate_offset(indWorkOrder);
+
+                                    memcpy(workOrder_->data_.begin()+offsetWorkOrder, data_->begin()+offset, sizeof(T)*N3D);
+                                }
+                                else
+                                {
+                                    for ( dim2=0; dim2<dimSize[2]; dim2++ )
+                                    {
+                                        this->ismrmrdDimIndex9D(ind, dims[2], dim2);
+                                        indWorkOrder[2] = dim2;
+
+                                        long long offset = data_->calculate_offset(ind);
+
+                                        long long offsetWorkOrder = workOrder_->data_.calculate_offset(indWorkOrder);
+
+                                        memcpy(workOrder_->data_.begin()+offsetWorkOrder, data_->begin()+offset, sizeof(T)*N2D);
+                                    }
+                                }
+                            }
+                        }
+                        // gather the reference the same way; loops run over the data sizes and indices beyond the ref size are clamped to its last entry
+                        if ( (ref_ != NULL) && (ref_->get_number_of_elements()>0) )
+                        {
+                            std::vector<size_t> indRef(10, 0);
+                            if ( dim8 < dimSizeRef[8] )
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[8], dim8);
+                            }
+                            else
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[8], dimSizeRef[8]-1);
+                            }
+
+                            if ( dim7 < dimSizeRef[7] )
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[7], dim7);
+                            }
+                            else
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[7], dimSizeRef[7]-1);
+                            }
+
+                            if ( dim6 < dimSizeRef[6] )
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[6], dim6);
+                            }
+                            else
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[6], dimSizeRef[6]-1);
+                            }
+
+                            if ( dim5 < dimSizeRef[5] )
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[5], dim5);
+                            }
+                            else
+                            {
+                                this->ismrmrdDimIndex9D(indRef, dims[5], dimSizeRef[5]-1);
+                            }
+
+                            if ( !workOrder_->ref_.dimensions_equal(&dimReconSizeRef) )
+                            {
+                                workOrder_->ref_.create(&dimReconSizeRef);
+                            }
+
+                            std::vector<size_t> indRefWorkOrder(10, 0); // NOTE(review): 10 entries here versus 5 in indWorkOrder for the 5-D data buffer - confirm calculate_offset accepts the longer index
+                            for ( dim4=0; dim4<dimSize[4]; dim4++ )
+                            {
+                                size_t dim4_ref = dim4;
+                                if ( dim4 < dimSizeRef[4] )
+                                {
+                                    this->ismrmrdDimIndex9D(indRef, dims[4], dim4);
+                                }
+                                else
+                                {
+                                    this->ismrmrdDimIndex9D(indRef, dims[4], dimSizeRef[4]-1);
+                                    dim4_ref = dimSizeRef[4]-1;
+                                }
+                                indRefWorkOrder[4] = dim4_ref;
+
+                                for ( dim3=0; dim3<dimSize[3]; dim3++ )
+                                {
+                                    size_t dim3_ref = dim3;
+                                    if ( dim3 < dimSizeRef[3] )
+                                    {
+                                        this->ismrmrdDimIndex9D(indRef, dims[3], dim3);
+                                    }
+                                    else
+                                    {
+                                        this->ismrmrdDimIndex9D(indRef, dims[3], dimSizeRef[3]-1);
+                                        dim3_ref = dimSizeRef[3]-1;
+                                    }
+                                    indRefWorkOrder[3] = dim3_ref;
+
+                                    if ( dims[2] == DIM_Channel )
+                                    {
+                                        long long offset = ref_->calculate_offset(indRef);
+                                        long long offsetWorkOrder = workOrder_->ref_.calculate_offset(indRefWorkOrder);
+                                        memcpy(workOrder_->ref_.begin()+offsetWorkOrder, ref_->begin()+offset, sizeof(T)*N3DRef);
+                                    }
+                                    else
+                                    {
+                                        for ( dim2=0; dim2<dimSize[2]; dim2++ )
+                                        {
+                                            size_t dim2_ref = dim2;
+                                            if ( dim2 < dimSizeRef[2] )
+                                            {
+                                                this->ismrmrdDimIndex9D(indRef, dims[2], dim2);
+                                            }
+                                            else
+                                            {
+                                                this->ismrmrdDimIndex9D(indRef, dims[2], dimSizeRef[2]-1);
+                                                dim2_ref = dimSizeRef[2]-1;
+                                            }
+                                            indRefWorkOrder[2] = dim2_ref;
+
+                                            long long offset = ref_->calculate_offset(indRef);
+                                            long long offsetWorkOrder = workOrder_->ref_.calculate_offset(indRefWorkOrder);
+                                            memcpy(workOrder_->ref_.begin()+offsetWorkOrder, ref_->begin()+offset, sizeof(T)*N2DRef);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        if ( !shareAcrossWorkOrders && workOrder_->workFlow_BufferKernel_ && !workOrder_->workFlow_use_BufferedKernel_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(workOrder_->reset());
+                        }
+                        // when sharing: the work order at dim5==0 runs with the flag off, every later one along dim5 with the flag on
+                        if ( shareAcrossWorkOrders && !workOrder_->workFlow_use_BufferedKernel_ )
+                        {
+                            if ( dim5==0 )
+                            {
+                                workOrder_->workFlow_use_BufferedKernel_ = false;
+                            }
+                            else
+                            {
+                                workOrder_->workFlow_use_BufferedKernel_ = true;
+                            }
+                        }
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder_->data_, "workOrder_data");
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder_->ref_, "workOrder_ref");
+
+                        // trigger the recon
+                        GADGET_CHECK_RETURN_FALSE(worker_->performRecon(workOrder_));
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder_->complexIm_, "workOrder_complexIm");
+                        // restore the flag value saved before the loops
+                        if ( shareAcrossWorkOrders )
+                        {
+                            workOrder_->workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+                        }
+
+                        // copy the results: complexIm_ is written into res_ starting at index [0,0,0,0,0,dim5,dim6,dim7,dim8]
+                        std::vector<size_t> indRes(ind);
+                        indRes[0] = 0;
+                        indRes[1] = 0;
+                        indRes[2] = 0;
+                        indRes[3] = 0;
+                        indRes[4] = 0;
+                        indRes[5] = dim5;
+                        indRes[6] = dim6;
+                        indRes[7] = dim7;
+                        indRes[8] = dim8;
+
+                        long long offset = res_.calculate_offset(indRes);
+                        memcpy(res_.begin()+offset, workOrder_->complexIm_.begin(), workOrder_->complexIm_.get_number_of_bytes());
+
+                        // if not sharing across work orders and neither buffered-kernel flag is set, reset after every recon
+                        if ( !shareAcrossWorkOrders && !workOrder_->workFlow_use_BufferedKernel_ && !workOrder_->workFlow_BufferKernel_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(workOrder_->reset());
+                        }
+                    }
+
+                    // in the outer dimensions (after each full dim5 loop), the work order is reset unless one of the buffered-kernel flags is set
+                    if ( !workOrder_->workFlow_use_BufferedKernel_ && !workOrder_->workFlow_BufferKernel_ )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(workOrder_->reset());
+                    }
+                }
+            }
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "res_afterunwrapping");
+
+        // permute res_ from the dims order to the dimsRes_ order; done only when res_ holds more than a single 2D/3D chunk
+        if (   ( (res_.get_number_of_elements()>dimResSize[0]*dimResSize[1]) && (dims[2]!=DIM_Channel) ) 
+            || ( (res_.get_number_of_elements()>dimResSize[0]*dimResSize[1]*dimResSize[2])             ) )
+        {
+            std::vector<size_t> order;
+            GADGET_CHECK_RETURN_FALSE(this->findISMRMRDPermuteOrder(dimsRes, dimsRes_, order));
+
+            boost::shared_ptr< hoNDArray<T> > res_permuted = Gadgetron::permute(&res_, &order);
+            res_.reshape(res_permuted->get_dimensions());
+            memcpy(res_.begin(), res_permuted->begin(), res_permuted->get_number_of_bytes());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res_, "res_afterPermute");
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian<T>::configureWorkOrder(const std::vector<ISMRMRDDIM>& dims) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
new file mode 100644
index 0000000..9e6bc25
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian2DT.h
@@ -0,0 +1,277 @@
+/** \file   gtPlusISMRMRDReconWorkFlowCartesian2DT.h
+    \brief  Define the base class for the GtPlus 2DT reconstruction workflow for cartesian sampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorkFlowCartesian.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlowCartesian2DT : public gtPlusISMRMRDReconWorkFlowCartesian<T>
+{
+public:
+    // shorthands for the templated base workflow and its dimension record type
+    typedef gtPlusISMRMRDReconWorkFlowCartesian<T> BaseClass;
+    typedef typename BaseClass::DimensionRecordType DimensionRecordType;
+
+    gtPlusISMRMRDReconWorkFlowCartesian2DT();
+    virtual ~gtPlusISMRMRDReconWorkFlowCartesian2DT();
+    // print a short, fixed description of this workflow to os
+    void printInfo(std::ostream& os);
+    // build the dimension order [RO E1 CHA dim4th dim5th ...] and pass it to configureWorkOrder()
+    virtual bool recon();
+    // choose dim4th_/dim5th_ (and possibly WorkOrderShareDim_); returns false if either stays DIM_NONE
+    virtual bool predictDimensions();
+    // bring the dependent base-class members into scope so they can be used unqualified in this template
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::noise_;
+    using BaseClass::noiseBW_;
+    using BaseClass::receriverBWRatio_;
+    using BaseClass::overSamplingRatioRO_;
+    using BaseClass::reconSizeRO_;
+    using BaseClass::reconSizeE1_;
+    using BaseClass::reconSizeE2_;
+    using BaseClass::encodingFOV_RO_;
+    using BaseClass::encodingFOV_E1_;
+    using BaseClass::encodingFOV_E2_;
+    using BaseClass::reconFOV_RO_;
+    using BaseClass::reconFOV_E1_;
+    using BaseClass::reconFOV_E2_;
+    using BaseClass::res_;
+
+    using BaseClass::worker_;
+    using BaseClass::workOrder_;
+
+    using BaseClass::dimsRes_;
+
+    using BaseClass::dataDimStartingIndexes_;
+
+    using BaseClass::WorkOrderShareDim_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+
+    using BaseClass::ref_remove_oversampling_RO_;
+    using BaseClass::ref_apply_noisePreWhitening_;
+
+    // the 2DT workOrder needs 5 dimensions [RO E1 CHA N S]; recon() uses dim4th_ for N and dim5th_ for S
+    ISMRMRDDIM dim4th_;
+    ISMRMRDDIM dim5th_;
+
+protected:
+
+    using BaseClass::dataCurr_;
+    using BaseClass::refCurr_;
+
+    using BaseClass::RO_;
+    using BaseClass::E1_;
+    using BaseClass::CHA_;
+    using BaseClass::SLC_;
+    using BaseClass::E2_;
+    using BaseClass::CON_;
+    using BaseClass::PHS_;
+    using BaseClass::REP_;
+    using BaseClass::SET_;
+    using BaseClass::SEG_;
+
+    using BaseClass::RO_ref_;
+    using BaseClass::E1_ref_;
+    using BaseClass::CHA_ref_;
+    using BaseClass::SLC_ref_;
+    using BaseClass::E2_ref_;
+    using BaseClass::CON_ref_;
+    using BaseClass::PHS_ref_;
+    using BaseClass::REP_ref_;
+    using BaseClass::SET_ref_;
+    using BaseClass::SEG_ref_;
+
+    using BaseClass::gtPlus_util_;
+};
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::
+gtPlusISMRMRDReconWorkFlowCartesian2DT() : BaseClass(), dim4th_(DIM_NONE), dim5th_(DIM_NONE)
+{   // DIM_NONE means "not chosen yet": recon() calls predictDimensions() while either dimension is still DIM_NONE
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::~gtPlusISMRMRDReconWorkFlowCartesian2DT() 
+{   // nothing to do in the destructor body
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::printInfo(std::ostream& os)
+{
+    // Write the fixed four-line description of this workflow to os.
+    // Every line is terminated with std::endl, i.e. the stream is flushed after each line.
+    os << "-------------- GTPlus ISMRMRD Recon workflow Cartesian 2D/2DT -------------" << std::endl
+       << "Implementation of general reconstruction workflow for cartesian sampling of 2D and 2D+T use cases" << std::endl
+       << "The workOrder needs 5 dimensions [RO E1 CHA N S]" << std::endl
+       << "----------------------------------------------------------" << std::endl;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::predictDimensions()
+{
+    // Choose dim4th_ and dim5th_ (and, in some cases, WorkOrderShareDim_) from the calibration
+    // mode of the work order and the CON/PHS/REP/SET dimension records.
+    // Returns false when either dimension is still DIM_NONE afterwards.
+    if ( workOrder_->CalibMode_ == ISMRMRD_interleaved )
+    {
+        // the interleaving dimension becomes the 4th dimension when it is PHS, REP or CON
+        switch ( workOrder_->InterleaveDim_ )
+        {
+            case DIM_Phase:
+            case DIM_Repetition:
+            case DIM_Contrast:
+                dim4th_ = workOrder_->InterleaveDim_;
+                break;
+            default:
+                break;
+        }
+
+        // the 5th dimension follows from the .second values of the CON and SET records
+        if ( SET_.second==1 )
+        {
+            if ( CON_.second==1 )
+            {
+                dim5th_ = DIM_Slice;
+            }
+            else if ( CON_.second>1 )
+            {
+                dim5th_ = DIM_Contrast;
+            }
+        }
+        else if ( SET_.second>1 )
+        {
+            if ( CON_.second==1 )
+            {
+                dim5th_ = DIM_Set;
+            }
+            else if ( CON_.second>1 )
+            {
+                // both records exceed 1: CON is the 5th dimension, SET is the shared dimension
+                dim5th_ = DIM_Contrast;
+                WorkOrderShareDim_ = DIM_Set;
+            }
+        }
+    }
+    else if ( (workOrder_->CalibMode_ == ISMRMRD_embedded) 
+        || (workOrder_->CalibMode_ == ISMRMRD_separate) 
+        || (workOrder_->CalibMode_ == ISMRMRD_noacceleration) ) 
+    {
+        // order the four candidate records with DimensionRecordCompare
+        std::vector<DimensionRecordType> sortedDims;
+        sortedDims.push_back(CON_);
+        sortedDims.push_back(PHS_);
+        sortedDims.push_back(REP_);
+        sortedDims.push_back(SET_);
+        std::sort(sortedDims.begin(), sortedDims.end(), DimensionRecordCompare() );
+
+        dim4th_ = sortedDims[0].first;
+        // DIM_Slice is used as 5th dimension when the 2nd..4th sorted records all have .second equal to 1
+        const bool restAreSingle = ( sortedDims[1].second==1 && sortedDims[2].second==1 && sortedDims[3].second==1 );
+        dim5th_ = restAreSingle ? DIM_Slice : sortedDims[1].first;
+        if ( sortedDims[2].second > 1 )
+        {
+            WorkOrderShareDim_ = sortedDims[2].first;
+        }
+    }
+
+    if ( dim4th_==DIM_NONE || dim5th_==DIM_NONE )
+    {
+        GADGET_ERROR_MSG("gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::predictDimensions() : cannot find 4th and 5th dimensions ... ");
+        return false;
+    }
+
+    workOrder_->enforceConsistency(dim5th_);
+
+    GADGET_CONDITION_MSG(true, "predictDimensions - dim4th : " << gtPlus_util_.getISMRMRDDimName(dim4th_) );
+    GADGET_CONDITION_MSG(true, "predictDimensions - dim5th : " << gtPlus_util_.getISMRMRDDimName(dim5th_) );
+    GADGET_CONDITION_MSG(true, "predictDimensions - WorkOrderShareDim : " << gtPlus_util_.getISMRMRDDimName(WorkOrderShareDim_) );
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::recon()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(worker_!=NULL);
+
+        // predict the 4th/5th dimensions unless both have been set already
+        const bool dimsKnown = ( dim4th_!=DIM_NONE && dim5th_!=DIM_NONE );
+        if ( !dimsKnown )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->predictDimensions());
+        }
+
+        // the shared dimension must not be one of the five workorder dimensions
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_ReadOut);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Encoding1);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Channel);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=dim4th_);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=dim5th_);
+
+        // the five leading recon dimensions: [RO E1 CHA dim4th dim5th]
+        std::vector<ISMRMRDDIM> dims;
+        dims.push_back(DIM_ReadOut);
+        dims.push_back(DIM_Encoding1);
+        dims.push_back(DIM_Channel);
+        dims.push_back(dim4th_);
+        dims.push_back(dim5th_);
+
+        // append every dimension in [DIM_Slice, DIM_Set] that is not listed yet; note where the shared one lands
+        int indWorkOrderSharingDim = -1;
+        for ( int dimId=DIM_Slice; dimId<=DIM_Set; dimId++ )
+        {
+            const ISMRMRDDIM candidate = (ISMRMRDDIM)dimId;
+
+            size_t pos = 0;
+            while ( pos<dims.size() && dims[pos]!=candidate )
+            {
+                pos++;
+            }
+
+            if ( pos < dims.size() )
+            {
+                continue; // already part of dims
+            }
+
+            dims.push_back(candidate);
+
+            if ( dimId == WorkOrderShareDim_ )
+            {
+                indWorkOrderSharingDim = dims.size()-1;
+            }
+        }
+
+        // if the shared dimension landed behind index 5, exchange it with the entry at index 5 (the 6th dimension)
+        if ( indWorkOrderSharingDim > 5 )
+        {
+            dims[indWorkOrderSharingDim] = dims[5];
+            dims[5] = WorkOrderShareDim_;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->configureWorkOrder(dims));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian2DT<T>::recon() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
new file mode 100644
index 0000000..6f46492
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkFlowCartesian3DT.h
@@ -0,0 +1,247 @@
+/** \file   gtPlusISMRMRDReconWorkFlowCartesian3DT.h
+    \brief  Define the base class for the GtPlus 3DT reconstruction workflow for cartesian sampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorkFlowCartesian.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusISMRMRDReconWorkFlowCartesian3DT : public gtPlusISMRMRDReconWorkFlowCartesian<T>
+{
+public:
+    // shorthands for the templated base workflow and its dimension record type
+    typedef gtPlusISMRMRDReconWorkFlowCartesian<T> BaseClass;
+    typedef typename BaseClass::DimensionRecordType DimensionRecordType;
+
+    gtPlusISMRMRDReconWorkFlowCartesian3DT();
+    virtual ~gtPlusISMRMRDReconWorkFlowCartesian3DT();
+    // print a short, fixed description of this workflow to os
+    void printInfo(std::ostream& os);
+    // build the dimension order [RO E1 E2 CHA dim5th ...] and pass it to configureWorkOrder()
+    virtual bool recon();
+    // choose dim5th_; returns false if it stays DIM_NONE
+    virtual bool predictDimensions();
+    // bring the dependent base-class members into scope so they can be used unqualified in this template
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::noise_;
+    using BaseClass::noiseBW_;
+    using BaseClass::receriverBWRatio_;
+    using BaseClass::overSamplingRatioRO_;
+    using BaseClass::reconSizeRO_;
+    using BaseClass::reconSizeE1_;
+    using BaseClass::reconSizeE2_;
+    using BaseClass::encodingFOV_RO_;
+    using BaseClass::encodingFOV_E1_;
+    using BaseClass::encodingFOV_E2_;
+    using BaseClass::reconFOV_RO_;
+    using BaseClass::reconFOV_E1_;
+    using BaseClass::reconFOV_E2_;
+    using BaseClass::res_;
+
+    using BaseClass::worker_;
+    using BaseClass::workOrder_;
+
+    using BaseClass::dataDimStartingIndexes_;
+
+    using BaseClass::WorkOrderShareDim_;
+
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+
+    using BaseClass::ref_remove_oversampling_RO_;
+    using BaseClass::ref_apply_noisePreWhitening_;
+
+    // the 3DT workOrder needs 5 dimensions [RO E1 E2 CHA S]; recon() uses dim5th_ for S
+    ISMRMRDDIM dim5th_;
+
+protected:
+
+    using BaseClass::dataCurr_;
+    using BaseClass::refCurr_;
+
+    using BaseClass::RO_;
+    using BaseClass::E1_;
+    using BaseClass::CHA_;
+    using BaseClass::SLC_;
+    using BaseClass::E2_;
+    using BaseClass::CON_;
+    using BaseClass::PHS_;
+    using BaseClass::REP_;
+    using BaseClass::SET_;
+    using BaseClass::SEG_;
+
+    using BaseClass::RO_ref_;
+    using BaseClass::E1_ref_;
+    using BaseClass::CHA_ref_;
+    using BaseClass::SLC_ref_;
+    using BaseClass::E2_ref_;
+    using BaseClass::CON_ref_;
+    using BaseClass::PHS_ref_;
+    using BaseClass::REP_ref_;
+    using BaseClass::SET_ref_;
+    using BaseClass::SEG_ref_;
+
+    using BaseClass::gtPlus_util_;
+};
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::
+gtPlusISMRMRDReconWorkFlowCartesian3DT() : BaseClass(), dim5th_(DIM_NONE)
+{   // DIM_NONE means "not chosen yet": recon() calls predictDimensions() while dim5th_ is still DIM_NONE
+}
+
+template <typename T> 
+gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::~gtPlusISMRMRDReconWorkFlowCartesian3DT() 
+{   // nothing to do in the destructor body
+}
+
+template <typename T> 
+void gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::printInfo(std::ostream& os)
+{
+    // Write the fixed four-line description of this workflow to os.
+    // Every line is terminated with std::endl, i.e. the stream is flushed after each line.
+    os << "-------------- GTPlus ISMRMRD Recon workflow Cartesian 3D/3DT -------------" << std::endl
+       << "Implementation of general reconstruction workflow for cartesian sampling of 3D and 3D+T use cases" << std::endl
+       << "The workOrder needs 5 dimensions [RO E1 E2 CHA S]" << std::endl
+       << "----------------------------------------------------------" << std::endl;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::predictDimensions()
+{
+    // Choose dim5th_ from the calibration mode of the work order, the SLC record and
+    // the CON/PHS/REP/SET dimension records.
+    // Returns false when dim5th_ is still DIM_NONE afterwards.
+    if ( workOrder_->CalibMode_ == ISMRMRD_interleaved )
+    {
+        // the interleaving dimension becomes the 5th dimension when it is PHS, REP or CON
+        switch ( workOrder_->InterleaveDim_ )
+        {
+            case DIM_Phase:
+            case DIM_Repetition:
+            case DIM_Contrast:
+                dim5th_ = workOrder_->InterleaveDim_;
+                break;
+            default:
+                break;
+        }
+    }
+    else if ( (workOrder_->CalibMode_ == ISMRMRD_embedded) 
+        || (workOrder_->CalibMode_ == ISMRMRD_separate)
+        || (workOrder_->CalibMode_ == ISMRMRD_noacceleration) ) 
+    {
+        if ( SLC_.second > 1 )
+        {
+            dim5th_ = DIM_Slice; // multiple slab acquisition
+        }
+        else if ( SLC_.second == 1 )
+        {
+            // order the four candidate records with DimensionRecordCompare and take the first one
+            std::vector<DimensionRecordType> sortedDims;
+            sortedDims.push_back(CON_);
+            sortedDims.push_back(PHS_);
+            sortedDims.push_back(REP_);
+            sortedDims.push_back(SET_);
+            std::sort(sortedDims.begin(), sortedDims.end(), DimensionRecordCompare() );
+
+            dim5th_ = sortedDims[0].first;
+        }
+    }
+
+    if ( dim5th_==DIM_NONE )
+    {
+        GADGET_ERROR_MSG("gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::predictDimensions() : cannot find 5th dimensions ... ");
+        return false;
+    }
+
+    workOrder_->enforceConsistency(dim5th_);
+
+    GADGET_CONDITION_MSG(true, "predictDimensions - dim5th : " << gtPlus_util_.getISMRMRDDimName(dim5th_) );
+    GADGET_CONDITION_MSG(true, "predictDimensions - WorkOrderShareDim : " << gtPlus_util_.getISMRMRDDimName(WorkOrderShareDim_) );
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::recon()
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(data_!=NULL);
+        GADGET_CHECK_RETURN_FALSE(worker_!=NULL);
+
+        // predict the 5th dimension unless it has been set already
+        if ( !(dim5th_!=DIM_NONE) )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->predictDimensions());
+        }
+
+        // the shared dimension must not be one of the five workorder dimensions
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_ReadOut);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Encoding1);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Encoding2);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=DIM_Channel);
+        GADGET_CHECK_RETURN_FALSE(WorkOrderShareDim_!=dim5th_);
+
+        // the five leading recon dimensions: [RO E1 E2 CHA dim5th]
+        std::vector<ISMRMRDDIM> dims;
+        dims.push_back(DIM_ReadOut);
+        dims.push_back(DIM_Encoding1);
+        dims.push_back(DIM_Encoding2);
+        dims.push_back(DIM_Channel);
+        dims.push_back(dim5th_);
+
+        // append every dimension in [DIM_Slice, DIM_Set] that is not listed yet; note where the shared one lands
+        int indWorkOrderSharingDim = -1;
+        for ( int dimId=DIM_Slice; dimId<=DIM_Set; dimId++ )
+        {
+            const ISMRMRDDIM candidate = (ISMRMRDDIM)dimId;
+
+            size_t pos = 0;
+            while ( pos<dims.size() && dims[pos]!=candidate )
+            {
+                pos++;
+            }
+
+            if ( pos < dims.size() )
+            {
+                continue; // already part of dims
+            }
+
+            dims.push_back(candidate);
+
+            if ( dimId == WorkOrderShareDim_ )
+            {
+                indWorkOrderSharingDim = dims.size()-1;
+            }
+        }
+
+        // if the shared dimension landed behind index 5, exchange it with the entry at index 5 (the 6th dimension)
+        if ( indWorkOrderSharingDim > 5 )
+        {
+            dims[indWorkOrderSharingDim] = dims[5];
+            dims[5] = WorkOrderShareDim_;
+        }
+
+        GADGET_CHECK_RETURN_FALSE(this->configureWorkOrder(dims));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusISMRMRDReconWorkFlowCartesian3DT<T>::recon() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h
new file mode 100644
index 0000000..42c42e5
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder.h
@@ -0,0 +1,871 @@
+/** \file   gtPlusISMRMRDReconWorkOrder.h
+    \brief  Define the GtPlus reconstruction workorder and parameters
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "gtPlusISMRMRDReconUtil.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+struct gtPlusReconWorkOrderPara
+{
+    ISMRMRDCALIBMODE CalibMode_; // calibration mode; the workflows compare it against ISMRMRD_interleaved/embedded/separate/noacceleration
+    ISMRMRDDIM InterleaveDim_; // interleaving dimension; read by the workflows when CalibMode_ is ISMRMRD_interleaved
+
+    // acceleration factor along E1 and E2
+    double acceFactorE1_;
+    double acceFactorE2_;
+
+    // kspace center for RO/E1/E2
+    size_t kSpaceCenterRO_;
+    size_t kSpaceCenterEncode1_;
+    size_t kSpaceCenterEncode2_;
+
+    // kspace max acquired number for RO/E1/E2
+    size_t kSpaceMaxRO_;
+    size_t kSpaceMaxEncode1_;
+    size_t kSpaceMaxEncode2_;
+
+    // for asymmetric echo
+    // the sampled range for RO
+    // if <0, the whole RO range is used (the constructor default is -1)
+    int start_RO_;
+    int end_RO_;
+
+    // sampled range for E1
+    int start_E1_;
+    int end_E1_;
+
+    // sampled range for E2
+    int start_E2_;
+    int end_E2_;
+
+    // work order has to have some interaction with work flow
+    // if true, work flow will buffer the kernel computed from this work order
+    bool workFlow_BufferKernel_;
+    // if true, work flow will use its buffered kernel for this work order
+    bool workFlow_use_BufferedKernel_;
+
+    // number of channels for the reconstruction results
+    // in most cases, it is 1
+    size_t num_channels_res_;
+
+    // -----------------------------------------------------------------------
+    // parameters
+    // -----------------------------------------------------------------------
+
+    // -------------------------------
+    // coil compression
+    // -------------------------------
+    bool upstream_coil_compression_;
+    double upstream_coil_compression_thres_;
+    int upstream_coil_compression_num_modesKept_;
+
+    bool downstream_coil_compression_;
+    double coil_compression_thres_;
+    int coil_compression_num_modesKept_;
+
+    // -------------------------------
+    // coil sensitivity estimation
+    // -------------------------------
+    Gadgetron::gtPlus::ISMRMRDCOILMAPALGO coil_map_algorithm_;
+
+    // for ISMRMRD_SOUHEIL
+    size_t csm_kSize_;
+    size_t csm_powermethod_num_;
+    // for 3D acquisition, whether to use the true 3D data correlation matrix
+    bool csm_true_3D_;
+
+    // for ISMRMRD_SOUHEIL_ITER
+    size_t csm_iter_num_;
+    double csm_iter_thres_;
+
+    // whether to use gpu for csm estimation
+    bool csm_use_gpu_;
+
+    // -------------------------------
+    // parameters for the various reconstruction algorithms
+    // -------------------------------
+    Gadgetron::gtPlus::ISMRMRDALGO recon_algorithm_;
+    bool recon_auto_parameters_;
+
+    // grappa
+    size_t grappa_kSize_RO_;
+    size_t grappa_kSize_E1_;
+    size_t grappa_kSize_E2_;
+    double grappa_reg_lamda_;
+    double grappa_calib_over_determine_ratio_;
+    bool grappa_use_gpu_;
+
+    // sense (no dedicated parameters declared here)
+
+    // soft sense (no dedicated parameters declared here)
+
+    // SPIRiT
+    size_t spirit_kSize_RO_;
+    size_t spirit_kSize_E1_;
+    size_t spirit_kSize_E2_;
+
+    double spirit_reg_lamda_;
+    double spirit_calib_over_determine_ratio_;
+
+    bool spirit_solve_symmetric_;
+
+    size_t spirit_iter_max_;
+    double spirit_iter_thres_;
+    bool spirit_print_iter_;
+
+    bool spirit_use_gpu_;
+
+    // L1 SPIRiT
+    bool spirit_perform_linear_;
+    bool spirit_perform_nonlinear_;
+
+    double spirit_parallel_imaging_lamda_;
+    double spirit_image_reg_lamda_;
+    double spirit_data_fidelity_lamda_;
+
+    size_t spirit_ncg_iter_max_;
+    double spirit_ncg_iter_thres_;
+    bool spirit_ncg_print_iter_;
+    double spirit_ncg_scale_factor_;
+
+    bool spirit_use_coil_sen_map_;
+    bool spirit_use_moco_enhancement_;
+    bool spirit_recon_moco_images_;
+
+    bool spirit_2D_scale_per_chunk_;
+    bool spirit_3D_scale_per_chunk_;
+
+    double spirit_RO_enhancement_ratio_;
+    double spirit_E1_enhancement_ratio_;
+    double spirit_E2_enhancement_ratio_;
+    double spirit_temporal_enhancement_ratio_;
+
+    // L1 soft sense (no dedicated parameters declared here)
+
+    // -------------------------------
+    // job split
+    // -------------------------------
+    bool job_split_by_S_;
+    size_t job_num_of_N_;
+    size_t job_max_Megabytes_;
+    size_t job_overlap_;
+    // whether to perform computation on the control node
+    bool job_perform_on_control_node_;
+
+    // -------------------------------
+    // partial fourier handling
+    // -------------------------------
+    // partial fourier handling algorithms
+    ISMRMRDPFALGO partialFourier_algo_;
+
+    // homodyne filter
+    // number of iterations
+    size_t partialFourier_homodyne_iters_;
+    // threshold to stop the iteration
+    double partialFourier_homodyne_thres_;
+    // density compensation for homodyne filter results
+    bool partialFourier_homodyne_densityComp_;
+
+    // POCS
+    // number of iterations
+    size_t partialFourier_POCS_iters_;
+    // threshold to stop the iteration
+    double partialFourier_POCS_thres_;
+    // transit band width
+    size_t partialFourier_POCS_transitBand_;
+    // transit band width for E2
+    size_t partialFourier_POCS_transitBand_E2_;
+
+    // Feng Huang method
+    // kernel size
+    size_t partialFourier_FengHuang_kSize_RO_;
+    size_t partialFourier_FengHuang_kSize_E1_;
+    size_t partialFourier_FengHuang_kSize_E2_;
+    // threshold for kernel estimation
+    double partialFourier_FengHuang_thresReg_;
+    // same kernel for all N
+    bool partialFourier_FengHuang_sameKernel_allN_;
+    // transit band width
+    size_t partialFourier_FengHuang_transitBand_;
+    // transit band width for E2
+    size_t partialFourier_FengHuang_transitBand_E2_;
+    // the constructor assigns a default value to each parameter declared above
+    gtPlusReconWorkOrderPara()
+    {
+        CalibMode_ = ISMRMRD_noacceleration;
+        InterleaveDim_ = DIM_NONE;
+
+        acceFactorE1_ = 1;
+        acceFactorE2_ = 1;
+
+        kSpaceCenterRO_ = 0;
+        kSpaceCenterEncode1_ = 0;
+        kSpaceCenterEncode2_ = 0;
+
+        kSpaceMaxRO_ = 1;
+        kSpaceMaxEncode1_ = 1;
+        kSpaceMaxEncode2_ = 1;
+
+        start_RO_ = -1;
+        end_RO_ = -1;
+
+        start_E1_ = -1;
+        end_E1_ = -1;
+
+        start_E2_ = -1;
+        end_E2_ = -1;
+
+        workFlow_BufferKernel_ = false;
+        workFlow_use_BufferedKernel_ = false;
+
+        num_channels_res_ = 1;
+        // coil compression
+        upstream_coil_compression_ = false;
+        upstream_coil_compression_thres_ = 1e-3;
+        upstream_coil_compression_num_modesKept_ = -1;
+
+        downstream_coil_compression_ = true;
+        coil_compression_thres_ = 1e-3;
+        coil_compression_num_modesKept_ = -1;
+        // coil sensitivity estimation
+        coil_map_algorithm_ = ISMRMRD_SOUHEIL;
+        csm_kSize_ = 7;
+        csm_powermethod_num_ = 3;
+        csm_true_3D_ = false;
+        csm_iter_num_ = 5;
+        csm_iter_thres_ = 1e-3;
+        csm_use_gpu_ = true;
+        // reconstruction algorithm selection
+        recon_algorithm_ = ISMRMRD_GRAPPA;
+        recon_auto_parameters_ = true;
+        // grappa
+        grappa_kSize_RO_ = 5;
+        grappa_kSize_E1_ = 4;
+        grappa_kSize_E2_ = 4;
+        grappa_reg_lamda_ = 0.0005;
+        grappa_calib_over_determine_ratio_ = 0;
+        grappa_use_gpu_ = true;
+        // SPIRiT
+        spirit_kSize_RO_ = 7;
+        spirit_kSize_E1_ = 7;
+        spirit_kSize_E2_ = 7;
+
+        spirit_reg_lamda_ = 0.005;
+        spirit_calib_over_determine_ratio_ = 0;
+
+        spirit_use_gpu_ = true;
+
+        spirit_solve_symmetric_ = false;
+
+        spirit_iter_max_ = 70;
+        spirit_iter_thres_ = 1e-5;
+        spirit_print_iter_ = false;
+        // L1 SPIRiT
+        spirit_perform_linear_ = true;
+        spirit_perform_nonlinear_ = true;
+
+        spirit_parallel_imaging_lamda_ = 1.0;
+        spirit_image_reg_lamda_ = 1e-3;
+        spirit_data_fidelity_lamda_ = 0;
+
+        spirit_ncg_iter_max_ = 10;
+        spirit_ncg_iter_thres_ = 1e-3;
+        spirit_ncg_print_iter_ = false;
+        spirit_ncg_scale_factor_ = 1.0;
+
+        spirit_use_coil_sen_map_ = true;
+        spirit_use_moco_enhancement_ = false;
+        spirit_recon_moco_images_ = false;
+
+        spirit_RO_enhancement_ratio_ = 1;
+        spirit_E1_enhancement_ratio_ = 1;
+        spirit_E2_enhancement_ratio_ = 1;
+        spirit_temporal_enhancement_ratio_ = 1;
+
+        spirit_2D_scale_per_chunk_ = false;
+        spirit_3D_scale_per_chunk_ = true;
+        // job split
+        job_split_by_S_ = false;
+        job_num_of_N_ = 0;
+        job_max_Megabytes_ = 20*1024;
+        job_overlap_ = 2;
+        job_perform_on_control_node_ = true;
+        // partial fourier handling
+        partialFourier_algo_ = ISMRMRD_PF_ZEROFILLING_FILTER;
+
+        partialFourier_homodyne_iters_ = 6;
+        partialFourier_homodyne_thres_ = 1e-2;
+        partialFourier_homodyne_densityComp_ = false;
+
+        partialFourier_POCS_iters_ = 6;
+        partialFourier_POCS_thres_ = 1e-2;
+        partialFourier_POCS_transitBand_ = 16;
+        partialFourier_POCS_transitBand_E2_ = 16;
+
+        partialFourier_FengHuang_kSize_RO_ = 5;
+        partialFourier_FengHuang_kSize_E1_ = 5;
+        partialFourier_FengHuang_kSize_E2_ = 5;
+        partialFourier_FengHuang_thresReg_ = 0.005;
+        partialFourier_FengHuang_sameKernel_allN_ = false;
+        partialFourier_FengHuang_transitBand_ = 16;
+        partialFourier_FengHuang_transitBand_E2_ = 16;
+    }
+
+    ~gtPlusReconWorkOrderPara() {}
+};
+
+
+
+template <typename T> 
+class gtPlusReconWorkOrder : public gtPlusReconWorkOrderPara
+{
+public:
+
+    gtPlusReconWorkOrder();
+    virtual ~gtPlusReconWorkOrder();
+
+    // reset the status of work order
+    // all computed calibration/coil sensitivity results
+    // are deleted
+    virtual bool reset();
+
+    // check and modify inconsistency in the work order
+    virtual bool enforceConsistency(ISMRMRDDIM& /*lastDim*/);
+    // record pairing a dimension with a size_t value (used by dataDimStartingIndexes_ below)
+    typedef std::pair<ISMRMRDDIM, size_t> DimensionRecordType;
+
+    // duplicate a workorder without copying the data arrays
+    virtual void duplicatePara(gtPlusReconWorkOrderPara& worder) const;
+    virtual void duplicate(gtPlusReconWorkOrder<T>& worder) const;
+
+    virtual void copyFromPara(const gtPlusReconWorkOrderPara& worder);
+
+    virtual void printInfo(std::ostream& os) const;
+    virtual void print(std::ostream& os) const;
+
+    // -------------------------------
+    // input
+    // -------------------------------
+    // kspace data
+    hoNDArray<T> data_;
+    // ref data
+    hoNDArray<T> ref_;
+
+    // noise data
+    hoNDArray<T> noise_;
+
+    // phase correction data
+    hoNDArray<T> phaseCorr_;
+
+    // other data
+    hoNDArray<T> other_;
+
+    // dimension starting indexes for the data_
+    std::vector< DimensionRecordType > dataDimStartingIndexes_;
+
+    // to support EPI and other trajectories
+    // if 1, the readout line is acquired inversely, otherwise, 0
+    hoNDArray<unsigned short> reflect_;
+    hoNDArray<unsigned short> reflect_ref_;
+    hoNDArray<unsigned short> reflect_phaseCorr_;
+    hoNDArray<unsigned short> reflect_other_;
+
+    // -------------------------------
+    // output
+    // -------------------------------
+    // reconstructed kspace
+    hoNDArray<T> fullkspace_;
+
+    // reconstructed images
+    hoNDArray<T> complexIm_;
+
+    // gfactor
+    hoNDArray<T> gfactor_;
+
+    // -------------------------------
+    // buffers for computation
+    // -------------------------------
+    // ref for recon
+    hoNDArray<T> ref_recon_;
+    // ref for coil map
+    hoNDArray<T> ref_coil_map_;
+
+    // store the estimated kernel, kernel in image domain
+    // if these fields are set before recon, they will be used
+    boost::shared_ptr< hoNDArray<T> > kernel_; // [RO E1 srcCHA dstCHA dstE1 1 or N S]
+    boost::shared_ptr< hoNDArray<T> > kernelIm_; // [RO E1 srcCHA dstCHA 1 or N S]
+    boost::shared_ptr< hoNDArray<T> > unmixingCoeffIm_; // [RO E1 srcCHA 1 or N S]
+    boost::shared_ptr< std::vector<hoMatrix<T> > > coilCompressionCoef_; // [dstCHA srcCHA] matrices
+    boost::shared_ptr< hoNDArray<T> > coilMap_; // [RO E1 dstCHA 1 or N S]
+
+    // -------------------------------
+    // kspace filter for RO/E1/E2 dimension, applied to the reconstruction results
+    // -------------------------------
+    // 1D filter for kspace data
+    hoNDArray<T> filterRO_;
+    hoNDArray<T> filterE1_;
+    hoNDArray<T> filterE2_;
+    // 2D and 3D filter, overwrite the 1D filters
+    hoNDArray<T> filterROE1_;
+    hoNDArray<T> filterROE1E2_;
+
+    // -------------------------------
+    // kspace filter for RO/E1/E2 dimension, applied to the ref data for coil map estimation
+    // -------------------------------
+    // filter for ref data
+    hoNDArray<T> filterRO_ref_;
+    hoNDArray<T> filterE1_ref_;
+    hoNDArray<T> filterE2_ref_;
+
+    hoNDArray<T> filterROE1_ref_;
+    hoNDArray<T> filterROE1E2_ref_;
+
+    // -------------------------------
+    // kspace filter for RO/E1/E2 dimension, applied to the data edge in case of partial fourier or asymmetric echo
+    // -------------------------------
+    // filter for partial fourier/asymmetric echo
+    hoNDArray<T> filterRO_partialfourier_;
+    hoNDArray<T> filterE1_partialfourier_;
+    hoNDArray<T> filterE2_partialfourier_;
+
+    hoNDArray<T> filterROE1_partialfourier_;
+    hoNDArray<T> filterROE1E2_partialfourier_;
+
+    // -------------------------------
+    // parameters for cloud computing
+    // -------------------------------
+    bool CloudComputing_;
+    unsigned int CloudSize_;
+    // NOTE(review): the meaning of the four CloudNodeType tuple fields is not visible here - confirm before relying on their order
+    typedef boost::tuple<std::string, std::string, std::string, unsigned int> CloudNodeType;
+    typedef std::vector<CloudNodeType> CloudType;
+
+    CloudType gt_cloud_;
+};
+
+template <typename T> 
+gtPlusReconWorkOrder<T>::gtPlusReconWorkOrder() : gtPlusReconWorkOrderPara()
+{
+    // Every shared buffer starts out owning its own default-constructed object,
+    // so none of these shared pointers is null after construction.
+    // The allocations are made in the same order as before.
+    kernel_ = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>() );
+
+    kernelIm_ = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>() );
+
+    unmixingCoeffIm_ = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>() );
+
+    // coil compression coefficients: a default-constructed vector of matrices
+    coilCompressionCoef_ = boost::shared_ptr< std::vector<hoMatrix<T> > >( new std::vector<hoMatrix<T> >() );
+
+    coilMap_ = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>() );
+
+    // cloud computing is switched off by default, with a cloud size of 0
+    CloudComputing_ = false;
+    CloudSize_ = 0;
+}
+
+template <typename T> 
+gtPlusReconWorkOrder<T>::~gtPlusReconWorkOrder()
+{
+}
+// base-class reset(): nothing is cleared here, it only reports success (the method is declared virtual)
+template <typename T> 
+bool gtPlusReconWorkOrder<T>::reset()
+{
+    return true;
+}
+// base-class enforceConsistency(): the argument is ignored and true is returned (the method is declared virtual)
+template <typename T> 
+bool gtPlusReconWorkOrder<T>::enforceConsistency(ISMRMRDDIM& /*lastDim*/)
+{
+    return true;
+}
+
+/// Copy the reconstruction parameters listed below from this work order into \p worder; filters, cloud settings and data arrays are not handled here.
+template <typename T> 
+void gtPlusReconWorkOrder<T>::duplicatePara(gtPlusReconWorkOrderPara& worder) const
+{
+    worder.CalibMode_ = CalibMode_;
+    worder.InterleaveDim_ = InterleaveDim_;
+
+    worder.acceFactorE1_ = acceFactorE1_;
+    worder.acceFactorE2_ = acceFactorE2_;
+
+    worder.kSpaceCenterRO_ = kSpaceCenterRO_;
+    worder.kSpaceCenterEncode1_ = kSpaceCenterEncode1_;
+    worder.kSpaceCenterEncode2_ = kSpaceCenterEncode2_;
+
+    worder.kSpaceMaxRO_ = kSpaceMaxRO_;
+    worder.kSpaceMaxEncode1_ = kSpaceMaxEncode1_;
+    worder.kSpaceMaxEncode2_ = kSpaceMaxEncode2_;
+
+    worder.workFlow_BufferKernel_ = workFlow_BufferKernel_;
+    worder.workFlow_use_BufferedKernel_ = workFlow_use_BufferedKernel_;
+    worder.num_channels_res_ = num_channels_res_;
+
+    worder.upstream_coil_compression_ = upstream_coil_compression_;
+    worder.upstream_coil_compression_thres_ = upstream_coil_compression_thres_;
+    worder.upstream_coil_compression_num_modesKept_ = upstream_coil_compression_num_modesKept_;
+
+    worder.downstream_coil_compression_ = downstream_coil_compression_;
+    worder.coil_compression_thres_ = coil_compression_thres_;
+    worder.coil_compression_num_modesKept_ = coil_compression_num_modesKept_;
+
+    worder.coil_map_algorithm_ = coil_map_algorithm_;
+    worder.csm_kSize_ = csm_kSize_;
+    worder.csm_powermethod_num_ = csm_powermethod_num_;
+    worder.csm_true_3D_ = csm_true_3D_;
+    worder.csm_iter_num_ = csm_iter_num_;
+    worder.csm_iter_thres_ = csm_iter_thres_;
+    worder.csm_use_gpu_ = csm_use_gpu_;
+
+    worder.start_RO_ = start_RO_;
+    worder.end_RO_ = end_RO_;
+
+    worder.start_E1_ = start_E1_;
+    worder.end_E1_ = end_E1_;
+
+    worder.start_E2_ = start_E2_;
+    worder.end_E2_ = end_E2_;
+
+    worder.recon_algorithm_ = recon_algorithm_;
+    worder.recon_auto_parameters_ = recon_auto_parameters_;
+
+    worder.grappa_kSize_RO_ = grappa_kSize_RO_;
+    worder.grappa_kSize_E1_ = grappa_kSize_E1_;
+    worder.grappa_kSize_E2_ = grappa_kSize_E2_;
+    worder.grappa_reg_lamda_ = grappa_reg_lamda_;
+    worder.grappa_calib_over_determine_ratio_ = grappa_calib_over_determine_ratio_;
+    worder.grappa_use_gpu_ = grappa_use_gpu_;
+
+    worder.spirit_kSize_RO_ = spirit_kSize_RO_;
+    worder.spirit_kSize_E1_ = spirit_kSize_E1_;
+    worder.spirit_kSize_E2_ = spirit_kSize_E2_;
+    worder.spirit_reg_lamda_ = spirit_reg_lamda_;
+    worder.spirit_use_gpu_ = spirit_use_gpu_;
+    worder.spirit_calib_over_determine_ratio_ = spirit_calib_over_determine_ratio_;
+    worder.spirit_solve_symmetric_ = spirit_solve_symmetric_;
+    worder.spirit_iter_max_ = spirit_iter_max_;
+    worder.spirit_iter_thres_ = spirit_iter_thres_;
+    worder.spirit_print_iter_ = spirit_print_iter_;
+
+    worder.spirit_perform_linear_ = spirit_perform_linear_;
+    worder.spirit_perform_nonlinear_ = spirit_perform_nonlinear_;
+    worder.spirit_parallel_imaging_lamda_ = spirit_parallel_imaging_lamda_;
+    worder.spirit_image_reg_lamda_ = spirit_image_reg_lamda_;
+    worder.spirit_data_fidelity_lamda_ = spirit_data_fidelity_lamda_;
+    worder.spirit_ncg_iter_max_ = spirit_ncg_iter_max_;
+    worder.spirit_ncg_iter_thres_ = spirit_ncg_iter_thres_;
+    worder.spirit_ncg_scale_factor_ = spirit_ncg_scale_factor_;
+    worder.spirit_ncg_print_iter_ = spirit_ncg_print_iter_;
+    worder.spirit_use_coil_sen_map_ = spirit_use_coil_sen_map_;
+    worder.spirit_use_moco_enhancement_ = spirit_use_moco_enhancement_;
+    worder.spirit_recon_moco_images_ = spirit_recon_moco_images_;
+    worder.spirit_RO_enhancement_ratio_ = spirit_RO_enhancement_ratio_;
+    worder.spirit_E1_enhancement_ratio_ = spirit_E1_enhancement_ratio_;
+    worder.spirit_E2_enhancement_ratio_ = spirit_E2_enhancement_ratio_;
+    worder.spirit_temporal_enhancement_ratio_ = spirit_temporal_enhancement_ratio_;
+    worder.spirit_2D_scale_per_chunk_ = spirit_2D_scale_per_chunk_;
+    worder.spirit_3D_scale_per_chunk_ = spirit_3D_scale_per_chunk_;
+
+    worder.job_split_by_S_ = job_split_by_S_;
+    worder.job_num_of_N_ = job_num_of_N_;
+    worder.job_max_Megabytes_ = job_max_Megabytes_;
+    worder.job_overlap_ = job_overlap_;
+    worder.job_perform_on_control_node_ = job_perform_on_control_node_;
+
+    worder.partialFourier_algo_ = partialFourier_algo_;
+
+    worder.partialFourier_homodyne_iters_ = partialFourier_homodyne_iters_;
+    worder.partialFourier_homodyne_thres_ = partialFourier_homodyne_thres_;
+    worder.partialFourier_homodyne_densityComp_ = partialFourier_homodyne_densityComp_;
+
+    worder.partialFourier_POCS_iters_ = partialFourier_POCS_iters_;
+    worder.partialFourier_POCS_thres_ = partialFourier_POCS_thres_;
+    worder.partialFourier_POCS_transitBand_ = partialFourier_POCS_transitBand_;
+    worder.partialFourier_POCS_transitBand_E2_ = partialFourier_POCS_transitBand_E2_;
+
+    worder.partialFourier_FengHuang_kSize_RO_ = partialFourier_FengHuang_kSize_RO_;
+    worder.partialFourier_FengHuang_kSize_E1_ = partialFourier_FengHuang_kSize_E1_;
+    worder.partialFourier_FengHuang_kSize_E2_ = partialFourier_FengHuang_kSize_E2_;
+    worder.partialFourier_FengHuang_thresReg_ = partialFourier_FengHuang_thresReg_;
+    worder.partialFourier_FengHuang_sameKernel_allN_ = partialFourier_FengHuang_sameKernel_allN_;
+    worder.partialFourier_FengHuang_transitBand_ = partialFourier_FengHuang_transitBand_;
+    worder.partialFourier_FengHuang_transitBand_E2_ = partialFourier_FengHuang_transitBand_E2_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::duplicate(gtPlusReconWorkOrder<T>& worder) const // copies settings (not the data arrays or computed results) into worder
+{
+    this->duplicatePara(worder); // parameter members first
+
+    worder.dataDimStartingIndexes_ = dataDimStartingIndexes_;
+
+    worder.filterRO_ = filterRO_; // filter members without suffix
+    worder.filterE1_ = filterE1_;
+    worder.filterE2_ = filterE2_;
+    worder.filterROE1_ = filterROE1_;
+    worder.filterROE1E2_ = filterROE1E2_;
+
+    worder.filterRO_ref_ = filterRO_ref_; // filter members with the _ref_ suffix
+    worder.filterE1_ref_ = filterE1_ref_;
+    worder.filterE2_ref_ = filterE2_ref_;
+    worder.filterROE1_ref_ = filterROE1_ref_;
+    worder.filterROE1E2_ref_ = filterROE1E2_ref_;
+
+    worder.filterRO_partialfourier_ = filterRO_partialfourier_; // filter members with the _partialfourier_ suffix
+    worder.filterE1_partialfourier_ = filterE1_partialfourier_;
+    worder.filterE2_partialfourier_ = filterE2_partialfourier_;
+    worder.filterROE1_partialfourier_ = filterROE1_partialfourier_;
+    worder.filterROE1E2_partialfourier_ = filterROE1E2_partialfourier_;
+
+    worder.CloudComputing_ = CloudComputing_; // cloud settings
+    worder.CloudSize_ = CloudSize_;
+    worder.gt_cloud_ = gt_cloud_;
+}
+
+/// Overwrite the reconstruction parameters of this work order with the values stored in \p worder (same member list as duplicatePara()).
+template <typename T> 
+void gtPlusReconWorkOrder<T>::copyFromPara(const gtPlusReconWorkOrderPara& worder)
+{
+    CalibMode_ = worder.CalibMode_;
+    InterleaveDim_ = worder.InterleaveDim_;
+
+    acceFactorE1_ = worder.acceFactorE1_;
+    acceFactorE2_ = worder.acceFactorE2_;
+
+    kSpaceCenterRO_ = worder.kSpaceCenterRO_;
+    kSpaceCenterEncode1_ = worder.kSpaceCenterEncode1_;
+    kSpaceCenterEncode2_ = worder.kSpaceCenterEncode2_;
+
+    kSpaceMaxRO_ = worder.kSpaceMaxRO_;
+    kSpaceMaxEncode1_ = worder.kSpaceMaxEncode1_;
+    kSpaceMaxEncode2_ = worder.kSpaceMaxEncode2_;
+
+    workFlow_BufferKernel_ = worder.workFlow_BufferKernel_;
+    workFlow_use_BufferedKernel_ = worder.workFlow_use_BufferedKernel_;
+    num_channels_res_ = worder.num_channels_res_;
+
+    upstream_coil_compression_ = worder.upstream_coil_compression_;
+    upstream_coil_compression_thres_ = worder.upstream_coil_compression_thres_;
+    upstream_coil_compression_num_modesKept_ = worder.upstream_coil_compression_num_modesKept_;
+
+    downstream_coil_compression_ = worder.downstream_coil_compression_;
+    coil_compression_thres_ = worder.coil_compression_thres_;
+    coil_compression_num_modesKept_ = worder.coil_compression_num_modesKept_;
+
+    coil_map_algorithm_ = worder.coil_map_algorithm_;
+    csm_kSize_ = worder.csm_kSize_;
+    csm_powermethod_num_ = worder.csm_powermethod_num_;
+    csm_true_3D_ = worder.csm_true_3D_;
+    csm_iter_num_ = worder.csm_iter_num_;
+    csm_iter_thres_ = worder.csm_iter_thres_;
+    csm_use_gpu_ = worder.csm_use_gpu_;
+
+    start_RO_ = worder.start_RO_;
+    end_RO_ = worder.end_RO_;
+
+    start_E1_ = worder.start_E1_;
+    end_E1_ = worder.end_E1_;
+
+    start_E2_ = worder.start_E2_;
+    end_E2_ = worder.end_E2_;
+
+    recon_algorithm_ = worder.recon_algorithm_;
+    recon_auto_parameters_ = worder.recon_auto_parameters_;
+
+    grappa_kSize_RO_ = worder.grappa_kSize_RO_;
+    grappa_kSize_E1_ = worder.grappa_kSize_E1_;
+    grappa_kSize_E2_ = worder.grappa_kSize_E2_;
+    grappa_reg_lamda_ = worder.grappa_reg_lamda_;
+    grappa_calib_over_determine_ratio_ = worder.grappa_calib_over_determine_ratio_;
+    grappa_use_gpu_ = worder.grappa_use_gpu_;
+
+    spirit_kSize_RO_ = worder.spirit_kSize_RO_;
+    spirit_kSize_E1_ = worder.spirit_kSize_E1_;
+    spirit_kSize_E2_ = worder.spirit_kSize_E2_;
+    spirit_reg_lamda_ = worder.spirit_reg_lamda_;
+    spirit_use_gpu_ = worder.spirit_use_gpu_;
+    spirit_calib_over_determine_ratio_ = worder.spirit_calib_over_determine_ratio_;
+    spirit_solve_symmetric_ = worder.spirit_solve_symmetric_;
+    spirit_iter_max_ = worder.spirit_iter_max_;
+    spirit_iter_thres_ = worder.spirit_iter_thres_;
+    spirit_print_iter_ = worder.spirit_print_iter_;
+
+    spirit_perform_linear_ = worder.spirit_perform_linear_;
+    spirit_perform_nonlinear_ = worder.spirit_perform_nonlinear_;
+    spirit_parallel_imaging_lamda_ = worder.spirit_parallel_imaging_lamda_;
+    spirit_image_reg_lamda_ = worder.spirit_image_reg_lamda_;
+    spirit_data_fidelity_lamda_ = worder.spirit_data_fidelity_lamda_;
+    spirit_ncg_iter_max_ = worder.spirit_ncg_iter_max_;
+    spirit_ncg_iter_thres_ = worder.spirit_ncg_iter_thres_;
+    spirit_ncg_scale_factor_ = worder.spirit_ncg_scale_factor_;
+    spirit_ncg_print_iter_ = worder.spirit_ncg_print_iter_;
+    spirit_use_coil_sen_map_ = worder.spirit_use_coil_sen_map_;
+    spirit_use_moco_enhancement_ = worder.spirit_use_moco_enhancement_;
+    spirit_recon_moco_images_ = worder.spirit_recon_moco_images_;
+    spirit_RO_enhancement_ratio_ = worder.spirit_RO_enhancement_ratio_;
+    spirit_E1_enhancement_ratio_ = worder.spirit_E1_enhancement_ratio_;
+    spirit_E2_enhancement_ratio_ = worder.spirit_E2_enhancement_ratio_;
+    spirit_temporal_enhancement_ratio_ = worder.spirit_temporal_enhancement_ratio_;
+    spirit_2D_scale_per_chunk_ = worder.spirit_2D_scale_per_chunk_;
+    spirit_3D_scale_per_chunk_ = worder.spirit_3D_scale_per_chunk_;
+
+    job_split_by_S_ = worder.job_split_by_S_;
+    job_num_of_N_ = worder.job_num_of_N_;
+    job_max_Megabytes_ = worder.job_max_Megabytes_;
+    job_overlap_ = worder.job_overlap_;
+    job_perform_on_control_node_ = worder.job_perform_on_control_node_;
+
+    partialFourier_algo_ = worder.partialFourier_algo_;
+
+    partialFourier_homodyne_iters_ = worder.partialFourier_homodyne_iters_;
+    partialFourier_homodyne_thres_ = worder.partialFourier_homodyne_thres_;
+    partialFourier_homodyne_densityComp_ = worder.partialFourier_homodyne_densityComp_;
+
+    partialFourier_POCS_iters_ = worder.partialFourier_POCS_iters_;
+    partialFourier_POCS_thres_ = worder.partialFourier_POCS_thres_;
+    partialFourier_POCS_transitBand_ = worder.partialFourier_POCS_transitBand_;
+    partialFourier_POCS_transitBand_E2_ = worder.partialFourier_POCS_transitBand_E2_;
+
+    partialFourier_FengHuang_kSize_RO_ = worder.partialFourier_FengHuang_kSize_RO_;
+    partialFourier_FengHuang_kSize_E1_ = worder.partialFourier_FengHuang_kSize_E1_;
+    partialFourier_FengHuang_kSize_E2_ = worder.partialFourier_FengHuang_kSize_E2_;
+    partialFourier_FengHuang_thresReg_ = worder.partialFourier_FengHuang_thresReg_;
+    partialFourier_FengHuang_sameKernel_allN_ = worder.partialFourier_FengHuang_sameKernel_allN_;
+    partialFourier_FengHuang_transitBand_ = worder.partialFourier_FengHuang_transitBand_;
+    partialFourier_FengHuang_transitBand_E2_ = worder.partialFourier_FengHuang_transitBand_E2_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::printInfo(std::ostream& os) const // passes every parameter to GADGET_OSTREAM_PRINT on os; no banner lines
+{
+    using namespace std;
+    GADGET_OSTREAM_PRINT(os, CalibMode_);
+    GADGET_OSTREAM_PRINT(os, InterleaveDim_);
+    GADGET_OSTREAM_PRINT(os, acceFactorE1_);
+    GADGET_OSTREAM_PRINT(os, acceFactorE2_);
+    os << std::endl; // std::endl separates the parameter groups
+    GADGET_OSTREAM_PRINT(os, kSpaceCenterRO_);
+    GADGET_OSTREAM_PRINT(os, kSpaceCenterEncode1_);
+    GADGET_OSTREAM_PRINT(os, kSpaceCenterEncode2_);
+    GADGET_OSTREAM_PRINT(os, kSpaceMaxRO_);
+    GADGET_OSTREAM_PRINT(os, kSpaceMaxEncode1_);
+    GADGET_OSTREAM_PRINT(os, kSpaceMaxEncode2_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, workFlow_BufferKernel_);
+    GADGET_OSTREAM_PRINT(os, workFlow_use_BufferedKernel_);
+    GADGET_OSTREAM_PRINT(os, num_channels_res_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, upstream_coil_compression_);
+    GADGET_OSTREAM_PRINT(os, upstream_coil_compression_thres_);
+    GADGET_OSTREAM_PRINT(os, upstream_coil_compression_num_modesKept_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, downstream_coil_compression_);
+    GADGET_OSTREAM_PRINT(os, coil_compression_thres_);
+    GADGET_OSTREAM_PRINT(os, coil_compression_num_modesKept_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, coil_map_algorithm_);
+    GADGET_OSTREAM_PRINT(os, csm_kSize_);
+    GADGET_OSTREAM_PRINT(os, csm_powermethod_num_);
+    GADGET_OSTREAM_PRINT(os, csm_true_3D_);
+    GADGET_OSTREAM_PRINT(os, csm_iter_num_);
+    GADGET_OSTREAM_PRINT(os, csm_iter_thres_);
+    GADGET_OSTREAM_PRINT(os, csm_use_gpu_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, start_RO_);
+    GADGET_OSTREAM_PRINT(os, end_RO_);
+    GADGET_OSTREAM_PRINT(os, start_E1_);
+    GADGET_OSTREAM_PRINT(os, end_E1_);
+    GADGET_OSTREAM_PRINT(os, start_E2_);
+    GADGET_OSTREAM_PRINT(os, end_E2_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, recon_algorithm_);
+    GADGET_OSTREAM_PRINT(os, recon_auto_parameters_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, grappa_kSize_RO_);
+    GADGET_OSTREAM_PRINT(os, grappa_kSize_E1_);
+    GADGET_OSTREAM_PRINT(os, grappa_kSize_E2_);
+    GADGET_OSTREAM_PRINT(os, grappa_reg_lamda_);
+    GADGET_OSTREAM_PRINT(os, grappa_calib_over_determine_ratio_);
+    GADGET_OSTREAM_PRINT(os, grappa_use_gpu_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, spirit_kSize_RO_);
+    GADGET_OSTREAM_PRINT(os, spirit_kSize_E1_);
+    GADGET_OSTREAM_PRINT(os, spirit_kSize_E2_);
+    GADGET_OSTREAM_PRINT(os, spirit_reg_lamda_);
+    GADGET_OSTREAM_PRINT(os, spirit_use_gpu_);
+    GADGET_OSTREAM_PRINT(os, spirit_calib_over_determine_ratio_);
+    GADGET_OSTREAM_PRINT(os, spirit_solve_symmetric_);
+    GADGET_OSTREAM_PRINT(os, spirit_iter_max_);
+    GADGET_OSTREAM_PRINT(os, spirit_iter_thres_);
+    GADGET_OSTREAM_PRINT(os, spirit_print_iter_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, spirit_perform_linear_);
+    GADGET_OSTREAM_PRINT(os, spirit_perform_nonlinear_);
+    GADGET_OSTREAM_PRINT(os, spirit_parallel_imaging_lamda_);
+    GADGET_OSTREAM_PRINT(os, spirit_image_reg_lamda_);
+    GADGET_OSTREAM_PRINT(os, spirit_data_fidelity_lamda_);
+    GADGET_OSTREAM_PRINT(os, spirit_ncg_iter_max_);
+    GADGET_OSTREAM_PRINT(os, spirit_ncg_iter_thres_);
+    GADGET_OSTREAM_PRINT(os, spirit_ncg_scale_factor_);
+    GADGET_OSTREAM_PRINT(os, spirit_ncg_print_iter_);
+    GADGET_OSTREAM_PRINT(os, spirit_use_coil_sen_map_);
+    GADGET_OSTREAM_PRINT(os, spirit_use_moco_enhancement_);
+    GADGET_OSTREAM_PRINT(os, spirit_recon_moco_images_);
+    GADGET_OSTREAM_PRINT(os, spirit_RO_enhancement_ratio_);
+    GADGET_OSTREAM_PRINT(os, spirit_E1_enhancement_ratio_);
+    GADGET_OSTREAM_PRINT(os, spirit_E2_enhancement_ratio_);
+    GADGET_OSTREAM_PRINT(os, spirit_temporal_enhancement_ratio_);
+    GADGET_OSTREAM_PRINT(os, spirit_2D_scale_per_chunk_);
+    GADGET_OSTREAM_PRINT(os, spirit_3D_scale_per_chunk_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, job_split_by_S_);
+    GADGET_OSTREAM_PRINT(os, job_num_of_N_);
+    GADGET_OSTREAM_PRINT(os, job_max_Megabytes_);
+    GADGET_OSTREAM_PRINT(os, job_overlap_);
+    GADGET_OSTREAM_PRINT(os, job_perform_on_control_node_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, partialFourier_algo_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_homodyne_iters_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_homodyne_thres_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_homodyne_densityComp_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_POCS_iters_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_POCS_thres_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_POCS_transitBand_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_POCS_transitBand_E2_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_kSize_RO_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_kSize_E1_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_kSize_E2_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_thresReg_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_sameKernel_allN_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_transitBand_);
+    GADGET_OSTREAM_PRINT(os, partialFourier_FengHuang_transitBand_E2_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, CloudComputing_);
+    GADGET_OSTREAM_PRINT(os, CloudSize_);
+    for ( unsigned int nn=0; nn<gt_cloud_.size(); nn++ ) // one print per element of gt_cloud_
+    {
+        GADGET_OSTREAM_PRINT(os, gt_cloud_[nn]);
+    }
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder<T>::print(std::ostream& os) const // writes printInfo() output to os between two banner lines
+{
+    using namespace std;
+    os << "-------------- gtPlusReconWorkOrder ---------------" << endl;
+    printInfo(os);
+    os << "---------------------------------------------------" << endl;
+}
+
+}}
+
+#include "gtPlusISMRMRDReconWorkOrder2DT.h"
+#include "gtPlusISMRMRDReconWorkOrder3DT.h"
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder2DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder2DT.h
new file mode 100644
index 0000000..89e0cea
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder2DT.h
@@ -0,0 +1,382 @@
+/** \file   gtPlusISMRMRDReconWorkOrder2DT.h
+    \brief  Define the GtPlus reconstruction workorder and parameters for 2DT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorkOrder2DT : public gtPlusReconWorkOrder<T> // work order for 2DT recon: base parameters plus the per-calibration-mode options declared below
+{
+public:
+
+    typedef gtPlusReconWorkOrder<T> BaseClass;
+
+    gtPlusReconWorkOrder2DT();
+    virtual ~gtPlusReconWorkOrder2DT();
+
+    virtual bool reset(); // clears the computed kernels, coil maps and reconstruction results
+
+    virtual bool enforceConsistency(ISMRMRDDIM& lastDim);
+    virtual void duplicate(gtPlusReconWorkOrder2DT<T>& worder); // NOTE(review): non-const and takes a 2DT reference, so it hides BaseClass::duplicate(gtPlusReconWorkOrder<T>&) const instead of overriding it
+
+    virtual void printInfo(std::ostream& os) const;
+    virtual void print(std::ostream& os) const;
+
+    // kspace_: [RO E1 CHA N S], for 2D recon, N can be 1
+    // ref_: [RO E1 CHA M S], M can be equal to N, 1, or another value
+    // fullkspace_: [RO E1 CHA N S]
+    // complexIm_: [RO E1 N S], after coil combination or [RO E1 num_channels_res_ N S] if num_channels_res_ > 1
+    // coilMap_: [RO E1 CHA 1 or N S]
+    // gfactor_: [RO E1 CHA 1 or N S]
+
+    // the fifth dimension can be SLC or SET or others
+
+    // default behavior
+    // a) the coil compression coefficients are computed once across all S
+    // b) the kernel or coil sensitivity are estimated for every S
+
+    // embedded mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA]
+    // b) coil combination uses different coil maps for every S
+    // c) if the kspace recon is performed, the coil combination map is re-estimated on the fullkspace for every 2D image
+    // d) the ref lines are filled back to fullkspace_
+
+    // separate mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 2D kspace [RO E1 CHA] if M==N
+    // b) if M==1, the kernel is only estimated once for every S
+    // c) coil combination uses different coil maps for every S
+    // d) if the kspace recon is performed, the coil combination map is re-estimated on the fullkspace for every 2D image
+
+    // interleave
+    // a) the average-all ref is used
+    // b) kernel/coil sensitivity is estimated once for every S
+
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::ref_recon_;
+    using BaseClass::ref_coil_map_;
+    using BaseClass::CalibMode_;
+    using BaseClass::InterleaveDim_;
+    using BaseClass::acceFactorE1_;
+    using BaseClass::acceFactorE2_;
+    using BaseClass::num_channels_res_;
+    using BaseClass::coilMap_; // [RO E1 dstCHA 1 or N S]
+    using BaseClass::fullkspace_; // [RO E1 dstCHA N S]
+    using BaseClass::complexIm_; // [RO E1 N S]
+    using BaseClass::gfactor_; // [RO E1 1 or N S]
+
+    using BaseClass::downstream_coil_compression_;
+    using BaseClass::coil_compression_thres_;
+    using BaseClass::coil_compression_num_modesKept_;
+    using BaseClass::csm_kSize_;
+    using BaseClass::csm_powermethod_num_;
+    using BaseClass::csm_true_3D_;
+    using BaseClass::csm_iter_num_;
+    using BaseClass::csm_iter_thres_;
+    using BaseClass::csm_use_gpu_;
+    using BaseClass::start_RO_;
+    using BaseClass::end_RO_;
+    using BaseClass::start_E1_;
+    using BaseClass::end_E1_;
+    using BaseClass::start_E2_;
+    using BaseClass::end_E2_;
+
+    using BaseClass::filterRO_;
+    using BaseClass::filterE1_;
+    using BaseClass::filterE2_;
+    using BaseClass::filterROE1_;
+    using BaseClass::filterROE1E2_;
+
+    using BaseClass::filterRO_ref_;
+    using BaseClass::filterE1_ref_;
+    using BaseClass::filterE2_ref_;
+    using BaseClass::filterROE1_ref_;
+    using BaseClass::filterROE1E2_ref_;
+
+    using BaseClass::filterRO_partialfourier_;
+    using BaseClass::filterE1_partialfourier_;
+    using BaseClass::filterE2_partialfourier_;
+    using BaseClass::filterROE1_partialfourier_;
+    using BaseClass::filterROE1E2_partialfourier_;
+
+    using BaseClass::recon_algorithm_;
+
+    using BaseClass::grappa_kSize_RO_;
+    using BaseClass::grappa_kSize_E1_;
+    using BaseClass::grappa_kSize_E2_;
+    using BaseClass::grappa_reg_lamda_;
+    using BaseClass::grappa_calib_over_determine_ratio_;
+    using BaseClass::grappa_use_gpu_;
+
+    using BaseClass::spirit_kSize_RO_;
+    using BaseClass::spirit_kSize_E1_;
+    using BaseClass::spirit_kSize_E2_;
+    using BaseClass::spirit_reg_lamda_;
+    using BaseClass::spirit_use_gpu_;
+    using BaseClass::spirit_iter_max_;
+    using BaseClass::spirit_iter_thres_;
+    using BaseClass::spirit_print_iter_;
+
+    using BaseClass::spirit_perform_linear_;
+    using BaseClass::spirit_perform_nonlinear_;
+    using BaseClass::spirit_parallel_imaging_lamda_;
+    using BaseClass::spirit_image_reg_lamda_;
+    using BaseClass::spirit_data_fidelity_lamda_;
+    using BaseClass::spirit_ncg_iter_max_;
+    using BaseClass::spirit_ncg_iter_thres_;
+    using BaseClass::spirit_ncg_scale_factor_;
+    using BaseClass::spirit_ncg_print_iter_;
+    using BaseClass::spirit_use_coil_sen_map_;
+    using BaseClass::spirit_use_moco_enhancement_;
+    using BaseClass::spirit_recon_moco_images_;
+    using BaseClass::spirit_RO_enhancement_ratio_;
+    using BaseClass::spirit_E1_enhancement_ratio_;
+    using BaseClass::spirit_E2_enhancement_ratio_;
+    using BaseClass::spirit_temporal_enhancement_ratio_;
+
+    using BaseClass::job_split_by_S_;
+    using BaseClass::job_num_of_N_;
+    using BaseClass::job_max_Megabytes_;
+    using BaseClass::job_overlap_;
+
+    using BaseClass::partialFourier_algo_;
+    using BaseClass::partialFourier_homodyne_iters_;
+    using BaseClass::partialFourier_homodyne_thres_;
+    using BaseClass::partialFourier_homodyne_densityComp_;
+    using BaseClass::partialFourier_POCS_iters_;
+    using BaseClass::partialFourier_POCS_thres_;
+    using BaseClass::partialFourier_POCS_transitBand_;
+    using BaseClass::partialFourier_FengHuang_kSize_RO_;
+    using BaseClass::partialFourier_FengHuang_kSize_E1_;
+    using BaseClass::partialFourier_FengHuang_kSize_E2_;
+    using BaseClass::partialFourier_FengHuang_thresReg_;
+    using BaseClass::partialFourier_FengHuang_sameKernel_allN_;
+    using BaseClass::partialFourier_FengHuang_transitBand_;
+
+    using BaseClass::CloudComputing_;
+    using BaseClass::CloudSize_;
+    using BaseClass::gt_cloud_;
+
+    // for 2DT
+    using BaseClass::kernel_; // [RO E1 srcCHA dstCHA dstE1 1 or N S]
+    using BaseClass::kernelIm_; // [RO E1 srcCHA dstCHA 1 or N S]
+    using BaseClass::unmixingCoeffIm_; // [RO E1 srcCHA 1 or N S]
+    using BaseClass::coilCompressionCoef_; // [dstCHA srcCHA] matrices
+
+    // parameters to change the default behavior
+
+    // if true, the actual full kspace is computed, not only the coil combined complex images
+    bool recon_kspace_needed_;
+
+    // NOTE(review): this comment originally read "if true, no coil compression will be performed"; the member name and its constructor default (true) suggest that true enables coil compression -- confirm
+    bool coil_compression_;
+    // if true, the same coil compression coefficient is computed for all S
+    bool same_coil_compression_coeff_allS_;
+
+    // no acceleration
+    // if true, the average of all M ref will be used
+    // the coil sensitivity will be only estimated once for all N
+    bool no_acceleration_averageall_ref_;
+    // number of modes kept for ref data
+    int no_acceleration_ref_numOfModes_;
+    // if true, the same coil combination coefficients will be used for all S
+    bool no_acceleration_same_combinationcoeff_allS_;
+    // if no_acceleration_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t no_acceleration_whichS_combinationcoeff_;
+
+    // embedded mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will be only estimated once for all N
+    bool embedded_averageall_ref_;
+    // number of modes kept for ref data
+    int embedded_ref_numOfModes_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool embedded_fullres_coilmap_;
+    // if embedded_averageall_ref_==true && embedded_fullres_coilmap_==true, whether to select the highest signal frame to compute full res coil map
+    // if false, the averageall image will be used to compute full res coil map
+    bool embedded_fullres_coilmap_useHighestSignal_;
+    // if true, the same coil combination coefficients will be used for all S
+    bool embedded_same_combinationcoeff_allS_;
+    // if embedded_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t embedded_whichS_combinationcoeff_;
+    // if true, the ref lines will be filled back to fullkspace
+    bool embedded_ref_fillback_;
+
+    // separate mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will be only estimated once for every S
+    bool separate_averageall_ref_;
+    // number of modes kept for ref data
+    int separate_ref_numOfModes_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool separate_fullres_coilmap_;
+    // if true, the same coil combination coefficients will be used for all S
+    bool separate_same_combinationcoeff_allS_;
+    // if separate_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t separate_whichS_combinationcoeff_;
+
+    // interleaved mode
+    // if true, the same coil combination coefficients will be used for all S
+    bool interleaved_same_combinationcoeff_allS_;
+    // if interleaved_same_combinationcoeff_allS_==true, select the S for coil combination coefficient estimation
+    size_t interleaved_whichS_combinationcoeff_;
+    // number of modes kept for ref data
+    int interleaved_ref_numOfModes_;
+};
+
+template <typename T> 
+gtPlusReconWorkOrder2DT<T>::gtPlusReconWorkOrder2DT() : BaseClass(), recon_kspace_needed_(false)
+{   // recon_kspace_needed_ used to be left uninitialized although duplicate()/printInfo() read it; false is an assumed default -- TODO confirm
+    coil_compression_ = true;
+    same_coil_compression_coeff_allS_ = false;
+
+    no_acceleration_averageall_ref_ = true;
+    no_acceleration_ref_numOfModes_ = 3;
+    no_acceleration_same_combinationcoeff_allS_ = false;
+    no_acceleration_whichS_combinationcoeff_ = 0;
+
+    embedded_averageall_ref_ = false;
+    embedded_ref_numOfModes_ = 3;
+    embedded_fullres_coilmap_ = true;
+    embedded_fullres_coilmap_useHighestSignal_ = false;
+    embedded_same_combinationcoeff_allS_ = false;
+    embedded_whichS_combinationcoeff_ = 0; // size_t index of S, not a flag
+    embedded_ref_fillback_ = true;
+
+    separate_averageall_ref_ = false;
+    separate_ref_numOfModes_ = 3;
+    separate_fullres_coilmap_ = true;
+    separate_same_combinationcoeff_allS_ = false;
+    separate_whichS_combinationcoeff_ = 0; // size_t index of S, not a flag
+
+    interleaved_same_combinationcoeff_allS_ = false;
+    interleaved_whichS_combinationcoeff_ = 0; // size_t index of S, not a flag
+    interleaved_ref_numOfModes_ = 0;
+}
+
+template <typename T> 
+gtPlusReconWorkOrder2DT<T>::~gtPlusReconWorkOrder2DT() // destructor with an empty body; no explicit cleanup is done here
+{
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder2DT<T>::reset() // clears computed calibration results and outputs; data_ and ref_ are not touched
+{
+    try
+    {
+        kernel_->clear(); // these five members are accessed through pointer syntax (->)
+        kernelIm_->clear();
+        unmixingCoeffIm_->clear();
+        coilCompressionCoef_->clear();
+        coilMap_->clear();
+
+        fullkspace_.clear(); // these three members are cleared directly
+        complexIm_.clear();
+        gfactor_.clear();
+    }
+    catch(...) // any exception thrown above is reported and turned into a false return
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorkOrder2DT<T>::reset() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder2DT<T>::enforceConsistency(ISMRMRDDIM& lastDim) // switches off every "same ... for all S" option when lastDim is DIM_Slice
+{
+    if ( lastDim == DIM_Slice ) // presumably because S then indexes slices, which should not share coefficients -- confirm
+    {
+        same_coil_compression_coeff_allS_ = false;
+        no_acceleration_same_combinationcoeff_allS_ = false;
+        embedded_same_combinationcoeff_allS_ = false;
+        separate_same_combinationcoeff_allS_ = false;
+        interleaved_same_combinationcoeff_allS_ = false;
+    }
+
+    return true; // always reports success
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder2DT<T>::duplicate(gtPlusReconWorkOrder2DT<T>& worder) // copies the base settings and then every 2DT-specific option into worder
+{
+    BaseClass::duplicate(worder); // base-class part first
+
+    worder.recon_kspace_needed_ = recon_kspace_needed_;
+
+    worder.coil_compression_ = coil_compression_;
+    worder.same_coil_compression_coeff_allS_ = same_coil_compression_coeff_allS_;
+
+    worder.no_acceleration_averageall_ref_ = no_acceleration_averageall_ref_; // no-acceleration options
+    worder.no_acceleration_ref_numOfModes_ = no_acceleration_ref_numOfModes_;
+    worder.no_acceleration_same_combinationcoeff_allS_ = no_acceleration_same_combinationcoeff_allS_;
+    worder.no_acceleration_whichS_combinationcoeff_ = no_acceleration_whichS_combinationcoeff_;
+
+    worder.embedded_averageall_ref_ = embedded_averageall_ref_; // embedded-mode options
+    worder.embedded_ref_numOfModes_ = embedded_ref_numOfModes_;
+    worder.embedded_fullres_coilmap_ = embedded_fullres_coilmap_;
+    worder.embedded_fullres_coilmap_useHighestSignal_ = embedded_fullres_coilmap_useHighestSignal_;
+    worder.embedded_same_combinationcoeff_allS_ = embedded_same_combinationcoeff_allS_;
+    worder.embedded_whichS_combinationcoeff_ = embedded_whichS_combinationcoeff_;
+    worder.embedded_ref_fillback_ = embedded_ref_fillback_;
+
+    worder.separate_averageall_ref_ = separate_averageall_ref_; // separate-mode options
+    worder.separate_ref_numOfModes_ = separate_ref_numOfModes_;
+    worder.separate_fullres_coilmap_ = separate_fullres_coilmap_;
+    worder.separate_same_combinationcoeff_allS_ = separate_same_combinationcoeff_allS_;
+    worder.separate_whichS_combinationcoeff_ = separate_whichS_combinationcoeff_;
+
+    worder.interleaved_same_combinationcoeff_allS_ = interleaved_same_combinationcoeff_allS_; // interleaved-mode options
+    worder.interleaved_whichS_combinationcoeff_ = interleaved_whichS_combinationcoeff_;
+    worder.interleaved_ref_numOfModes_ = interleaved_ref_numOfModes_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder2DT<T>::printInfo(std::ostream& os) const // base-class parameters first, then the 2DT-specific options; no banner lines
+{
+    using namespace std;
+    BaseClass::printInfo(os);
+
+    GADGET_OSTREAM_PRINT(os, recon_kspace_needed_);
+    os << std::endl; // std::endl separates the option groups
+    GADGET_OSTREAM_PRINT(os, coil_compression_);
+    GADGET_OSTREAM_PRINT(os, same_coil_compression_coeff_allS_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, no_acceleration_averageall_ref_);
+    GADGET_OSTREAM_PRINT(os, no_acceleration_ref_numOfModes_);
+    GADGET_OSTREAM_PRINT(os, no_acceleration_same_combinationcoeff_allS_);
+    GADGET_OSTREAM_PRINT(os, no_acceleration_whichS_combinationcoeff_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, embedded_averageall_ref_);
+    GADGET_OSTREAM_PRINT(os, embedded_ref_numOfModes_);
+    GADGET_OSTREAM_PRINT(os, embedded_fullres_coilmap_);
+    GADGET_OSTREAM_PRINT(os, embedded_fullres_coilmap_useHighestSignal_);
+    GADGET_OSTREAM_PRINT(os, embedded_same_combinationcoeff_allS_);
+    GADGET_OSTREAM_PRINT(os, embedded_whichS_combinationcoeff_);
+    GADGET_OSTREAM_PRINT(os, embedded_ref_fillback_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, separate_averageall_ref_);
+    GADGET_OSTREAM_PRINT(os, separate_ref_numOfModes_);
+    GADGET_OSTREAM_PRINT(os, separate_fullres_coilmap_);
+    GADGET_OSTREAM_PRINT(os, separate_same_combinationcoeff_allS_);
+    GADGET_OSTREAM_PRINT(os, separate_whichS_combinationcoeff_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, interleaved_same_combinationcoeff_allS_);
+    GADGET_OSTREAM_PRINT(os, interleaved_whichS_combinationcoeff_);
+    GADGET_OSTREAM_PRINT(os, interleaved_ref_numOfModes_);
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder2DT<T>::print(std::ostream& os) const // writes printInfo() output to os between two banner lines
+{
+    using namespace std;
+    os << "-------------- gtPlusReconWorkOrder2DT ---------------" << endl;
+    printInfo(os);
+    os << "------------------------------------------------------" << endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder3DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder3DT.h
new file mode 100644
index 0000000..9398cd5
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorkOrder3DT.h
@@ -0,0 +1,355 @@
+/** \file   gtPlusISMRMRDReconWorkOrder3DT.h
+    \brief  Define the GtPlus reconstruction workorder and parameters for 3DT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorkOrder3DT : public gtPlusReconWorkOrder<T>
+{
+public:
+    // 3DT work order: the common gtPlusReconWorkOrder<T> plus the 3D-specific switches declared below
+    typedef gtPlusReconWorkOrder<T> BaseClass;
+
+    gtPlusReconWorkOrder3DT();
+    virtual ~gtPlusReconWorkOrder3DT();
+
+    // reset the status of work order
+    // all computed calibration/coil sensitivity results are deleted
+    virtual bool reset();
+
+    // check and fix inconsistent settings; for lastDim == DIM_Slice the average-all-ref and same-combination-coefficient switches are turned off
+    virtual bool enforceConsistency(ISMRMRDDIM& lastDim);
+
+    virtual void duplicate(gtPlusReconWorkOrder3DT<T>& worder); // copies the settings of this work order into worder
+
+    virtual void printInfo(std::ostream& os) const; // base-class info followed by the parameters declared in this class
+    virtual void print(std::ostream& os) const; // printInfo() framed by a header and a footer line
+
+    // kspace_: [RO E1 E2 CHA N], for 3D recon, N can be 1
+    // ref_: [RO E1 E2 CHA M], M can equal to N or 1 or others
+    // fullkspace_: [RO E1 E2 CHA N]
+    // complexIm_: [RO E1 E2 N], after coil combination
+    // coilMap_: [RO E1 E2 CHA 1 or N]
+    // gfactor_: [RO E1 E2 CHA 1 or N]
+
+    // the fifth dimension can be the temporal dimension or others
+
+    // default behavior
+    // a) the coil compression coefficients are computed once for all N
+
+    // embedded mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 3D kspace [RO E1 E2 CHA]
+    // b) coil combination uses different coil maps for every N
+    // c) if the kspace recon is performed, the coil combination map is reestimated on the fullkspace for every 3D images
+    // d) the ref lines are filled back to fullkspace_
+
+    // separate mode
+    // a) perform recon and estimate kernel/coil sensitivity for every 3D kspace [RO E1 E2 CHA] if M==N
+    // b) if M==1, the kernel is only estimated once for all N
+    // c) coil combination uses different coil maps for every N
+    // d) if the kspace recon is performed, the coil combination map is reestimated on the fullkspace for every 3D images
+
+    // interleaved mode
+    // a) the average-all ref is used
+    // b) kernel/coil sensitivity is estimated once for all N
+
+    using BaseClass::data_;
+    using BaseClass::ref_;
+    using BaseClass::ref_recon_;
+    using BaseClass::ref_coil_map_;
+    using BaseClass::CalibMode_;
+    using BaseClass::InterleaveDim_;
+    using BaseClass::acceFactorE1_;
+    using BaseClass::acceFactorE2_;
+    using BaseClass::num_channels_res_;
+
+    using BaseClass::coilMap_;
+    using BaseClass::fullkspace_;
+    using BaseClass::complexIm_;
+    using BaseClass::gfactor_;
+
+    using BaseClass::upstream_coil_compression_;
+    using BaseClass::upstream_coil_compression_thres_;
+    using BaseClass::upstream_coil_compression_num_modesKept_;
+
+    using BaseClass::downstream_coil_compression_;
+    using BaseClass::coil_compression_thres_;
+    using BaseClass::coil_compression_num_modesKept_;
+    using BaseClass::csm_kSize_;
+    using BaseClass::csm_powermethod_num_;
+    using BaseClass::csm_true_3D_;
+    using BaseClass::csm_iter_num_;
+    using BaseClass::csm_iter_thres_;
+    using BaseClass::csm_use_gpu_;
+    using BaseClass::start_RO_;
+    using BaseClass::end_RO_;
+    using BaseClass::start_E1_;
+    using BaseClass::end_E1_;
+    using BaseClass::start_E2_;
+    using BaseClass::end_E2_;
+
+    using BaseClass::filterRO_;
+    using BaseClass::filterE1_;
+    using BaseClass::filterE2_;
+    using BaseClass::filterROE1_;
+    using BaseClass::filterROE1E2_;
+
+    using BaseClass::filterRO_ref_;
+    using BaseClass::filterE1_ref_;
+    using BaseClass::filterE2_ref_;
+    using BaseClass::filterROE1_ref_;
+    using BaseClass::filterROE1E2_ref_;
+
+    using BaseClass::filterRO_partialfourier_;
+    using BaseClass::filterE1_partialfourier_;
+    using BaseClass::filterE2_partialfourier_;
+    using BaseClass::filterROE1_partialfourier_;
+    using BaseClass::filterROE1E2_partialfourier_;
+
+    using BaseClass::recon_algorithm_;
+
+    using BaseClass::grappa_kSize_RO_;
+    using BaseClass::grappa_kSize_E1_;
+    using BaseClass::grappa_kSize_E2_;
+    using BaseClass::grappa_reg_lamda_;
+    using BaseClass::grappa_calib_over_determine_ratio_;
+    using BaseClass::grappa_use_gpu_;
+
+    using BaseClass::spirit_kSize_RO_;
+    using BaseClass::spirit_kSize_E1_;
+    using BaseClass::spirit_kSize_E2_;
+    using BaseClass::spirit_reg_lamda_;
+    using BaseClass::spirit_use_gpu_;
+    using BaseClass::spirit_iter_max_;
+    using BaseClass::spirit_iter_thres_;
+    using BaseClass::spirit_print_iter_;
+
+    using BaseClass::spirit_perform_linear_;
+    using BaseClass::spirit_perform_nonlinear_;
+    using BaseClass::spirit_parallel_imaging_lamda_;
+    using BaseClass::spirit_image_reg_lamda_;
+    using BaseClass::spirit_data_fidelity_lamda_;
+    using BaseClass::spirit_ncg_iter_max_;
+    using BaseClass::spirit_ncg_iter_thres_;
+    using BaseClass::spirit_ncg_scale_factor_;
+    using BaseClass::spirit_ncg_print_iter_;
+    using BaseClass::spirit_use_coil_sen_map_;
+    using BaseClass::spirit_use_moco_enhancement_;
+    using BaseClass::spirit_recon_moco_images_;
+    using BaseClass::spirit_RO_enhancement_ratio_;
+    using BaseClass::spirit_E1_enhancement_ratio_;
+    using BaseClass::spirit_E2_enhancement_ratio_;
+    using BaseClass::spirit_temporal_enhancement_ratio_;
+
+    using BaseClass::job_split_by_S_;
+    using BaseClass::job_num_of_N_;
+    using BaseClass::job_max_Megabytes_;
+    using BaseClass::job_overlap_;
+
+    using BaseClass::partialFourier_algo_;
+    using BaseClass::partialFourier_homodyne_iters_;
+    using BaseClass::partialFourier_homodyne_thres_;
+    using BaseClass::partialFourier_homodyne_densityComp_;
+    using BaseClass::partialFourier_POCS_iters_;
+    using BaseClass::partialFourier_POCS_thres_;
+    using BaseClass::partialFourier_POCS_transitBand_;
+    using BaseClass::partialFourier_FengHuang_kSize_RO_;
+    using BaseClass::partialFourier_FengHuang_kSize_E1_;
+    using BaseClass::partialFourier_FengHuang_kSize_E2_;
+    using BaseClass::partialFourier_FengHuang_thresReg_;
+    using BaseClass::partialFourier_FengHuang_sameKernel_allN_;
+    using BaseClass::partialFourier_FengHuang_transitBand_;
+    using BaseClass::partialFourier_FengHuang_transitBand_E2_;
+
+    using BaseClass::CloudComputing_;
+    using BaseClass::CloudSize_;
+    using BaseClass::gt_cloud_;
+
+    using BaseClass::kernel_; // [RO E1 E2 srcCHA dstCHA dstRO dstE1 dstE2]
+    using BaseClass::kernelIm_; // [RO E1 E2 srcCHA dstCHA]
+    using BaseClass::unmixingCoeffIm_; // [RO E1 E2 srcCHA 1 or N]
+    using BaseClass::coilCompressionCoef_;
+
+    // parameters to change the default behavior
+
+    // if true, the actual full kspace is computed, not only the coil combined complex images
+    bool recon_kspace_needed_;
+
+    // if true, no coil compression will be performed (NOTE(review): wording looks inverted for a flag named coil_compression_ - confirm)
+    bool coil_compression_;
+    // if true, the same coil compression coefficient is computed for all N
+    bool same_coil_compression_coeff_allN_;
+
+    // no acceleration
+    // if true, the average of all M ref will be used
+    // the coil sensitivity will be only estimated once for all N
+    bool no_acceleration_averageall_ref_;
+    // if true, the same coil combination coefficients will be used for all N
+    bool no_acceleration_same_combinationcoeff_allN_;
+    // if no_acceleration_same_combinationcoeff_allN_==true, select the N for coil combination coefficient estimation
+    size_t no_acceleration_whichN_combinationcoeff_;
+
+    // embedded mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will be only estimated once for all N
+    bool embedded_averageall_ref_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool embedded_fullres_coilmap_;
+    // if true, the same coil combination coefficients will be used for all N
+    bool embedded_same_combinationcoeff_allN_;
+    // if embedded_same_combinationcoeff_allN_==true, select the N for coil combination coefficient estimation
+    // if -1, the average-all N is used for coil combination
+    int embedded_whichN_combinationcoeff_;
+    // if true, the ref lines will be filled back to fullkspace
+    bool embedded_ref_fillback_;
+
+    // separate mode
+    // if true, the average of all M ref will be used
+    // the kernel/sensitivity will be only estimated once for all N
+    bool separate_averageall_ref_;
+    // if true, the coil map will be estimated from the fullkspace_
+    bool separate_fullres_coilmap_;
+    // if true, the same coil combination coefficients will be used for all N
+    bool separate_same_combinationcoeff_allN_;
+    // if separate_same_combinationcoeff_allN_==true, select the 3D kspace used for coil combination coefficient estimation
+    // if -1, the average-all N is used for coil combination
+    int separate_whichN_combinationcoeff_;
+
+    // interleaved mode: no additional parameters are declared in this class
+};
+
+template <typename T> 
+gtPlusReconWorkOrder3DT<T>::gtPlusReconWorkOrder3DT() : BaseClass(), no_acceleration_whichN_combinationcoeff_(0)
+{
+    recon_kspace_needed_ = false;
+    coil_compression_ = true;
+    same_coil_compression_coeff_allN_ = false;
+    // every member gets a defined value here: duplicate() and printInfo() read all of them
+    no_acceleration_averageall_ref_ = false;
+    no_acceleration_same_combinationcoeff_allN_ = false;
+
+    embedded_averageall_ref_ = false;
+    embedded_fullres_coilmap_ = true;
+    embedded_same_combinationcoeff_allN_ = false;
+    embedded_whichN_combinationcoeff_ = 0;
+    embedded_ref_fillback_ = true;
+
+    separate_averageall_ref_ = false;
+    separate_fullres_coilmap_ = true;
+    separate_same_combinationcoeff_allN_ = false;
+    separate_whichN_combinationcoeff_ = 0;
+}
+
+template <typename T> 
+gtPlusReconWorkOrder3DT<T>::~gtPlusReconWorkOrder3DT()
+{ // empty body: no explicit cleanup is performed here
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder3DT<T>::reset()
+{ // clears the computed calibration and recon results; returns false if clearing throws
+    try
+    {
+        if ( kernel_ ) kernel_->clear(); // pointer-held results may be unset: skip them instead of dereferencing null
+        if ( kernelIm_ ) kernelIm_->clear();
+        if ( unmixingCoeffIm_ ) unmixingCoeffIm_->clear();
+        if ( coilCompressionCoef_ ) coilCompressionCoef_->clear();
+        if ( coilMap_ ) coilMap_->clear();
+
+        fullkspace_.clear();
+        complexIm_.clear();
+        gfactor_.clear();
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorkOrder3DT<T>::reset() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorkOrder3DT<T>::enforceConsistency(ISMRMRDDIM& lastDim)
+{
+    // only lastDim == DIM_Slice changes anything; the call reports success in every case
+    if ( !(lastDim == DIM_Slice) ) return true;
+
+    // switch off the average-all-ref option of every calibration mode
+    no_acceleration_averageall_ref_ = false;
+    embedded_averageall_ref_ = false;
+    separate_averageall_ref_ = false;
+
+    // switch off the shared coil combination coefficients of every calibration mode
+    no_acceleration_same_combinationcoeff_allN_ = false;
+    embedded_same_combinationcoeff_allN_ = false;
+    separate_same_combinationcoeff_allN_ = false;
+    return true;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder3DT<T>::duplicate(gtPlusReconWorkOrder3DT<T>& worder)
+{
+    BaseClass::duplicate(worder);
+    // general switches
+    worder.recon_kspace_needed_ = this->recon_kspace_needed_;
+    worder.coil_compression_ = this->coil_compression_;
+    worder.same_coil_compression_coeff_allN_ = this->same_coil_compression_coeff_allN_;
+    // no acceleration
+    worder.no_acceleration_averageall_ref_ = this->no_acceleration_averageall_ref_;
+    worder.no_acceleration_same_combinationcoeff_allN_ = this->no_acceleration_same_combinationcoeff_allN_;
+    worder.no_acceleration_whichN_combinationcoeff_ = this->no_acceleration_whichN_combinationcoeff_;
+    // embedded mode
+    worder.embedded_averageall_ref_ = this->embedded_averageall_ref_;
+    worder.embedded_fullres_coilmap_ = this->embedded_fullres_coilmap_;
+    worder.embedded_same_combinationcoeff_allN_ = this->embedded_same_combinationcoeff_allN_;
+    worder.embedded_whichN_combinationcoeff_ = this->embedded_whichN_combinationcoeff_;
+    worder.embedded_ref_fillback_ = this->embedded_ref_fillback_;
+    // separate mode
+    worder.separate_averageall_ref_ = this->separate_averageall_ref_;
+    worder.separate_fullres_coilmap_ = this->separate_fullres_coilmap_;
+    worder.separate_same_combinationcoeff_allN_ = this->separate_same_combinationcoeff_allN_;
+    worder.separate_whichN_combinationcoeff_ = this->separate_whichN_combinationcoeff_;
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder3DT<T>::printInfo(std::ostream& os) const
+{
+    using namespace std; // NOTE(review): GADGET_OSTREAM_PRINT may rely on this if it uses unqualified stream names - confirm before removing
+    BaseClass::printInfo(os); // base-class parameters first
+    // then the parameters declared in this class, grouped as general / no acceleration / embedded / separate
+    GADGET_OSTREAM_PRINT(os, recon_kspace_needed_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, coil_compression_);
+    GADGET_OSTREAM_PRINT(os, same_coil_compression_coeff_allN_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, no_acceleration_averageall_ref_);
+    GADGET_OSTREAM_PRINT(os, no_acceleration_same_combinationcoeff_allN_);
+    GADGET_OSTREAM_PRINT(os, no_acceleration_whichN_combinationcoeff_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, embedded_averageall_ref_);
+    GADGET_OSTREAM_PRINT(os, embedded_fullres_coilmap_);
+    GADGET_OSTREAM_PRINT(os, embedded_same_combinationcoeff_allN_);
+    GADGET_OSTREAM_PRINT(os, embedded_whichN_combinationcoeff_);
+    GADGET_OSTREAM_PRINT(os, embedded_ref_fillback_);
+    os << std::endl;
+    GADGET_OSTREAM_PRINT(os, separate_averageall_ref_);
+    GADGET_OSTREAM_PRINT(os, separate_fullres_coilmap_);
+    GADGET_OSTREAM_PRINT(os, separate_same_combinationcoeff_allN_);
+    GADGET_OSTREAM_PRINT(os, separate_whichN_combinationcoeff_);
+}
+
+template <typename T> 
+void gtPlusReconWorkOrder3DT<T>::print(std::ostream& os) const
+{
+    // header line, the parameters written by printInfo(), then a closing rule
+    os << "-------------- gtPlusReconWorkOrder3DT ---------------" << std::endl;
+    this->printInfo(os);
+    os << "------------------------------------------------------" << std::endl;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker.h
new file mode 100644
index 0000000..cffd489
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker.h
@@ -0,0 +1,614 @@
+/** \file   gtPlusISMRMRDReconWorker.h
+    \brief  Define the base class for the GtPlus worker for reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+
+#include <string>
+#include "util/gtPlusIOAnalyze.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorkOrder.h"
+#include "gtPlusMemoryManager.h"
+#include "hoNDArrayMemoryManaged.h"
+#include "SerializableObject.h"
+#include "gtPlusCloudScheduler.h"
+
+#ifdef USE_OMP
+    #include "omp.h"
+#endif // USE_OMP
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+struct gtPlusReconJob2DT : public SerializableObject
+{
+    gtPlusReconWorkOrder<T> workOrder2DT; // settings for this job; its coilMap_ carries the job's coil map
+    hoNDArray<T> kspace; // input: filled by createAReconJob() with [RO E1 srcCHA jobN 1]
+    hoNDArray<T> ker; // input: kernel part assigned to this job by createAReconJob()
+    // hoNDArray<T> coilMap;  (not used: serialize()/deserialize() go through workOrder2DT.coilMap_)
+
+    hoNDArray<T> complexIm; // result: added into workOrder2DT->complexIm_ by combineReconJob()
+    hoNDArray<T> res; // result: added into workOrder2DT->fullkspace_ by combineReconJob()
+
+    size_t job_index_startN_; // first N index covered by the job
+    size_t job_index_endN_; // last N index covered by the job (inclusive)
+    size_t job_index_S_; // S index the job belongs to
+
+    gtPlusReconJob2DT();
+    gtPlusReconJob2DT(const gtPlusReconJob2DT& job);
+
+    ~gtPlusReconJob2DT();
+
+    virtual bool serialize(char*& buf, size_t& len) const; // packs the job into a newly allocated buffer
+    virtual bool deserialize(char* buf, size_t& len); // rebuilds the job from a buffer; len receives the bytes read
+};
+
+template <typename T> 
+gtPlusReconJob2DT<T>::gtPlusReconJob2DT() : job_index_startN_(0), job_index_endN_(0), job_index_S_(0)
+{
+    // indices start at zero so that copying or serializing a fresh job never reads indeterminate values
+}
+
+template <typename T> 
+gtPlusReconJob2DT<T>::~gtPlusReconJob2DT()
+{
+    // empty body: no explicit cleanup is performed here
+}
+
+template <typename T> 
+gtPlusReconJob2DT<T>::gtPlusReconJob2DT(const gtPlusReconJob2DT& job)
+{
+    job.workOrder2DT.duplicate(workOrder2DT); // copy the work order settings of job into this job
+    workOrder2DT.coilMap_ = job.workOrder2DT.coilMap_; // set after duplicate(); plain handle assignment, no element-wise copy here
+    kspace = job.kspace;
+    ker = job.ker;
+    // coilMap = job.coilMap;
+    complexIm = job.complexIm;
+    res = job.res;
+    job_index_startN_ = job.job_index_startN_;
+    job_index_endN_ = job.job_index_endN_;
+    job_index_S_ = job.job_index_S_;
+}
+
+template <typename T> 
+bool gtPlusReconJob2DT<T>::serialize(char*& buf, size_t& len) const
+{
+    char *bufKSpace(NULL), *bufKernel(NULL), *bufCoilMap(NULL), *bufComplexIm(NULL), *bufRes(NULL);
+    try
+    {
+        if ( buf != NULL ) { delete[] buf; buf = NULL; } // a failure below must not hand a freed pointer back to the caller
+        // on success buf is a new[] block of len bytes; it is not released in this function
+        // fixed-size parameter block taken from the work order
+        gtPlusReconWorkOrderPara para;
+        para = this->workOrder2DT;
+
+        // buffer for kspace, kernel and coil map
+        size_t lenKSpace(0), lenKernel(0), lenCoilMap(0), lenComplexIm(0), lenRes(0);
+
+        GADGET_CHECK_THROW(kspace.serialize(bufKSpace, lenKSpace));
+        GADGET_CHECK_THROW(ker.serialize(bufKernel, lenKernel));
+
+        if ( workOrder2DT.coilMap_ )
+        {
+            GADGET_CHECK_THROW(workOrder2DT.coilMap_->serialize(bufCoilMap, lenCoilMap));
+        }
+        else
+        {
+            hoNDArray<T> coilMapDummy;
+            GADGET_CHECK_THROW(coilMapDummy.serialize(bufCoilMap, lenCoilMap));
+        }
+        GADGET_CHECK_THROW(complexIm.serialize(bufComplexIm, lenComplexIm));
+        GADGET_CHECK_THROW(res.serialize(bufRes, lenRes));
+
+        // total length
+        len = sizeof(gtPlusReconWorkOrderPara) + sizeof(size_t)*3 + lenKSpace + lenKernel + lenCoilMap + lenComplexIm + lenRes;
+
+        buf = new char[len];
+        GADGET_CHECK_RETURN_FALSE( buf != NULL );
+        // layout: para | startN | endN | S | kspace | ker | coilMap | complexIm | res
+        size_t offset = 0, currLen=0;
+
+        currLen = sizeof(gtPlusReconWorkOrderPara);
+        memcpy(buf+offset, &para, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(buf+offset, &job_index_startN_, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(buf+offset, &job_index_endN_, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(buf+offset, &job_index_S_, currLen);
+        offset += currLen;
+
+        currLen = lenKSpace;
+        memcpy(buf+offset, bufKSpace, currLen);
+        offset += currLen;
+        delete [] bufKSpace;
+
+        currLen = lenKernel;
+        memcpy(buf+offset, bufKernel, currLen);
+        offset += currLen;
+        delete [] bufKernel;
+
+        currLen = lenCoilMap;
+        memcpy(buf+offset, bufCoilMap, currLen);
+        offset += currLen;
+        delete [] bufCoilMap;
+
+        currLen = lenComplexIm;
+        memcpy(buf+offset, bufComplexIm, currLen);
+        offset += currLen;
+        delete [] bufComplexIm;
+
+        currLen = lenRes;
+        memcpy(buf+offset, bufRes, currLen);
+        offset += currLen;
+        delete [] bufRes;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors happened in gtPlusReconJob2DT<T>::serialize(...) ... ");
+
+        if ( bufKSpace != NULL ) delete [] bufKSpace;
+        if ( bufKernel != NULL ) delete [] bufKernel;
+        if ( bufCoilMap != NULL ) delete [] bufCoilMap;
+        if ( bufComplexIm != NULL ) delete [] bufComplexIm;
+        if ( bufRes != NULL ) delete [] bufRes;
+
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconJob2DT<T>::deserialize(char* buf, size_t& len)
+{
+    try
+    {
+        gtPlusReconWorkOrderPara para;
+        memcpy(&para, buf, sizeof(gtPlusReconWorkOrderPara));
+        // buf must follow the layout written by serialize(): para | startN | endN | S | kspace | ker | coilMap | complexIm | res
+        workOrder2DT.copyFromPara(para);
+
+        size_t offset(sizeof(gtPlusReconWorkOrderPara)), currLen=0;
+
+        currLen = sizeof(size_t);
+        memcpy(&job_index_startN_, buf+offset, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(&job_index_endN_, buf+offset, currLen);
+        offset += currLen;
+
+        currLen = sizeof(size_t);
+        memcpy(&job_index_S_, buf+offset, currLen);
+        offset += currLen;
+
+        // kspace, kernel and coil map; each deserialize() call is expected to return the bytes it read in currLen
+        GADGET_CHECK_RETURN_FALSE(kspace.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(ker.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        hoNDArray<T> coilMapDummy;
+        GADGET_CHECK_RETURN_FALSE(coilMapDummy.deserialize(buf+offset, currLen));
+        offset += currLen;
+        // a non-empty coil map is stored in workOrder2DT.coilMap_ (allocated if needed); an empty one clears an existing map
+        if ( coilMapDummy.get_number_of_elements() > 0 )
+        {
+            if ( workOrder2DT.coilMap_ )
+            {
+                *workOrder2DT.coilMap_ = coilMapDummy;
+            }
+            else
+            {
+                workOrder2DT.coilMap_ = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>(coilMapDummy) );
+            }
+        }
+        else
+        {
+            if ( workOrder2DT.coilMap_ ) workOrder2DT.coilMap_->clear();
+        }
+
+        GADGET_CHECK_RETURN_FALSE(complexIm.deserialize(buf+offset, currLen));
+        offset += currLen;
+
+        GADGET_CHECK_RETURN_FALSE(res.deserialize(buf+offset, currLen));
+        offset += currLen;
+        // NOTE(review): len is only written, never read, so nothing checks the reads against the real buffer size
+        // total length = number of bytes consumed from buf
+        len = offset;
+    }
+    catch (...)
+    {
+        GADGET_ERROR_MSG("Errors happended in gtPlusReconJob2DT<T>::deserialize(...) ...");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+class gtPlusReconWorker
+{
+public:
+    // abstract base of the recon workers: performRecon() and estimateJobSize() are pure virtual
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusReconWorker() : performTiming_(false)
+    {
+        gt_timer1_.set_timing_in_destruction(false);
+        gt_timer2_.set_timing_in_destruction(false);
+        gt_timer3_.set_timing_in_destruction(false);
+    }
+
+    virtual ~gtPlusReconWorker() {}
+    // run the reconstruction described by workOrder; must be implemented by derived workers
+    virtual bool performRecon(gtPlusReconWorkOrder<T>* workOrder) = 0;
+    // default implementation does nothing and reports success
+    virtual bool performPartialFourierHandling(gtPlusReconWorkOrder<T>* /*workOrder*/) { return true; }
+    // default implementation only rejects a NULL work order
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+    {
+        if ( workOrder == NULL ) return false;
+        return true;
+    }
+
+    // clock for timing
+    Gadgetron::GadgetronTimer gt_timer1_;
+    Gadgetron::GadgetronTimer gt_timer2_;
+    Gadgetron::GadgetronTimer gt_timer3_;
+
+    bool performTiming_;
+
+    // exporter
+    Gadgetron::gtPlus::gtPlusIOAnalyze gt_exporter_;
+
+    // debug folder
+    std::string debugFolder_;
+
+    // util
+    gtPlusISMRMRDReconUtil<T> gtPlus_util_;
+
+    // memory manager
+    boost::shared_ptr<gtPlusMemoryManager> gtPlus_mem_manager_;
+
+    // ----------------------------------------------------
+    // recon job splitter and combiner
+    // ----------------------------------------------------
+    // 2DT array, [RO E1 CHA N S]
+    // if splitByS is true, split jobs by each S
+    // if jobN > 0, every jobN 2D kspaces are assigned into one job
+    // if splitByS=false and jobN<=0, the jobMegaBytes is used to define the maximal size of every job 
+    // overlapN: the overlap along N dimension
+    virtual bool splitReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+                        bool splitByS, size_t jobN, size_t jobMegaBytes, size_t overlapN, 
+                        std::vector<gtPlusReconJob2DT<T> >& jobList);
+    // add the job results into complexIm_/fullkspace_ ([.. N S]) and average the frames written by more than one job
+    virtual bool combineReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, std::vector<gtPlusReconJob2DT<T> >& jobList, size_t N, size_t S);
+    // fill job with the kspace/kernel/coil map part for N in [startN, endN] (inclusive) of set indS
+    virtual bool createAReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+                            size_t startN, size_t endN, size_t indS, gtPlusReconJob2DT<T>& job);
+
+    // from the node computing power indexes, get the effective node number for job splitting
+    virtual bool computeEffectiveNodeNumberBasedOnComputingPowerIndex(gtPlusReconWorkOrder<T>* workOrder, size_t& numOfEffectiveNodes);
+
+    // estimate the job size, given the maximal memory usage for every job
+    virtual bool estimateJobSize(gtPlusReconWorkOrder<T>* workOrder, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize) = 0;
+
+    // given the number of nodes in a cloud and corresponding computing power indexes, spread the jobs on the nodes
+    virtual bool scheduleJobForNodes(gtPlusReconWorkOrder<T>* workOrder2DT, size_t numOfJobs, std::vector<int>& nodeIdForJob);
+};
+
+template <typename T> 
+bool gtPlusReconWorker<T>::createAReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+        size_t startN, size_t endN, size_t indS, gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t N = kspace.get_size(3);
+        // the S dimension of kspace is addressed through indS only
+
+        size_t kerRO = ker.get_size(0);
+        size_t kerE1 = ker.get_size(1);
+        size_t srcCHA = ker.get_size(2);
+        size_t dstCHA = ker.get_size(3);
+        size_t refN = ker.get_size(4);
+
+        size_t jobN = endN-startN+1;
+        // [startN, endN] is inclusive; the job gets its own copy of that kspace range of set indS
+        job.kspace.create(RO, E1, srcCHA, jobN, 1);
+        memcpy(job.kspace.begin(), kspace.begin()+indS*RO*E1*srcCHA*N+startN*RO*E1*srcCHA, job.kspace.get_number_of_bytes());
+        // fewer kernels than N: copy all refN kernels of this S; otherwise create the array on the job's own N range of ker
+        if ( refN < N )
+        {
+            job.ker.create(kerRO, kerE1, srcCHA, dstCHA, refN, 1);
+            memcpy(job.ker.begin(), ker.begin()+indS*kerRO*kerE1*srcCHA*dstCHA*refN, job.ker.get_number_of_bytes());
+        }
+        else
+        {
+            job.ker.create(kerRO, kerE1, srcCHA, dstCHA, jobN, 1, ker.begin()+indS*kerRO*kerE1*srcCHA*dstCHA*refN+startN*kerRO*kerE1*srcCHA*dstCHA);
+        }
+        // the coil map handle may be unset: test it before dereferencing
+        if ( workOrder2DT->coilMap_ && workOrder2DT->coilMap_->get_number_of_elements() > 0 )
+        {
+            if ( refN < N )
+            {
+                job.workOrder2DT.coilMap_ = boost::shared_ptr<hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+indS*RO*E1*dstCHA*refN));
+            }
+            else
+            {
+                job.workOrder2DT.coilMap_ = boost::shared_ptr<hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, jobN, workOrder2DT->coilMap_->begin()+indS*RO*E1*dstCHA*refN+startN*RO*E1*dstCHA));
+            }
+        }
+
+        job.job_index_startN_ = startN;
+        job.job_index_endN_ = endN;
+        job.job_index_S_ = indS;
+        workOrder2DT->duplicate(job.workOrder2DT);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker<T>::createAReconJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::splitReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, 
+        bool splitByS, size_t jobN, size_t jobMegaBytes, size_t overlapN, 
+        std::vector<gtPlusReconJob2DT<T> >& jobList)
+{
+    try
+    {
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        size_t kerRO = ker.get_size(0);
+        size_t kerE1 = ker.get_size(1);
+        size_t srcCHA = ker.get_size(2);
+        size_t dstCHA = ker.get_size(3);
+
+        size_t s;
+        int startN, endN; // signed: the start of the last job is clamped at 0 after being moved back
+        // splitByS: exactly one job per S, each covering every N
+        if ( splitByS )
+        {
+            jobList.resize(S);
+            startN = 0;
+            endN = N-1;
+            for ( s=0; s<S; s++ )
+            {
+                GADGET_CHECK_RETURN_FALSE(createAReconJob(workOrder2DT, kspace, ker, startN, endN, s, jobList[s]));
+            }
+
+            return true;
+        }
+
+        if ( jobN > 0 )
+        {
+            if ( jobN < 2*overlapN ) jobN = 2*overlapN;
+        }
+        else if ( jobMegaBytes > 0 )
+        {
+            size_t kerMegaBytesPerN = kerRO*kerE1*srcCHA*dstCHA*sizeof(T)/1024/1024;
+            if ( kerMegaBytesPerN == 0 ) kerMegaBytesPerN = 1; // kernel under 1 MB per N: never divide by zero
+            jobN = jobMegaBytes/kerMegaBytesPerN;
+            if ( jobN < 2*overlapN ) jobN = 2*overlapN;
+            if ( jobN == 0 ) jobN = 1; // limit below one kernel and no overlap: smallest possible job
+        }
+        if ( jobN == 0 ) jobN = N; // no size given: one job per S (a zero jobN would never advance startN below)
+
+        jobList.clear();
+
+        // find number of jobs
+        size_t numPerN=0;
+        startN = 0;
+        while ( startN < N )
+        {
+            endN = startN+jobN+overlapN-1;
+            numPerN++;
+
+            if ( endN >= N )
+            {
+                endN = N-1;
+                break;
+            }
+
+            startN = endN-overlapN+1;
+        }
+
+        jobList.resize(S*numPerN);
+        // second pass: same stepping as above, now creating the jobs of every S
+        for ( s=0; s<S; s++ )
+        {
+            size_t num=0;
+            startN = 0;
+            while ( startN < N )
+            {
+                endN = startN+jobN+overlapN-1;
+                num++;
+
+                if ( endN >= N )
+                {
+                    endN = N-1;
+                    // a last job shorter than jobN gets its start moved back, but not below 0
+                    if ( endN-startN+1 < jobN )
+                    {
+                        startN = endN-jobN+1;
+                        if ( startN < 0 ) startN = 0;
+                    }
+
+                    GADGET_CHECK_RETURN_FALSE(createAReconJob(workOrder2DT, kspace, ker, startN, endN, s, jobList[s*numPerN+num-1]));
+                    break;
+                }
+
+                GADGET_CHECK_RETURN_FALSE(createAReconJob(workOrder2DT, kspace, ker, startN, endN, s, jobList[s*numPerN+num-1]));
+
+                startN = endN-overlapN+1;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker<T>::splitReconJob(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::
+combineReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, std::vector<gtPlusReconJob2DT<T> >& jobList, size_t N, size_t S)
+{
+    try
+    {
+        // the first job supplies the array sizes, so an empty list cannot be combined
+        GADGET_CHECK_RETURN_FALSE( !jobList.empty() );
+
+        size_t RO = jobList[0].kspace.get_size(0);
+        size_t E1 = jobList[0].kspace.get_size(1);
+        size_t dstCHA = jobList[0].ker.get_size(3);
+
+        workOrder2DT->complexIm_.create(RO, E1, N, S);
+        Gadgetron::clear(workOrder2DT->complexIm_);
+
+        workOrder2DT->fullkspace_.create(RO, E1, dstCHA, N, S);
+        Gadgetron::clear(workOrder2DT->fullkspace_);
+
+        size_t ii, n, s;
+
+        size_t numOfJobs = jobList.size();
+        // fillingTimes(n, s): how many jobs covered frame n of set s
+        ho2DArray<T> fillingTimes(N, S);
+        Gadgetron::clear(fillingTimes);
+        // add every job's results at its [startN, endN] position of set indS
+        for ( ii=0; ii<numOfJobs; ii++ )
+        {
+            size_t startN = jobList[ii].job_index_startN_;
+            size_t endN = jobList[ii].job_index_endN_;
+            size_t indS = jobList[ii].job_index_S_;
+
+            if ( jobList[ii].complexIm.get_number_of_elements() > 0 )
+            {
+                hoNDArray<T> complexIm(RO, E1, endN-startN+1, workOrder2DT->complexIm_.begin()+indS*RO*E1*N+startN*RO*E1);
+                Gadgetron::add(jobList[ii].complexIm, complexIm, complexIm);
+            }
+
+            if ( jobList[ii].res.get_number_of_elements() > 0 )
+            {
+                hoNDArray<T> fullkspace(RO, E1, dstCHA, endN-startN+1, workOrder2DT->fullkspace_.begin()+indS*RO*E1*dstCHA*N+startN*RO*E1*dstCHA);
+                Gadgetron::add(jobList[ii].res, fullkspace, fullkspace);
+            }
+
+            for ( n=startN; n<=endN; n++ )
+            {
+                fillingTimes(n, indS) = fillingTimes(n, indS) + T(1.0);
+            }
+        }
+        // frames covered by more than one job hold a sum: scale them back to the average
+        for ( s=0; s<S; s++ )
+        {
+            for ( n=0; n<N; n++ )
+            {
+                if ( fillingTimes(n, s).real() > 1 )
+                {
+                    hoNDArray<T> complexIm(RO, E1, workOrder2DT->complexIm_.begin()+s*RO*E1*N+n*RO*E1);
+                    Gadgetron::scal(1.0/fillingTimes(n, s).real(), complexIm);
+
+                    hoNDArray<T> fullkspace(RO, E1, dstCHA, workOrder2DT->fullkspace_.begin()+s*RO*E1*dstCHA*N+n*RO*E1*dstCHA);
+                    Gadgetron::scal(1.0/fillingTimes(n, s).real(), fullkspace);
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker<T>::combineReconJob(gtPlusReconWorkOrder<T>* workOrder2DT, std::vector<gtPlusReconJob2DT<T> >& jobList, size_t N, size_t S) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::
+computeEffectiveNodeNumberBasedOnComputingPowerIndex(gtPlusReconWorkOrder<T>* workOrder, size_t& numOfEffectiveNodes)
+{
+    try
+    {
+        size_t numOfNodes = workOrder->gt_cloud_.size();
+        numOfEffectiveNodes = 0;
+
+        if ( numOfNodes == 0 )
+        {
+            GADGET_WARN_MSG("numOfNodes == 0");
+            return true;
+        }
+        // sum of all power indexes, and the smallest one as the unit
+        double minPowerIndex = workOrder->gt_cloud_[0].get<3>();
+        double totalPowerIndex = minPowerIndex;
+
+        size_t ii;
+        for ( ii=1; ii<numOfNodes; ii++ )
+        {
+            totalPowerIndex += workOrder->gt_cloud_[ii].get<3>();
+            if ( workOrder->gt_cloud_[ii].get<3>() < minPowerIndex ) minPowerIndex = workOrder->gt_cloud_[ii].get<3>();
+        }
+        // a smallest index that is not positive cannot serve as divisor: count every node once instead
+        numOfEffectiveNodes = ( minPowerIndex > 0 ) ? (size_t)(std::floor(totalPowerIndex/minPowerIndex)) : numOfNodes;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker<T>::computeEffectiveNodeNumberBasedOnComputingPowerIndex(gtPlusReconWorkOrder<T>* workOrder, unsigned int& numOfEffectiveNodes) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker<T>::
+scheduleJobForNodes(gtPlusReconWorkOrder<T>* workOrder, size_t numOfJobs, std::vector<int>& nodeIdForJob)
+{
+    try
+    {
+        const size_t nodeCount = workOrder->gt_cloud_.size();
+
+        gtPlusCloudScheduler scheduler;
+        scheduler.setNumOfJobs(numOfJobs);
+        // one power index per cloud node, in node order
+        std::vector<double> nodePower(nodeCount);
+        for ( size_t k=0; k<nodeCount; ++k )
+        {
+            nodePower[k] = workOrder->gt_cloud_[k].get<3>();
+        }
+
+        scheduler.setUpNodes(nodePower);
+        // the scheduler fills nodeIdForJob; a failed scheduling makes this call return false
+        GADGET_CHECK_RETURN_FALSE(scheduler.schedulerJobs(nodeIdForJob));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker<T>::scheduleJobForNodes(gtPlusReconWorkOrder<T>* workOrder2DT, size_t numOfJobs, std::vector<int>& nodeIdForJob) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h
new file mode 100644
index 0000000..5a80d0d
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DT.h
@@ -0,0 +1,2532 @@
+/** \file   gtPlusISMRMRDReconWorker2DT.h
+    \brief  Define the base class for the GtPlus worker for 2DT reconstruction cases
+
+            Five different strategies were implemented for partial fourier or asymmetric echo acquisition, including:
+
+            ISMRMRD_PF_ZEROFILLING          : only zero filling the unacquired k-space
+
+            ISMRMRD_PF_ZEROFILLING_FILTER   : zero filling the unacquired k-space and apply a transition filter on the edges between
+                                              acquired and unacquired regions
+
+            ISMRMRD_PF_HOMODYNE             : perform the iterative homodyne filter
+                                              Handbook of MRI Pulse Sequences. Page 556.
+                                              Matt A. Bernstein, Kevin F. King, Xiaohong Joe Zhou. 
+                                              Academic Press, ISBN-10: 0120928612.
+
+            ISMRMRD_PF_POCS                 : perform the iterative POCS reconstruction
+                                              Magnetic Resonance Imaging: Physical Principles and Sequence Design. Page 296-297.
+                                              E. Mark Haacke, Robert W. Brown, Michael R. Thompson, Ramesh Venkatesan. 
+                                              Wiley-Liss, ISBN-10: 0471351288.
+
+            ISMRMRD_PF_FENGHUANG            : perform a k-space convolution based partial fourier reconstruction. 
+                                              This is our recommendation for 2D, 2DT cases.
+
+                                              Feng Huang, Wei Lin, and Yu Li. 
+                                              Partial Fourier Reconstruction Through Data Fitting and Convolution in k-Space.
+                                              Magnetic Resonance in Medicine, Vol 62, page 1261-1269, 2009.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker.h"
+
+namespace Gadgetron { namespace gtPlus {
+/** \brief  GtPlus worker for 2DT reconstruction, derived from gtPlusReconWorker<T>.
+
+            The arrays handled by this worker are described in the comments below as 5D,
+            [RO E1 CHA N S]. The generic performRecon(gtPlusReconWorkOrder<T>*) entry point is
+            defined inline; the remaining member functions are only declared in the class body.
+*/
+template <typename T> 
+class gtPlusReconWorker2DT : public gtPlusReconWorker<T>
+{
+public:
+
+    typedef gtPlusReconWorker<T> BaseClass;
+    typedef typename realType<T>::Type value_type;
+    // the sampled E1 region starts as [0 1024]; prepRef() overwrites it with the detected region
+    gtPlusReconWorker2DT() : BaseClass(), startE1_(0), endE1_(1024) {}
+    virtual ~gtPlusReconWorker2DT() {}
+    // generic entry point: handles all-zero input, then forwards to the 2DT overload
+    virtual bool performRecon(gtPlusReconWorkOrder<T>* workOrder)
+    {
+        // check whether we have all-zeros input
+        value_type v(1);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::norm2(workOrder->data_, v));
+        if ( v <= 0 )
+        {
+            GADGET_WARN_MSG("gtPlusReconWorker2DT, performRecon(workOrder) : incoming data contains all-zeros ... ");
+            // output a cleared complexIm_ with the dimensions of data_, except that dimension 2 is num_channels_res_
+            boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+            (*dims)[2] = workOrder->num_channels_res_;
+            workOrder->complexIm_.create(dims);
+            Gadgetron::clear(workOrder->complexIm_);
+
+            return true;
+        }
+        // the work order must be a gtPlusReconWorkOrder2DT<T>, otherwise the recon fails
+        gtPlusReconWorkOrder2DT<T>* workOrder2DT = dynamic_cast<gtPlusReconWorkOrder2DT<T>*>(workOrder);
+        if ( workOrder2DT == NULL ) return false;
+        // optionally let autoReconParameter() adjust the work order, then print it to std::cout
+        if ( workOrder2DT->recon_auto_parameters_ )
+        {
+            this->autoReconParameter(workOrder2DT);
+            GADGET_MSG("Gt Plus 2DT -- automatic paramter selection ---");
+            workOrder2DT->print(std::cout);
+        }
+
+        return this->performRecon(workOrder2DT);
+    }
+
+    // the common functionalities are performed here for 2DT recon
+    // compute the coil compression coefficients
+    // prepare the ref data array
+    virtual bool performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    virtual bool estimateCoilMap(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalib(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS);
+
+    virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    // the partial fourier handling for the 2DT reconstruction
+    // the computation is performed on the reconstructed full kspace
+    virtual bool performPartialFourierHandling(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    // perform the kspace filter on ref data for coil map estimation
+    // (the filters are taken from the work order; the RO/E1 range arguments are not used by the implementation in this file)
+    virtual bool performRefFilter(gtPlusReconWorkOrder2DT<T>* workOrder2DT, 
+                                        const hoNDArray<T>& ref, hoNDArray<T>& refFiltered, 
+                                        int startRO, int endRO, int startE1, int endE1);
+
+    // for interleave, compute mean ref
+    // for embedded and separate, squeeze out the zero lines
+    // refRecon receives the reference used for calibration, refCoilMap the filtered reference for coil map estimation
+    virtual bool prepRef(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, 
+                    hoNDArray<T>& refRecon, hoNDArray<T>& refCoilMap, 
+                    int startRO, int endRO, int startE1, int endE1, size_t dataE1);
+
+    // implement reference data preparation
+    // depending on averageAllRef and numOfModes, ref is copied, averaged, KL filtered along N, or KL filtered and averaged
+    virtual bool prepRefByAveragingCrossN(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon);
+
+    // compute coil compression coefficients
+    virtual bool coilCompression(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    // after unwrapping, for embedded and separate, the full res coil map may be estimated
+    // for embedded, the ref may be filled back to fullkspace
+    virtual bool afterUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+
+    // pick the frame with highest signal from the 2DT buffer
+    // data: [RO E1 CHA N S], res: [RO E1 CHA 1 S]
+    bool pickHighestSignalForN(const hoNDArray<T>& data, hoNDArray<T>& res);
+
+    // ----------------------------------------------------
+    // common functions for 2DT reconstruction
+    // ----------------------------------------------------
+    // image domain kernel with coil sensitivity
+    // kerIm: [RO E1 srcCHA dstCHA]
+    // coilMap: [RO E1 dstCHA]
+    // unmixCoeff: [RO E1 srcCHA]
+    // gFactor: [RO E1]
+    bool unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor);
+
+    // apply image domain kernel
+    // kspace: [RO E1 srcCHA ...]
+    // complexIm : [RO E1 dstCHA ...]
+    bool applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 srcCHA ...]
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // for speed, a buffer can be provided
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm);
+
+    // apply unmixCoeff
+    // kspace: [RO E1 srcCHA ...]
+    // unmixCoeff : [RO E1 srcCHA]
+    // complexIm : [RO E1 ...]
+    bool applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 srcCHA ...]
+    bool applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+
+    // ----------------------------------------------------
+    // Partial fourier handling for 2DT reconstruction
+    // ----------------------------------------------------
+    // apply the partial fourier filter along the edges
+    bool performPartialFourierFilter(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+    // apply the iterative homodyne filter for partial fourier reconstruction
+    bool performPartialFourierHomodyneRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+    // apply the iterative POCS for partial fourier reconstruction
+    bool performPartialFourierPOCSRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+    // apply the Feng Huang partial fourier reconstruction
+    bool performPartialFourierFengHuangRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace);
+
+    // compute Feng Huang kernel and perform recon
+    bool calibFengHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel);
+    bool performReconFangHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, int startRO, int endRO, int startE1, int endE1, ho6DArray<T>& kernel);
+
+    // estimate the job size, given the maximal memory usage for every job
+    virtual bool estimateJobSize(gtPlusReconWorkOrder<T>* workOrder, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize);
+    // bring members of the dependent base class into scope so they can be named without this->
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // helper memory for computation
+    hoNDArray<T> buffer2DT_;
+    hoNDArray<T> buffer2DT_unwrapping_;
+    hoNDArray<T> buffer2DT_partial_fourier_;
+    hoNDArray<T> buffer2DT_partial_fourier_kspaceIter_;
+    hoNDArray<T> ref_src_;
+    hoNDArray<T> ref_dst_;
+    hoNDArray<T> data_dst_;
+    hoNDArray<T> ref_coil_map_dst_;
+
+    // sampled region along E1
+    size_t startE1_;
+    size_t endE1_;
+};
+/** \brief  Apply the reference k-space filter(s) stored in the work order to ref.
+
+            refFiltered is first set to a copy of ref. Then, with RO = ref.get_size(0) and E1 = ref.get_size(1):
+            - if filterROE1_ref_ has size RO x E1, it is applied with kspacefilterROE1()
+            - else if filterRO_ref_ has RO elements and filterE1_ref_ has E1 elements, both are applied with kspacefilterROE1()
+            - else if only filterRO_ref_ matches, kspacefilterRO() is applied
+            - else if only filterE1_ref_ matches, kspacefilterE1() is applied
+            - if no filter matches, refFiltered stays the unfiltered copy of ref
+
+    \param  workOrder2DT  work order holding filterROE1_ref_, filterRO_ref_ and filterE1_ref_
+    \param  ref           input reference data
+    \param  refFiltered   output, filtered reference data
+    \param  startRO, endRO, startE1, endE1  not used in this function body
+    \return false if a filter call is rejected by GADGET_CHECK_RETURN_FALSE or an exception is caught; true otherwise
+*/
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performRefFilter(gtPlusReconWorkOrder2DT<T>* workOrder2DT, 
+                                        const hoNDArray<T>& ref, hoNDArray<T>& refFiltered, 
+                                        int startRO, int endRO, int startE1, int endE1)
+{
+    try
+    {
+        refFiltered = ref;
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        // a filter is only applied when its size matches the reference data
+        if ( workOrder2DT->filterROE1_ref_.get_size(0)==RO && workOrder2DT->filterROE1_ref_.get_size(1)==E1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(ref, workOrder2DT->filterROE1_ref_, refFiltered));
+        }
+        else if ( (workOrder2DT->filterRO_ref_.get_number_of_elements()==RO) && (workOrder2DT->filterE1_ref_.get_number_of_elements()==E1) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(ref, workOrder2DT->filterRO_ref_, workOrder2DT->filterE1_ref_, refFiltered));
+        }
+        else
+        {
+            if ( (workOrder2DT->filterRO_ref_.get_number_of_elements()==RO) && (workOrder2DT->filterE1_ref_.get_number_of_elements()!=E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(ref, workOrder2DT->filterRO_ref_, refFiltered));
+            }
+            // the two conditions are mutually exclusive, so at most one 1D filter is applied
+            if ( (workOrder2DT->filterRO_ref_.get_number_of_elements()!=RO) && (workOrder2DT->filterE1_ref_.get_number_of_elements()==E1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(ref, workOrder2DT->filterE1_ref_, refFiltered));
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performRefFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/** \brief  Prepare the reference data by optional KL filtering along N and optional averaging.
+
+            ref is read as [RO E1 CHA N S]. "Valid" below means 1 <= numOfModes <= N-1:
+            - averageAllRef false, numOfModes not valid : refRecon is a copy of ref
+            - averageAllRef true,  numOfModes not valid : refRecon = averageKSpace4D(ref)
+            - averageAllRef true,  numOfModes valid     : for every S, computeKLFilter() is applied to the
+              (RO*E1*CHA) x N matrix of that S; the filtered data is then averaged with averageKSpace4D()
+            - averageAllRef false, numOfModes valid     : for every S, computeKLFilter() writes directly into refRecon
+
+            The workOrder2DT argument is not used in this function body.
+
+            NOTE(review): N is a size_t, so N-1 wraps around when N == 0 -- confirm callers never pass an empty N.
+
+    \param  ref            input reference data
+    \param  averageAllRef  whether averageKSpace4D() is applied
+    \param  numOfModes     number of modes passed to computeKLFilter()
+    \param  refRecon       output reference data
+    \return false if a utility call is rejected by GADGET_CHECK_RETURN_FALSE or an exception is caught; true otherwise
+*/
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::prepRefByAveragingCrossN(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon)
+{
+    try
+    {
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t CHA = ref.get_size(2);
+        size_t N = ref.get_size(3);
+        size_t S = ref.get_size(4);
+        // filled by averageKSpace4D(); not read afterwards in this function
+        std::vector<size_t> sampledTimes;
+
+        if ( !averageAllRef && ( (numOfModes<1) || (numOfModes>N-1) ) )
+        {
+            refRecon = ref;
+        }
+        else if ( averageAllRef && ( (numOfModes<1) || (numOfModes>N-1) ) )
+        {
+            //GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(ref, refRecon));
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(ref, refRecon, sampledTimes));
+        }
+        else if ( averageAllRef && (numOfModes>=1) && (numOfModes<=N-1) )
+        {
+            hoNDArray<T> refKLF(RO, E1, CHA, N, S);
+
+            size_t s;
+            for ( s=0; s<S; s++ )
+            {
+                hoMatrix<T> A(RO*E1*CHA, N, const_cast<T*>(ref.begin()+s*RO*E1*CHA*N));
+                hoMatrix<T> A_KLF(RO*E1*CHA, N, refKLF.begin()+s*RO*E1*CHA*N);
+                // A and A_KLF are built on the memory of S index s of ref and refKLF
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModes, A_KLF));
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refKLF, "refKLF");
+
+            //GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(refKLF, refRecon));
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(refKLF, refRecon, sampledTimes));
+        }
+        else if ( !averageAllRef && (numOfModes>=1) && (numOfModes<=N-1) )
+        {
+            refRecon.create(RO, E1, CHA, N, S);
+
+            size_t s;
+            for ( s=0; s<S; s++ )
+            {
+                hoMatrix<T> A(RO*E1*CHA, N, const_cast<T*>(ref.begin()+s*RO*E1*CHA*N));
+                hoMatrix<T> A_KLF(RO*E1*CHA, N, refRecon.begin()+s*RO*E1*CHA*N);
+                // the filtered result is written straight into refRecon
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModes, A_KLF));
+            }
+        }
+        else
+        {
+            refRecon = ref; // fallback; the four conditions above appear to cover every case already
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::prepRefByAveragingCrossN(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/** \brief  Prepare the reference data for calibration (refRecon) and for coil map estimation (refCoilMap).
+
+            ref is read as [RO E1 srcCHA N S]. The processing depends on the work order:
+            - acceFactorE1_ == 1 : if no_acceleration_averageall_ref_ is set, ref is processed with
+              prepRefByAveragingCrossN(); refRecon is then filtered into refCoilMap
+            - CalibMode_ == ISMRMRD_interleaved : prepRefByAveragingCrossN() with averaging enabled, filter into
+              refCoilMap, then crop refRecon to [startRO endRO] and/or [startE1 endE1] when those ranges are valid
+            - CalibMode_ == ISMRMRD_embedded, ISMRMRD_separate or ISMRMRD_external : detect the sampled E1 region
+              (stored in startE1_/endE1_), crop the reference to it, build the filtered refCoilMap, optionally crop
+              refRecon along RO, and, if ref has fewer S than data_, repeat the last S of both outputs up to the
+              S of data_
+            - any other CalibMode_ : an error is logged and false is returned
+
+    \param  workOrder2DT  work order; data_ is read for its sizes and is written by copyAlongE1() in the embedded SPIRIT/L1SPIRIT case
+    \param  ref           input reference data
+    \param  refRecon      output, reference data for calibration
+    \param  refCoilMap    output, filtered reference data for coil map estimation
+    \param  startRO, endRO, startE1, endE1  ranges used for cropping; also forwarded to performRefFilter()
+    \param  dataE1        E1 size passed to zeropad2D() for refCoilMap in the separate/external case
+    \return false if a step is rejected by GADGET_CHECK_RETURN_FALSE, CalibMode_ is unsupported or an exception is caught; true otherwise
+*/
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::prepRef(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref, 
+                    hoNDArray<T>& refRecon, hoNDArray<T>& refCoilMap, 
+                    int startRO, int endRO, int startE1, int endE1, size_t dataE1)
+{
+    try
+    {
+        size_t dataRO = workOrder2DT->data_.get_size(0);
+        size_t dataS = workOrder2DT->data_.get_size(4);
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t srcCHA = ref.get_size(2);
+        size_t N = ref.get_size(3);
+        size_t S = ref.get_size(4);
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref, "ref");
+
+        if ( workOrder2DT->acceFactorE1_ == 1 )
+        {
+            if ( workOrder2DT->no_acceleration_averageall_ref_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, ref, workOrder2DT->no_acceleration_averageall_ref_, workOrder2DT->no_acceleration_ref_numOfModes_, refRecon));
+            }
+            // NOTE(review): refRecon is only assigned above when no_acceleration_averageall_ref_ is set; otherwise the caller-provided content is filtered -- confirm this is intended
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1));
+        }
+        else if ( workOrder2DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, ref, true, workOrder2DT->interleaved_ref_numOfModes_, refRecon));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon_interleaved");
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1));
+            // crop refRecon (not refCoilMap) if a valid RO and/or E1 range was given
+            if ( (startRO>=0 && endRO>0 && endRO>startRO) || (startE1>=0 && endE1>0 && endE1>startE1) )
+            {
+                std::vector<size_t> crop_offset(5), crop_size(5);
+
+                crop_offset[0] = 0;
+                crop_offset[1] = 0;
+                crop_offset[2] = 0;
+                crop_offset[3] = 0;
+                crop_offset[4] = 0;
+
+                crop_size[0] = RO;
+                crop_size[1] = E1;
+                crop_size[2] = refRecon.get_size(2);
+                crop_size[3] = refRecon.get_size(3);
+                crop_size[4] = refRecon.get_size(4);
+
+                if (startRO>=0 && endRO>0 && endRO>startRO)
+                {
+                    crop_offset[0] = startRO;
+                    crop_size[0] = endRO-startRO+1;
+                }
+
+                if (startE1>=0 && endE1>0 && endE1>startE1)
+                {
+                    crop_offset[1] = startE1;
+                    crop_size[1] = endE1-startE1+1;
+                }
+
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+        }
+        else if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded 
+                || workOrder2DT->CalibMode_ == ISMRMRD_separate 
+                || workOrder2DT->CalibMode_ == ISMRMRD_external )
+        {
+            if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                refRecon = ref;
+            }
+
+            if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, ref, workOrder2DT->separate_averageall_ref_, workOrder2DT->separate_ref_numOfModes_, refRecon));
+            }
+            // NOTE(review): for ISMRMRD_external refRecon is not assigned above and is used as passed in by the caller -- confirm
+            hoNDArray<typename realType<T>::Type> refMag(refRecon.get_dimensions()), refMagSum;
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(refRecon, refMag));
+            GADGET_CHECK_RETURN_FALSE(sumOverLastDimension(refMag, refMagSum));
+            GADGET_CHECK_RETURN_FALSE(sumOverLastDimension(refMagSum, refMag));
+            GADGET_CHECK_RETURN_FALSE(sumOverLastDimension(refMag, refMagSum));
+            // NOTE(review): the utility is instantiated with float although refMagSum holds realType<T>::Type -- confirm for double precision T
+            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<float>().detectSampledRegionE1(refMagSum, startE1_, endE1_));
+            // crop window: everything along RO, N and S, srcCHA channels, E1 limited to [startE1_ endE1_]
+            std::vector<size_t> crop_offset(5);
+            crop_offset[0] = 0;
+            crop_offset[1] = startE1_;
+            crop_offset[2] = 0;
+            crop_offset[3] = 0;
+            crop_offset[4] = 0;
+
+            std::vector<size_t> crop_size(5);
+            crop_size[0] = refRecon.get_size(0);
+            crop_size[1] = endE1_-startE1_+1;
+            crop_size[2] = srcCHA;
+            crop_size[3] = refRecon.get_size(3);
+            crop_size[4] = refRecon.get_size(4);
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon_beforeCrop");
+
+            if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, croppedRef, "refRecon_afterCrop");
+
+                if ( workOrder2DT->recon_algorithm_ == ISMRMRD_SPIRIT || workOrder2DT->recon_algorithm_ == ISMRMRD_L1SPIRIT )
+                {
+                    // copy the ref into the data
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongE1(refRecon, workOrder2DT->data_, startE1_, endE1_));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->data_, "data_copyAlongE1");
+                }
+
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder2DT, croppedRef, workOrder2DT->embedded_averageall_ref_, workOrder2DT->embedded_ref_numOfModes_, refRecon));
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon_afterCrop_prepCrossN");
+                // re-read the N size of refRecon, which prepRefByAveragingCrossN() has just rewritten
+                crop_size[3] = refRecon.get_size(3);
+
+                refCoilMap.create(RO, E1, srcCHA, refRecon.get_size(3), S);
+                GADGET_CHECK_RETURN_FALSE(setSubArrayUpTo10DArray(refRecon, refCoilMap, crop_offset, crop_size));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap");
+
+                hoNDArray<T> refCoilMapTmp(refCoilMap);
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, refCoilMapTmp, refCoilMap, startRO, endRO, startE1, endE1));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap_filtered");
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+                    }
+                }
+                // this crop is always executed; the window is only switched to the RO range when the check above passed
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+            else
+            {
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, croppedRef, "croppedRef");
+
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder2DT, croppedRef, refCoilMap, startRO, endRO, startE1, endE1));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "croppedRef_filtered");
+
+                refRecon = croppedRef;
+                // croppedRef is reused as the output buffer of zeropad2D(); refRecon already holds its own copy
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.zeropad2D(refCoilMap, dataRO, dataE1, croppedRef));
+                refCoilMap = croppedRef;
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap");
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+
+                        GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                        refRecon = croppedRef;
+                    }
+                }
+            }
+            // ref has fewer S than data_: extend both outputs to dataS by repeating their last S
+            if ( S < dataS )
+            {
+                hoNDArray<T> refReconDataS(refRecon.get_size(0), refRecon.get_size(1), refRecon.get_size(2), refRecon.get_size(3), dataS);
+                hoNDArray<T> refCoilMapDataS(refCoilMap.get_size(0), refCoilMap.get_size(1), refCoilMap.get_size(2), refCoilMap.get_size(3), dataS);
+
+                memcpy(refReconDataS.begin(), refRecon.begin(), refRecon.get_number_of_bytes());
+                memcpy(refCoilMapDataS.begin(), refCoilMap.begin(), refCoilMap.get_number_of_bytes());
+
+                size_t refReconN4D = refRecon.get_size(0)*refRecon.get_size(1)*refRecon.get_size(2)*refRecon.get_size(3);
+                size_t refCoilMapN4D = refCoilMap.get_size(0)*refCoilMap.get_size(1)*refCoilMap.get_size(2)*refCoilMap.get_size(3);
+
+                size_t s;
+                for ( s=S; s<dataS; s++ )
+                {
+                    memcpy(refReconDataS.begin()+s*refReconN4D, refRecon.begin()+(S-1)*refReconN4D, sizeof(T)*refReconN4D);
+                    memcpy(refCoilMapDataS.begin()+s*refCoilMapN4D, refCoilMap.begin()+(S-1)*refCoilMapN4D, sizeof(T)*refCoilMapN4D);
+                }
+
+                refRecon = refReconDataS;
+                refCoilMap = refCoilMapDataS;
+            }
+        }
+        else
+        {
+            GADGET_ERROR_MSG("CalibMode is not supported in gtPlusReconWorker2DT<T>::prepRef(...) : " << workOrder2DT->CalibMode_);
+            return false;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::prepRef(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+/** \brief  Compute the KL coil compression coefficients from workOrder2DT->ref_recon_.
+
+            Nothing is computed when acceFactorE1_ == 1, when coil_compression_ is off, or when
+            coilCompressionCoef_ already holds one entry per S of ref_recon_.
+
+            Otherwise:
+            - same_coil_compression_coeff_allS_ set : ref_recon_ is averaged over all N and S, one coefficient
+              matrix is computed and stored dataS times (dataS is the S size of data_)
+            - otherwise : one coefficient matrix is computed per S; the first S selects the number of kept modes
+              (from coil_compression_num_modesKept_ if > 0, else from coil_compression_thres_) and the following S
+              reuse that number; if ref_recon_ has fewer S than data_, the last matrix is repeated up to dataS
+
+    \param  workOrder2DT  work order; ref_recon_ and data_ are read, coilCompressionCoef_ is written
+    \return false if a utility call is rejected by GADGET_CHECK_RETURN_FALSE or an exception is caught; true otherwise
+*/
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::coilCompression(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    // the 2DT recon on 5D array [RO E1 CHA N S]
+    try
+    {
+        size_t RO = workOrder2DT->ref_recon_.get_size(0);
+        size_t E1 = workOrder2DT->ref_recon_.get_size(1);
+        size_t srcCHA = workOrder2DT->ref_recon_.get_size(2);
+        size_t N = workOrder2DT->ref_recon_.get_size(3);
+        size_t S = workOrder2DT->ref_recon_.get_size(4);
+
+        size_t dataS = workOrder2DT->data_.get_size(4);
+        // without acceleration along E1 no coil compression coefficients are computed
+        if ( workOrder2DT->acceFactorE1_ == 1 ) return true;
+
+        // compute coil compression coeff
+        if ( workOrder2DT->coil_compression_ )
+        {
+            // check whether coil compression coeff has been preset
+            if ( workOrder2DT->coilCompressionCoef_->size()!=S )
+            {
+                if ( workOrder2DT->same_coil_compression_coeff_allS_ )
+                {
+                    hoNDArray<T> aveAllS;
+
+                    std::vector<size_t> allSDim(4);
+                    allSDim[0] = RO;
+                    allSDim[1] = E1;
+                    allSDim[2] = srcCHA;
+                    allSDim[3] = N*S;
+                    // wrap the ref_recon_ buffer as a 4D array [RO E1 srcCHA N*S] (the last argument presumably means the memory is not owned)
+                    hoNDArray<T> dataAllS(&allSDim, workOrder2DT->ref_recon_.begin(), false);
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(dataAllS, aveAllS));
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aveAllS, "aveAllS");
+
+                    hoMatrix<T> coeff, eigenValues;
+                    if ( workOrder2DT->coil_compression_num_modesKept_ > 0 )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAllS, 
+                                    workOrder2DT->coil_compression_num_modesKept_, coeff, eigenValues));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAllS, 
+                                    workOrder2DT->coil_compression_thres_, coeff, eigenValues));
+                    }
+                    // the same coefficients are stored for every S of the data
+                    workOrder2DT->coilCompressionCoef_->resize(dataS);
+
+                    size_t s;
+                    for ( s=0; s<dataS; s++ )
+                    {
+                        (*workOrder2DT->coilCompressionCoef_)[s] = coeff;
+                    }
+
+                    GADGET_CHECK_PERFORM(!debugFolder_.empty(), eigenValues.print(std::cout));
+                    GADGET_MSG("Coil compression, number of channel kept is " << coeff.cols());
+                }
+                else
+                {
+                    std::vector<size_t> allSDim(4);
+                    allSDim[0] = RO;
+                    allSDim[1] = E1;
+                    allSDim[2] = srcCHA;
+                    allSDim[3] = N;
+                    // NOTE(review): coefficients are appended below with push_back; a non-empty coilCompressionCoef_ of another size is not cleared first -- confirm
+                    size_t num_modesKept = srcCHA;
+
+                    size_t s;
+                    for ( s=0; s<S; s++ )
+                    {
+                        hoNDArray<T> dataCurrS(&allSDim, workOrder2DT->ref_recon_.begin()+s*RO*E1*srcCHA*N, false);
+
+                        hoMatrix<T> coeff, eigenValues;
+                        // the first S decides how many modes are kept; the following S reuse that number
+                        if ( s == 0 )
+                        {
+                            if ( workOrder2DT->coil_compression_num_modesKept_ > 0 )
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                            workOrder2DT->coil_compression_num_modesKept_, coeff, eigenValues));
+                            }
+                            else
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                            workOrder2DT->coil_compression_thres_, coeff, eigenValues));
+                            }
+
+                            num_modesKept = coeff.get_size(1);
+                            workOrder2DT->coilCompressionCoef_->push_back(coeff);
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrS, 
+                                            (int)num_modesKept, coeff, eigenValues));
+
+                            workOrder2DT->coilCompressionCoef_->push_back(coeff);
+                        }
+
+                        GADGET_CHECK_PERFORM(!debugFolder_.empty(), eigenValues.print(std::cout));
+                        GADGET_MSG("Coil compression, number of channel kept is " << coeff.cols());
+                    }
+                    // fewer S in ref_recon_ than in data_: repeat the coefficients of the last S up to dataS
+                    if ( S < dataS )
+                    {
+                        std::vector<hoMatrix<T> > coilCompressionCoef(dataS);
+                        for ( s=0; s<S; s++ )
+                        {
+                            coilCompressionCoef[s] = (*workOrder2DT->coilCompressionCoef_)[s];
+                        }
+
+                        for ( s=S; s<dataS; s++ )
+                        {
+                            coilCompressionCoef[s] = (*workOrder2DT->coilCompressionCoef_)[S-1];
+                        }
+
+                        *(workOrder2DT->coilCompressionCoef_) = coilCompressionCoef;
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::coilCompression(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    // the 2DT recon on 5D array [RO E1 CHA N S]
+    try
+    {
+        if ( !workOrder2DT->workFlow_use_BufferedKernel_ )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("prepRef"));
+            GADGET_CHECK_RETURN_FALSE(prepRef(workOrder2DT, workOrder2DT->ref_, workOrder2DT->ref_recon_, workOrder2DT->ref_coil_map_, 
+                        workOrder2DT->start_RO_, workOrder2DT->end_RO_, workOrder2DT->start_E1_, workOrder2DT->end_E1_, workOrder2DT->data_.get_size(1)));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("coilCompression"));
+            GADGET_CHECK_RETURN_FALSE(coilCompression(workOrder2DT));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+
+         // apply coil compression coefficients
+        if ( workOrder2DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( workOrder2DT->coil_compression_ )
+            {
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->data_, "data_");
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->data_, *workOrder2DT->coilCompressionCoef_, data_dst_));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, data_dst_, "data_dst_");
+            }
+            else
+            {
+                data_dst_ = workOrder2DT->data_;
+            }
+        }
+        else
+        {
+            if ( workOrder2DT->coil_compression_ && workOrder2DT->acceFactorE1_>1 )
+            {
+                ref_src_ = workOrder2DT->ref_recon_;
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_src_, "ref_src_");
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(ref_src_, *workOrder2DT->coilCompressionCoef_, ref_dst_));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_dst_, "ref_dst_");
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->data_, "data_");
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->data_, *workOrder2DT->coilCompressionCoef_, data_dst_));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, data_dst_, "data_dst_");
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->ref_coil_map_, "ref_coil_map_");
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder2DT->ref_coil_map_, *workOrder2DT->coilCompressionCoef_, ref_coil_map_dst_));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_coil_map_dst_, "ref_coil_map_dst_");
+
+                if ( !workOrder2DT->downstream_coil_compression_ || workOrder2DT->recon_algorithm_==ISMRMRD_SPIRIT || workOrder2DT->recon_algorithm_==ISMRMRD_L1SPIRIT )
+                {
+                    ref_src_ = ref_dst_;
+                }
+            }
+            else
+            {
+                ref_src_ = workOrder2DT->ref_recon_;
+                ref_dst_ = workOrder2DT->ref_recon_;
+                data_dst_ = workOrder2DT->data_;
+                ref_coil_map_dst_ = workOrder2DT->ref_coil_map_;
+            }
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("estimateCoilMap"));
+            GADGET_CHECK_RETURN_FALSE(this->estimateCoilMap(workOrder2DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+            if ( workOrder2DT->acceFactorE1_>1 )
+            {
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performCalib"));
+                GADGET_CHECK_RETURN_FALSE(this->performCalib(workOrder2DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+            }
+        }
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performUnwrapping"));
+        GADGET_CHECK_RETURN_FALSE(this->performUnwrapping(workOrder2DT, data_dst_));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("afterUnwrapping"));
+        GADGET_CHECK_RETURN_FALSE(this->afterUnwrapping(workOrder2DT));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Estimate the coil sensitivity map and store it in workOrder2DT->coilMap_.
+///
+/// The map is computed only when workOrder2DT->coilMap_ does not already have the
+/// RO, E1 and S sizes of workOrder2DT->data_ (i.e. it has not been preset); otherwise
+/// it is left untouched. When the active calibration mode requests the same
+/// combination coefficients for all S, the map is estimated for a single S and then
+/// replicated with repmatLastDimension().
+///
+/// @param workOrder2DT      work order; data_ supplies RO/E1/S, coilMap_ receives the result
+/// @param ref_src           not read by this implementation
+/// @param ref_dst           only its size along dimension 3 (refN) is read
+/// @param ref_coil_map_dst  reference data that is inverse-FFT'd (ifft2c) and fed to coilMap2DNIH;
+///                          its size along dimension 2 is used as dstCHA
+/// @return true on success; false if any checked step fails or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+estimateCoilMap(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        (void)ref_src; // kept in the signature for callers; nothing is read from it here
+
+        const size_t RO = workOrder2DT->data_.get_size(0);
+        const size_t E1 = workOrder2DT->data_.get_size(1);
+        const size_t S = workOrder2DT->data_.get_size(4);
+
+        const size_t refN = ref_dst.get_size(3);
+        const size_t dstCHA = ref_coil_map_dst.get_size(2);
+
+        // pick the "same coefficients for all S" settings belonging to the calibration mode
+        bool same_combinationcoeff_allS = false;
+        size_t whichS_combinationcoeff = 0;
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allS = workOrder2DT->interleaved_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->interleaved_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allS = workOrder2DT->embedded_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->embedded_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allS = workOrder2DT->separate_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->separate_whichS_combinationcoeff_;
+        }
+
+        // clamp the requested S index to the last available one
+        if ( whichS_combinationcoeff >= S ) whichS_combinationcoeff=S-1;
+
+        // if the coil map has not been preset
+        if ( (workOrder2DT->coilMap_->get_size(0)!=RO) 
+            || (workOrder2DT->coilMap_->get_size(1)!=E1)
+            || (workOrder2DT->coilMap_->get_size(4)!=S) )
+        {
+            if ( same_combinationcoeff_allS )
+            {
+                size_t usedS = whichS_combinationcoeff;
+
+                // number of elements in one [RO E1 dstCHA refN] slab, i.e. the stride between consecutive S
+                const size_t slabLen = RO*E1*dstCHA*refN;
+
+                // array wrapped around the existing reference memory of the selected S
+                hoNDArray<T> refCoilMapS(RO, E1, dstCHA, refN, const_cast<T*>(ref_coil_map_dst.begin()+usedS*slabLen));
+
+                workOrder2DT->coilMap_->create(RO, E1, dstCHA, refN, S);
+
+                // array wrapped around the slab of coilMap_ that belongs to the selected S
+                hoNDArray<T> coilMapS(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+usedS*slabLen);
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(refCoilMapS, buffer2DT_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, 
+                        workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+
+                // copy the map estimated for usedS to every other S
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, usedS));
+            }
+            else
+            {
+                // estimate the map from the complete reference array in one call
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(ref_coil_map_dst, buffer2DT_));
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, 
+                        workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+            }
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder2DT->coilMap_, "coilMap_");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::estimateCoilMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Run the calibration for every (n, S) of the reference data.
+///
+/// Calibration is skipped when workOrder2DT->kernelIm_ already has the expected sizes
+/// (RO, E1, srcCHA, dstCHA along dimensions 0-3 and S along dimension 5). Otherwise
+/// performCalibPrep() is called once, followed by performCalibImpl() for each n in
+/// [0, refN) and either a single S (replicated afterwards with repmatLastDimension())
+/// or every S (optionally in parallel with OpenMP).
+///
+/// @param workOrder2DT      work order; data_ supplies RO/E1/S, kernel_/kernelIm_/unmixingCoeffIm_/gfactor_ are updated
+/// @param ref_src           source reference; its size along dimension 2 is srcCHA
+/// @param ref_dst           destination reference; its size along dimension 3 is refN
+/// @param ref_coil_map_dst  its size along dimension 2 is dstCHA
+/// @return true on success; false if any calibration step fails or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+performCalib(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        const size_t RO = workOrder2DT->data_.get_size(0);
+        const size_t E1 = workOrder2DT->data_.get_size(1);
+
+        // S and refN stay non-const: they are named in the shared() clauses below and
+        // const-qualified variables in shared() are rejected by some older compilers
+        size_t S = workOrder2DT->data_.get_size(4);
+        size_t refN = ref_dst.get_size(3);
+
+        const size_t srcCHA = ref_src.get_size(2);
+        const size_t dstCHA = ref_coil_map_dst.get_size(2);
+
+        // pick the "same coefficients for all S" settings belonging to the calibration mode
+        bool same_combinationcoeff_allS = false;
+        size_t whichS_combinationcoeff = 0;
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allS = workOrder2DT->interleaved_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->interleaved_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allS = workOrder2DT->embedded_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->embedded_whichS_combinationcoeff_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allS = workOrder2DT->separate_same_combinationcoeff_allS_;
+            whichS_combinationcoeff = workOrder2DT->separate_whichS_combinationcoeff_;
+        }
+
+        // clamp the requested S index to the last available one
+        if ( whichS_combinationcoeff >= S ) whichS_combinationcoeff=S-1;
+
+        // calibration
+        if ( (workOrder2DT->kernelIm_->get_size(0)!=RO) 
+                || (workOrder2DT->kernelIm_->get_size(1)!=E1)
+                || (workOrder2DT->kernelIm_->get_size(2)!=srcCHA)
+                || (workOrder2DT->kernelIm_->get_size(3)!=dstCHA)
+                || (workOrder2DT->kernelIm_->get_size(5)!=S) )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->performCalibPrep(ref_src, ref_dst, workOrder2DT));
+
+            // perform calibration
+            if ( same_combinationcoeff_allS )
+            {
+                size_t usedS = whichS_combinationcoeff;
+
+                for ( size_t n=0; n<refN; n++ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(this->performCalibImpl(ref_src, ref_dst, workOrder2DT, n, usedS));
+                }
+
+                // replicate the results computed for usedS to every other S
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->kernel_, usedS));
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->kernelIm_, usedS));
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->unmixingCoeffIm_, usedS));
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(workOrder2DT->gfactor_, usedS));
+            }
+            else
+            {
+                int usedS;
+
+                // A parallel region cannot be left with "return", so failed calibration
+                // calls are counted (summed over threads by the reduction) and reported
+                // once the loop has finished.
+                int numFailed = 0;
+
+                #ifdef USE_OMP
+                    omp_set_nested(1);
+                #endif // USE_OMP
+
+                #ifdef GCC_OLD_FLAG
+                    #pragma omp parallel for default(none) private(usedS) shared(S, refN, workOrder2DT) reduction(+:numFailed) if (S>1)
+                #else
+                    #pragma omp parallel for default(none) private(usedS) shared(S, refN, ref_src, ref_dst, workOrder2DT) reduction(+:numFailed) if (S>1)
+                #endif
+                for ( usedS=0; usedS<(int)S; usedS++ )
+                {
+                    for ( size_t n=0; n<refN; n++ )
+                    {
+                        if ( !this->performCalibImpl(ref_src, ref_dst, workOrder2DT, n, usedS) )
+                        {
+                            numFailed++;
+                        }
+                    }
+                }
+
+                #ifdef USE_OMP
+                    omp_set_nested(0);
+                #endif // USE_OMP
+
+                // checked only after nested parallelism has been switched off again
+                GADGET_CHECK_RETURN_FALSE(numFailed==0);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performCalib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Preparation step that performCalib() runs once before its performCalibImpl() calls.
+/// This implementation ignores all arguments and always reports success.
+/// NOTE(review): presumably meant to be replaced by concrete workers - confirm in the class declaration.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performCalibPrep(const hoNDArray<T>& /*ref_src*/,
+                                               const hoNDArray<T>& /*ref_dst*/,
+                                               gtPlusReconWorkOrder2DT<T>* /*workOrder2DT*/)
+{
+    // nothing to prepare here
+    return true;
+}
+
+/// Calibration step that performCalib() calls once per (n, usedS) pair.
+/// This implementation ignores all arguments and always reports success.
+/// NOTE(review): presumably meant to be replaced by concrete workers - confirm in the class declaration.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performCalibImpl(const hoNDArray<T>& /*ref_src*/,
+                                               const hoNDArray<T>& /*ref_dst*/,
+                                               gtPlusReconWorkOrder2DT<T>* /*workOrder2DT*/,
+                                               size_t /*n*/,
+                                               size_t /*usedS*/)
+{
+    // no calibration is carried out here
+    return true;
+}
+
+/// Unwrapping step that performRecon() calls with the work order and the data array.
+/// This implementation ignores both arguments and always reports success.
+/// NOTE(review): presumably meant to be replaced by concrete workers - confirm in the class declaration.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performUnwrapping(gtPlusReconWorkOrder2DT<T>* /*workOrder2DT*/,
+                                                const hoNDArray<T>& /*data_dst*/)
+{
+    // no unwrapping is carried out here
+    return true;
+}
+
+/// Compute unmixing coefficients and a g-factor map from an image-domain kernel and a coil map.
+///
+/// For each source channel, the [RO E1] planes multiplyConj(kerIm(:,:,src,dst), coilMap(:,:,dst))
+/// are accumulated over all destination channels into unmixCoeff(:,:,src). The g-factor is
+/// the square root of multiplyConj(unmixCoeff, unmixCoeff) summed over the last dimension.
+///
+/// @param kerIm       kernel, read as [RO E1 srcCHA dstCHA]
+/// @param coilMap     coil map; its first three sizes must equal RO, E1 and dstCHA
+/// @param unmixCoeff  output, (re)created as [RO E1 srcCHA]
+/// @param gFactor     output, (re)created as [RO E1]
+/// @return true on success; false if a size check or a checked step fails, or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t srcCHA = kerIm.get_size(2);
+        size_t dstCHA = kerIm.get_size(3);
+
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(2)==dstCHA);
+
+        // both outputs start from zero
+        unmixCoeff.create(RO, E1, srcCHA);
+        Gadgetron::clear(&unmixCoeff);
+
+        gFactor.create(RO, E1);
+        Gadgetron::clear(&gFactor);
+
+        // raw element pointers; the per-plane arrays below are created on top of them
+        T* pKerIm = const_cast<T*>(kerIm.begin());
+        T* pCoilMap = const_cast<T*>(coilMap.begin());
+        T* pCoeff = unmixCoeff.begin();
+
+        // dimensions of a single [RO E1] plane
+        std::vector<size_t> dim(2);
+        dim[0] = RO;
+        dim[1] = E1;
+
+        int src;
+
+        #pragma omp parallel default(none) private(src) shared(RO, E1, srcCHA, dstCHA, pKerIm, pCoilMap, pCoeff, dim)
+        {
+            // working arrays, one set per thread
+            hoNDArray<T> accumPlane;
+            hoNDArray<T> productPlane(&dim);
+            hoNDArray<T> mapPlane;
+            hoNDArray<T> kernelPlane;
+
+            // number of elements in one [RO E1] plane
+            const size_t planeLen = RO*E1;
+
+            #pragma omp for
+            for ( src=0; src<(int)srcCHA; src++ )
+            {
+                // output plane of this source channel
+                accumPlane.create(&dim, pCoeff+src*planeLen);
+
+                for ( size_t d=0; d<dstCHA; d++ )
+                {
+                    kernelPlane.create(&dim, pKerIm+(src+d*srcCHA)*planeLen);
+                    mapPlane.create(&dim, pCoilMap+d*planeLen);
+
+                    Gadgetron::multiplyConj(kernelPlane, mapPlane, productPlane);
+                    Gadgetron::add(accumPlane, productPlane, accumPlane);
+                }
+            }
+        }
+
+        // g-factor from the accumulated coefficients
+        hoNDArray<T> conjUnmixCoeff(unmixCoeff);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(unmixCoeff, conjUnmixCoeff, conjUnmixCoeff));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(conjUnmixCoeff, gFactor));
+        Gadgetron::sqrt_inplace(&gFactor);
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor) ... ");
+        return false;
+    }
+}
+
+/// Apply an image-domain kernel to k-space data.
+///
+/// The k-space array is transformed with ifft2c into buffer2DT_unwrapping_ and the
+/// result is passed to applyImageDomainKernelImage().
+///
+/// @param kspace     input; its first three sizes must equal those of kerIm (RO, E1, srcCHA)
+/// @param kerIm      image-domain kernel
+/// @param complexIm  output filled by applyImageDomainKernelImage()
+/// @return true on success; false if a size check or a step fails, or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        const size_t RO = kerIm.get_size(0);
+        const size_t E1 = kerIm.get_size(1);
+        const size_t srcCHA = kerIm.get_size(2);
+
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==srcCHA);
+
+        // NOTE(review): this copy presumably only sizes the buffer before ifft2c overwrites it - confirm before removing
+        buffer2DT_unwrapping_ = kspace;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, buffer2DT_unwrapping_));
+
+        GADGET_CHECK_RETURN_FALSE(applyImageDomainKernelImage(buffer2DT_unwrapping_, kerIm, complexIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Convenience overload: allocates a scratch array with the dimensions of kerIm and
+/// forwards to the four-argument applyImageDomainKernelImage(), returning its result.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    // scratch array handed to the four-argument overload as its kerImBuffer
+    hoNDArray<T> scratch(kerIm.get_dimensions());
+
+    const bool succeeded = applyImageDomainKernelImage(aliasedIm, kerIm, scratch, complexIm);
+    return succeeded;
+}
+
+/// Apply an image-domain kernel to aliased images.
+///
+/// aliasedIm is processed in chunks of [RO E1 srcCHA]; each chunk is combined with kerIm
+/// by multipleMultiply() and reduced with sumOverSecondLastDimension() into the matching
+/// [RO E1 dstCHA] chunk of complexIm. Up to 16 chunks are handled serially using
+/// kerImBuffer as scratch; more chunks are handled in an OpenMP region with one scratch
+/// array per thread.
+///
+/// @param aliasedIm    input images; the first three sizes must equal RO, E1, srcCHA of kerIm
+/// @param kerIm        kernel, read as [RO E1 srcCHA dstCHA]
+/// @param kerImBuffer  scratch array used by the serial path only
+/// @param complexIm    output; (re)created with the dimensions of aliasedIm except that dimension 2 is dstCHA
+/// @return true on success; false if a check or multipleMultiply() fails, or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        // left non-const: these are named in the shared() clause below and const-qualified
+        // variables in shared() are rejected by some older compilers
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t srcCHA = kerIm.get_size(2);
+        size_t dstCHA = kerIm.get_size(3);
+
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==srcCHA);
+
+        // the chunk size is used as a divisor below
+        GADGET_CHECK_RETURN_FALSE(RO*E1*srcCHA>0);
+
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+
+        std::vector<size_t> dimIm(*dim);
+        // make sure the channel dimension exists before it is overwritten
+        // (an input stored with fewer than three dimensions would otherwise be indexed out of range)
+        if ( dimIm.size() < 3 )
+        {
+            dimIm.resize(3, 1);
+        }
+        dimIm[2] = dstCHA;
+
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        // dimensions of one input chunk
+        std::vector<size_t> dim3D(3);
+        dim3D[0] = RO;
+        dim3D[1] = E1;
+        dim3D[2] = srcCHA;
+
+        // dimensions of one output chunk
+        std::vector<size_t> dimIm3D(3);
+        dimIm3D[0] = RO;
+        dimIm3D[1] = E1;
+        dimIm3D[2] = dstCHA;
+
+        // number of [RO E1 srcCHA] chunks in the input
+        size_t num = aliasedIm.get_number_of_elements()/ (RO*E1*srcCHA);
+
+        int n;
+
+        if ( num <= 16 )
+        {
+            for ( n=0; n<(int)num; n++ )
+            {
+                hoNDArray<T> buf3D(&dim3D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*srcCHA));
+                hoNDArray<T> bufIm3D(&dimIm3D, complexIm.begin()+n*RO*E1*dstCHA);
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(buf3D, kerIm, kerImBuffer));
+                // NOTE(review): the result of sumOverSecondLastDimension is not inspected - confirm its return type before adding a check
+                Gadgetron::sumOverSecondLastDimension(kerImBuffer, bufIm3D);
+            }
+        }
+        else
+        {
+            // A parallel region cannot be left with "return", so failed multiplications
+            // are counted (summed over threads by the reduction) and reported afterwards.
+            int numFailed = 0;
+
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(n) shared(num, dim3D, RO, E1, srcCHA, dimIm3D, dstCHA) reduction(+:numFailed)
+            #else            
+                #pragma omp parallel default(none) private(n) shared(kerIm, num, dim3D, aliasedIm, RO, E1, srcCHA, dimIm3D, dstCHA, complexIm) reduction(+:numFailed)
+            #endif
+            {
+                // working arrays, one set per thread
+                hoNDArray<T> buf3D;
+                hoNDArray<T> bufIm3D;
+                hoNDArray<T> buf4D(kerIm.get_dimensions());
+
+                #pragma omp for
+                for ( n=0; n<(int)num; n++ )
+                {
+                    buf3D.create(&dim3D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*srcCHA));
+                    bufIm3D.create(&dimIm3D, complexIm.begin()+n*RO*E1*dstCHA);
+
+                    if ( !Gadgetron::multipleMultiply(buf3D, kerIm, buf4D) )
+                    {
+                        numFailed++;
+                        continue;
+                    }
+
+                    Gadgetron::sumOverSecondLastDimension(buf4D, bufIm3D);
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(numFailed==0);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Apply unmixing coefficients to k-space data.
+///
+/// The k-space array is transformed with ifft2c into buffer2DT_unwrapping_, which is
+/// then handed to applyUnmixCoeffImage() together with unmixCoeff.
+///
+/// @param kspace      input; its first three sizes must equal those of unmixCoeff
+/// @param unmixCoeff  unmixing coefficients
+/// @param complexIm   output filled by applyUnmixCoeffImage()
+/// @return true on success; false if a size check or a step fails, or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        // sizes along dimensions 0, 1 and 2 have to agree
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==unmixCoeff.get_size(2));
+
+        // copy into the member buffer, then let ifft2c write its output there
+        buffer2DT_unwrapping_ = kspace;
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, buffer2DT_unwrapping_));
+
+        // combine the transformed data with the coefficients
+        GADGET_CHECK_RETURN_FALSE(applyUnmixCoeffImage(buffer2DT_unwrapping_, unmixCoeff, complexIm));
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+}
+
+/// Apply unmixing coefficients to aliased images.
+///
+/// aliasedIm is combined with unmixCoeff by multipleMultiply() into buffer2DT_unwrapping_
+/// and the result is reduced with sumOver3rdDimension() into complexIm.
+///
+/// @param aliasedIm   input images; the first three sizes must equal those of unmixCoeff
+/// @param unmixCoeff  unmixing coefficients
+/// @param complexIm   output; (re)created with the dimensions of aliasedIm except that dimension 2 is 1
+/// @return true on success; false if a size check or a step fails, or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==unmixCoeff.get_size(2));
+
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+
+        std::vector<size_t> dimIm(*dim);
+        // make sure the channel dimension exists before it is overwritten
+        // (an input stored with fewer than three dimensions would otherwise be indexed out of range)
+        if ( dimIm.size() < 3 )
+        {
+            dimIm.resize(3, 1);
+        }
+        dimIm[2] = 1;
+
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        // hoNDArray<T> tmp(aliasedIm);
+        // NOTE(review): applyUnmixCoeff() passes buffer2DT_unwrapping_ itself as aliasedIm, so this
+        // can be a self-assignment followed by an in-place multiply - confirm hoNDArray supports both
+        buffer2DT_unwrapping_ = aliasedIm;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(unmixCoeff, aliasedIm, buffer2DT_unwrapping_));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver3rdDimension(buffer2DT_unwrapping_, complexIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::afterUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        bool fullres_coilmap = false;
+        bool fullres_coilmap_useHighestSignal = false;
+        bool ref_fillback = false;
+        bool averageallN_coilmap = false;
+        int numOfModesKept = 0;
+        bool same_coilmap_allS = false;
+        size_t whichS_coilmap = 0;
+
+        size_t RO = workOrder2DT->kernelIm_->get_size(0);
+        size_t E1 = workOrder2DT->kernelIm_->get_size(1);
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder2DT->embedded_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+                fullres_coilmap_useHighestSignal = workOrder2DT->embedded_fullres_coilmap_useHighestSignal_;
+            }
+
+            if ( workOrder2DT->embedded_ref_fillback_ 
+                && (workOrder2DT->recon_algorithm_!=ISMRMRD_SPIRIT) 
+                && (workOrder2DT->recon_algorithm_!=ISMRMRD_L1SPIRIT) )
+            {
+                ref_fillback = true;
+            }
+
+            if ( workOrder2DT->embedded_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder2DT->embedded_same_combinationcoeff_allS_ )
+            {
+                same_coilmap_allS = true;
+                whichS_coilmap = workOrder2DT->embedded_whichS_combinationcoeff_;
+            }
+
+            numOfModesKept = workOrder2DT->embedded_ref_numOfModes_;
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            if ( workOrder2DT->separate_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+            }
+
+            if ( workOrder2DT->separate_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder2DT->separate_same_combinationcoeff_allS_ )
+            {
+                same_coilmap_allS = true;
+                whichS_coilmap = workOrder2DT->separate_whichS_combinationcoeff_;
+            }
+
+            numOfModesKept = workOrder2DT->separate_ref_numOfModes_;
+        }
+
+        if ( whichS_coilmap >= S ) whichS_coilmap = S-1;
+
+        if ( ref_fillback )
+        {
+            hoNDArray<T> ref_dst;
+            if ( workOrder2DT->coil_compression_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.applyKLCoilCompressionCoeff(workOrder2DT->ref_, *workOrder2DT->coilCompressionCoef_, ref_dst));
+            }
+            else
+            {
+                ref_dst = workOrder2DT->ref_;
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_dst, "ref_dst");
+
+            if ( (ref_dst.get_size(2)==dstCHA) && (ref_dst.get_size(3)==N) && (ref_dst.get_size(4)==S) )
+            {
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->fullkspace_, "fullkspace_");
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongE1(ref_dst, workOrder2DT->fullkspace_, startE1_, endE1_));
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->fullkspace_, "fullkspace_After");
+            }
+        }
+
+        // partial fourier handling
+        GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder2DT));
+
+        if ( fullres_coilmap )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : allocate buffer 2DT ...  "));
+            hoNDArrayMemoryManaged<T> buffer2DT_Two(workOrder2DT->fullkspace_.get_dimensions(), gtPlus_mem_manager_);
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->fullkspace_, buffer2DT_, buffer2DT_Two));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_, "ComplexIm_afterRefFill");
+
+            if ( averageallN_coilmap )
+            {
+                if ( workOrder2DT->workFlow_use_BufferedKernel_ && workOrder2DT->coilMap_->get_size(3)==1 && workOrder2DT->coilMap_->get_size(4)==S )
+                {
+                    size_t s;
+                    for ( s=0; s<S; s++ )
+                    {
+                        hoNDArray<T> coilMapS(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+s*RO*E1*dstCHA);
+                        hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+s*RO*E1*dstCHA*N);
+                        hoNDArray<T> complexImCombinedS(RO, E1, N, workOrder2DT->complexIm_.begin()+s*RO*E1*N);
+
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS));
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImCombinedS, "complexImCombinedS");
+                    }
+                }
+                else
+                {
+                    workOrder2DT->coilMap_->create(RO, E1, dstCHA, 1, S);
+
+                    size_t s;
+
+                    if ( same_coilmap_allS )
+                    {
+                        hoNDArray<T> aveComplexImS(RO, E1, dstCHA, 1);
+
+                        buffer2DT_unwrapping_.create(RO, E1, dstCHA, N);
+
+                        hoMatrix<T> A(RO*E1*dstCHA, N, buffer2DT_.begin()+whichS_coilmap*RO*E1*dstCHA*N);
+                        hoMatrix<T> A_KLF(RO*E1*dstCHA, N, buffer2DT_unwrapping_.begin());
+
+                        if ( numOfModesKept>0 && numOfModesKept<dstCHA )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModesKept, A_KLF));
+                        }
+                        else
+                        {
+                            memcpy(A_KLF.begin(), A.begin(), A_KLF.get_number_of_bytes());
+                        }
+
+                        if ( fullres_coilmap_useHighestSignal )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(pickHighestSignalForN(buffer2DT_unwrapping_, aveComplexImS));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(buffer2DT_unwrapping_, aveComplexImS));
+                        }
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aveComplexImS, "aveComplexImS");
+
+                        hoNDArray<T> coilMapS(RO, E1, dstCHA, 1, workOrder2DT->coilMap_->begin()+whichS_coilmap*RO*E1*dstCHA);
+
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("coilMap2DNIH ...  "));
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, coilMapS, "coilMapS");
+
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, whichS_coilmap));
+
+                        for ( s=0; s<S; s++ )
+                        {
+                            hoNDArray<T> coilMapS(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+s*RO*E1*dstCHA);
+                            hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+s*RO*E1*dstCHA*N);
+                            hoNDArray<T> complexImCombinedS(RO, E1, N, workOrder2DT->complexIm_.begin()+s*RO*E1*N);
+
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS));
+                            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImCombinedS, "complexImCombinedS");
+                        }
+                    }
+                    else
+                    {
+                        hoNDArray<T> aveComplexIm(RO, E1, dstCHA, 1, S);
+                        buffer2DT_unwrapping_ = buffer2DT_;
+
+                        if ( numOfModesKept>0 && numOfModesKept<dstCHA )
+                        {
+                            for ( s=0; s<S; s++ )
+                            {
+                                hoMatrix<T> A(RO*E1*dstCHA, N, buffer2DT_.begin()+s*RO*E1*dstCHA*N);
+                                hoMatrix<T> A_KLF(RO*E1*dstCHA, N, buffer2DT_unwrapping_.begin()+s*RO*E1*dstCHA*N);
+
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(A, numOfModesKept, A_KLF));
+                            }
+
+                            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_unwrapping_, "ComplexIm_KLF");
+                        }
+
+                        if ( fullres_coilmap_useHighestSignal )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(pickHighestSignalForN(buffer2DT_unwrapping_, aveComplexIm));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(buffer2DT_unwrapping_, aveComplexIm));
+                        }
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aveComplexIm, "aveComplexIm");
+
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("coilMap2DNIH ...  "));
+
+                        gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexIm, *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_);
+
+                        gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_);
+
+                        //long long ss;
+                        //#pragma omp parallel for private(s) if (S>2)
+                        //for ( ss=0; ss<S; ss++ )
+                        //{
+                        //    hoNDArray<T> aveComplexImS(RO, E1, dstCHA, aveComplexIm.begin()+ss*RO*E1*dstCHA);
+                        //    hoNDArray<T> coilMapS(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+ss*RO*E1*dstCHA);
+
+                        //    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_));
+                        //    gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(aveComplexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_);
+
+                        //    hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+ss*RO*E1*dstCHA*N);
+                        //    hoNDArray<T> complexImCombinedS(RO, E1, N, workOrder2DT->complexIm_.begin()+ss*RO*E1*N);
+
+                        //    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS));
+                        //    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImS, coilMapS, complexImCombinedS);
+                        //}
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder2DT->coilMap_, "coilMap_fullres");
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->complexIm_, "complexImCombined");
+                    }
+                }
+            }
+            else
+            {
+                if ( workOrder2DT->workFlow_use_BufferedKernel_ && workOrder2DT->coilMap_->get_size(3)==N && workOrder2DT->coilMap_->get_size(4)==S )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->complexIm_, "complexIm_");
+                }
+                else
+                {
+                    workOrder2DT->coilMap_->create(RO, E1, dstCHA, N, S);
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("coilMap2DNIH ...  "));
+                    if ( same_coilmap_allS )
+                    {
+                        hoNDArray<T> complexImS(RO, E1, dstCHA, N, buffer2DT_.begin()+whichS_coilmap*RO*E1*dstCHA*N);
+                        hoNDArray<T> coilMapS(RO, E1, dstCHA, N, workOrder2DT->coilMap_->begin()+whichS_coilmap*RO*E1*dstCHA*N);
+
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(complexImS, coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, whichS_coilmap));
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder2DT->coilMap_, "coilMap_fullres");
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder2DT->coilMap_, "coilMap_fullres");
+                    }
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine(buffer2DT_, *workOrder2DT->coilMap_, workOrder2DT->complexIm_));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->complexIm_, "complexIm_");
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::afterUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// For every S, pick the [RO E1 CHA] frame along N that has the largest norm2.
+//
+// data : input array, read as [RO E1 CHA N S]
+// res  : output array, created here as [RO E1 CHA 1 S]; for every S it receives a copy
+//        of the selected frame. On equal norms the frame with the lowest N index is kept.
+//
+// Returns true on success and false if an exception was caught.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::pickHighestSignalForN(const hoNDArray<T>& data, hoNDArray<T>& res)
+{
+    try
+    {
+        size_t RO = data.get_size(0);
+        size_t E1 = data.get_size(1);
+        size_t CHA = data.get_size(2);
+        size_t N = data.get_size(3);
+        size_t S = data.get_size(4);
+
+        // number of elements in one [RO E1 CHA] frame
+        size_t frameSize = RO*E1*CHA;
+
+        res.create(RO, E1, CHA, 1, S);
+
+        size_t s;
+        for ( s=0; s<S; s++ )
+        {
+            // first frame of this S in the input
+            const T* pDataS = data.begin() + s*frameSize*N;
+
+            // start with n=0 as the current best candidate
+            size_t maxInd = 0;
+            typename realType<T>::Type maxNorm(0);
+
+            hoNDArray<T> data3D(RO, E1, CHA, const_cast<T*>(pDataS));
+            Gadgetron::norm2(data3D, maxNorm);
+
+            size_t n;
+            for ( n=1; n<N; n++ )
+            {
+                data3D.create(RO, E1, CHA, const_cast<T*>(pDataS + n*frameSize));
+
+                typename realType<T>::Type currNorm(0);
+                Gadgetron::norm2(data3D, currNorm);
+
+                if ( maxNorm < currNorm )
+                {
+                    maxNorm = currNorm;
+                    maxInd = n;
+                }
+            }
+
+            // res holds a single frame per S, so its stride along S is frameSize and
+            // not frameSize*N; using the input stride here would write outside of res
+            // as soon as N>1 and S>1
+            memcpy(res.begin() + s*frameSize, pDataS + maxInd*frameSize, sizeof(T)*frameSize);
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::pickHighestSignalForN() ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Run the partial fourier handling selected by workOrder2DT->partialFourier_algo_.
+//
+// ISMRMRD_PF_ZEROFILLING returns right away. For the other algorithms the array
+// that gets processed is
+//   - workOrder2DT->data_       when acceFactorE1_ and acceFactorE2_ are both 1,
+//   - workOrder2DT->fullkspace_ when it holds at least one element,
+//   - otherwise the fft2c of workOrder2DT->complexIm_, which is transformed back
+//     into complexIm_ with ifft2c after the handling.
+//
+// Returns true on success and false if an exception was caught; the
+// GADGET_CHECK_RETURN_FALSE wrappers presumably return false when a step fails.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierHandling(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    try
+    {
+        // plain zero filling : nothing to be done
+        if ( workOrder2DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING )
+        {
+            return true;
+        }
+
+        const bool noAcceleration = ( workOrder2DT->acceFactorE1_==1 && workOrder2DT->acceFactorE2_==1 );
+
+        if ( noAcceleration )
+        {
+            // handle the incoming data directly
+            switch ( workOrder2DT->partialFourier_algo_ )
+            {
+                case ISMRMRD_PF_ZEROFILLING_FILTER:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder2DT, workOrder2DT->data_));
+                    break;
+                }
+                case ISMRMRD_PF_HOMODYNE:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierHomodyneRecon(*workOrder2DT, workOrder2DT->data_));
+                    break;
+                }
+                case ISMRMRD_PF_POCS:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder2DT, workOrder2DT->data_));
+                    break;
+                }
+                case ISMRMRD_PF_FENGHUANG:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder2DT, workOrder2DT->data_));
+                    break;
+                }
+                default:
+                {
+                    break;
+                }
+            }
+        }
+        else if ( workOrder2DT->fullkspace_.get_number_of_elements() > 0 )
+        {
+            // a full kspace is available : handle it in place
+            switch ( workOrder2DT->partialFourier_algo_ )
+            {
+                case ISMRMRD_PF_ZEROFILLING_FILTER:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder2DT, workOrder2DT->fullkspace_));
+                    break;
+                }
+                case ISMRMRD_PF_HOMODYNE:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierHomodyneRecon(*workOrder2DT, workOrder2DT->fullkspace_));
+                    break;
+                }
+                case ISMRMRD_PF_POCS:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder2DT, workOrder2DT->fullkspace_));
+                    break;
+                }
+                case ISMRMRD_PF_FENGHUANG:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder2DT, workOrder2DT->fullkspace_));
+                    break;
+                }
+                default:
+                {
+                    break;
+                }
+            }
+        }
+        else
+        {
+            // only the coil combined complex images are left : go to kspace, handle, come back
+            hoNDArray<T> kspace(workOrder2DT->complexIm_);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(workOrder2DT->complexIm_, kspace));
+
+            switch ( workOrder2DT->partialFourier_algo_ )
+            {
+                case ISMRMRD_PF_ZEROFILLING_FILTER:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder2DT, kspace));
+                    break;
+                }
+                case ISMRMRD_PF_HOMODYNE:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierHomodyneRecon(*workOrder2DT, kspace));
+                    break;
+                }
+                case ISMRMRD_PF_POCS:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder2DT, kspace));
+                    break;
+                }
+                case ISMRMRD_PF_FENGHUANG:
+                {
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder2DT, kspace));
+                    break;
+                }
+                default:
+                {
+                    break;
+                }
+            }
+
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, workOrder2DT->complexIm_));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performPartialFourierHandling(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Apply the partial fourier kspace filter(s) stored in the work order to kspace, in place.
+//
+// Nothing is done when neither RO nor E1 shows partial fourier sampling, i.e. when for
+// both directions the start/end index is negative or the range covers the whole dimension.
+// Otherwise the first matching filter set is used:
+//   1) filterROE1_partialfourier_ when its first two sizes equal RO and E1,
+//   2) filterRO_partialfourier_ together with filterE1_partialfourier_ when both lengths match,
+//   3) whichever single 1D filter matches; if none does, kspace is left unchanged.
+// The filtered result goes through buffer2DT_partial_fourier_ before being copied to kspace.
+//
+// Returns true on success and false if an exception was caught; the
+// GADGET_CHECK_RETURN_FALSE wrappers presumably return false when a filter call fails.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        const size_t RO = kspace.get_size(0);
+        const size_t E1 = kspace.get_size(1);
+
+        // a direction is fully sampled when its range is unset or spans the whole dimension
+        const bool fullySampledRO = ( workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 
+                                    || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) );
+        const bool fullySampledE1 = ( workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 
+                                    || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) );
+
+        if ( fullySampledRO && fullySampledE1 )
+        {
+            // partial fourier is not used
+            return true;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_PF_Filter");
+
+        const bool has2DFilter = ( workOrder2DT.filterROE1_partialfourier_.get_size(0)==RO 
+                                && workOrder2DT.filterROE1_partialfourier_.get_size(1)==E1 );
+        const bool hasFilterRO = ( workOrder2DT.filterRO_partialfourier_.get_number_of_elements() == RO );
+        const bool hasFilterE1 = ( workOrder2DT.filterE1_partialfourier_.get_number_of_elements() == E1 );
+
+        // combined 2D filter
+        if ( has2DFilter )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspace, workOrder2DT.filterROE1_partialfourier_, buffer2DT_partial_fourier_));
+            kspace = buffer2DT_partial_fourier_;
+            return true;
+        }
+
+        // separable RO and E1 filters
+        if ( hasFilterRO && hasFilterE1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspace, workOrder2DT.filterRO_partialfourier_, workOrder2DT.filterE1_partialfourier_, buffer2DT_partial_fourier_));
+            kspace = buffer2DT_partial_fourier_;
+            return true;
+        }
+
+        // from here on at most one of the two 1D filters can match
+        if ( hasFilterRO )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(kspace, workOrder2DT.filterRO_partialfourier_, buffer2DT_partial_fourier_));
+            kspace = buffer2DT_partial_fourier_;
+        }
+        else if ( hasFilterE1 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(kspace, workOrder2DT.filterE1_partialfourier_, buffer2DT_partial_fourier_));
+            kspace = buffer2DT_partial_fourier_;
+        }
+
+        // NOTE(review): this export is only reached on the single-filter / no-filter path
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_PF_Filter");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Iterative homodyne partial fourier reconstruction, applied in place to kspace.
+//
+// workOrder2DT : supplies the sampled range (start_RO_/end_RO_, start_E1_/end_E1_), the
+//                iteration limit (partialFourier_homodyne_iters_), the stopping threshold
+//                (partialFourier_homodyne_thres_) and the switch
+//                partialFourier_homodyne_densityComp_
+// kspace       : input kspace, its first two dimensions are RO and E1; overwritten with the result
+//
+// Outline:
+//   1) return early when neither RO nor E1 shows partial fourier sampling
+//   2) build symmetric Hanning reference filters along RO and E1
+//   3) iterate : filter kspaceIter, estimate the image phase, remove it from complexIm,
+//      transform back to kspace, stop once the relative image change is below the threshold
+//   4) combine the acquired kspace with the iterated one, either through a pair of
+//      complementary filters (density compensation) or through copyAlongROE1
+//
+// The members buffer2DT_ and buffer2DT_partial_fourier_ are overwritten as scratch space.
+// Returns true on success (including the early return in step 1) and false if an
+// exception was caught; GADGET_CHECK_RETURN_FALSE presumably returns false on a failed call.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierHomodyneRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        // a direction is treated as fully sampled if its range is unset (negative) or covers the whole dimension
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_homodyne");
+
+        // create kspace filter for homodyne phase estimation
+        ISMRMRDKSPACEFILTER filter_ref_type_ = ISMRMRD_FILTER_HANNING;
+        double filter_ref_sigma_ = 1.5;
+        double filter_ref_width_ = 0.15;
+
+        // RO filter : built over the whole dimension if the RO range is unset, over [start_RO_ end_RO_] otherwise
+        int startRO(0), endRO(RO-1);
+        hoNDArray<T> filterRO(RO);
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, 0, RO-1, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*RO)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*RO)));
+
+            startRO = workOrder2DT.start_RO_;
+            endRO = workOrder2DT.end_RO_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterRO, "filterRO_homodyne");
+
+        // E1 filter : same construction as for RO
+        int startE1(0), endE1(E1-1);
+        hoNDArray<T> filterE1(E1);
+        if ( (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, 0, E1-1, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E1)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E1)));
+
+            startE1 = workOrder2DT.start_E1_;
+            endE1 = workOrder2DT.end_E1_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterE1, "filterE1_homodyne");
+
+        // kspace updated by every iteration, starts as a copy of the input
+        hoNDArray<T> kspaceIter(kspace.get_dimensions());
+        kspaceIter = kspace;
+        // store the filtered kspace
+        buffer2DT_partial_fourier_ = kspace;
+        // store the phase images
+        buffer2DT_ = kspace;
+        // magnitude of complex images
+        hoNDArray<typename realType<T>::Type> mag(kspace.get_dimensions());
+        hoNDArray<T> magComplex(kspace.get_dimensions());
+
+        // complex images
+        hoNDArray<T> complexIm(kspace.get_dimensions());
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspace, complexIm));
+
+        // image of the previous iteration, used for the stopping criterion
+        hoNDArray<T> complexImPrev(complexIm);
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "homodyne_kspace_beforeIteration");
+
+        size_t ii;
+        for ( ii=0; ii<workOrder2DT.partialFourier_homodyne_iters_; ii++ )
+        {
+            // kspace filter before phase extraction
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspaceIter, filterRO, filterE1, buffer2DT_partial_fourier_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_partial_fourier_, "homodyne_kspaceIter_afterFiltered");
+
+            // go to image domain
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(buffer2DT_partial_fourier_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_partial_fourier_, "homodyne_complexIm");
+
+            // get the phase
+            // phase = image / magnitude; addEpsilon presumably guards against a division by zero
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(buffer2DT_partial_fourier_, mag));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::addEpsilon(mag));
+            GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::divide(buffer2DT_partial_fourier_, magComplex, buffer2DT_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_, "homodyne_phase");
+
+            // remove the phase from complex images
+            // i.e. multiply complexIm with the conjugated phase term
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::conjugate(buffer2DT_, buffer2DT_));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(complexIm, buffer2DT_, complexIm));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexIm, "homodyne_complexIm_removePhase");
+
+            // go back to kspace
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(complexIm, kspaceIter));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "homodyne_complexIm_removePhase_kspace");
+
+            // compute threshold to stop the iteration
+            // buffer2DT_ is reused here to hold the difference to the previous image
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(complexImPrev, complexIm, buffer2DT_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_, "homodyne_diff_complexIm");
+
+            typename realType<T>::Type diff, prev;
+            Gadgetron::norm2(complexImPrev, prev);
+            Gadgetron::norm2(buffer2DT_, diff);
+
+            // relative change; NOTE(review): prev is not checked against zero before the division
+            typename realType<T>::Type thres = diff/prev;
+
+            // the iteration message is only printed when a debug folder is set
+            if ( !debugFolder_.empty() )
+            {
+                GADGET_MSG("Homodyne iter : " << ii << " - thres : " << thres << " ... ");
+            }
+
+            if ( thres < workOrder2DT.partialFourier_homodyne_thres_ )
+            {
+                break;
+            }
+
+            complexImPrev = complexIm;
+        }
+
+        // restore the acquired region
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "kspaceIter_after_homodyne_beforeCopy");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_homodyne_beforeCopy");
+
+        if ( workOrder2DT.partialFourier_homodyne_densityComp_ )
+        {
+            // density compensation : blend acquired and iterated kspace with complementary filters
+            // transition width is 10% of the dimension
+            size_t width_RO = std::floor(0.1*RO);
+            size_t width_E1 = std::floor(0.1*E1);
+
+            // compute PF filter for RO and E1
+            // no tapering (ISMRMRD_FILTER_NONE) for a direction that is unset or fully sampled,
+            // tapered Hanning otherwise
+            hoNDArray<T> filterPF_RO, filterPF_E1;
+
+            if ( workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.start_RO_==0 && workOrder2DT.end_RO_==RO-1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                    filterPF_RO, ISMRMRD_FILTER_NONE, width_RO, true));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                    filterPF_RO, ISMRMRD_FILTER_TAPERED_HANNING, width_RO, true));
+            }
+
+            if ( workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.start_E1_==0 && workOrder2DT.end_E1_==E1-1) )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                    filterPF_E1, ISMRMRD_FILTER_NONE, width_E1, true));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateAsymmetricFilter(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                    filterPF_E1, ISMRMRD_FILTER_TAPERED_HANNING, width_E1, true));
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterPF_RO, "filterPF_RO_homodyne");
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterPF_E1, "filterPF_E1_homodyne");
+
+            // compensate filter for homodyne filtered kspace
+            // entries larger in magnitude than the centre value become 0, all others become (centre value - entry)
+            hoNDArray<T> filterPF_homodyne_RO(filterPF_RO), filterPF_homodyne_E1(filterPF_E1);
+
+            T midValue = filterPF_RO(RO/2);
+            for ( ii=0; ii<RO; ii++ )
+            {
+                if ( std::abs(filterPF_homodyne_RO(ii)) > std::abs(midValue) )
+                {
+                    filterPF_homodyne_RO(ii) = T(0.0);
+                }
+                else
+                {
+                    filterPF_homodyne_RO(ii) = midValue - filterPF_homodyne_RO(ii);
+                }
+            }
+
+            midValue = filterPF_E1(E1/2);
+            for ( ii=0; ii<E1; ii++ )
+            {
+                if ( std::abs(filterPF_homodyne_E1(ii)) > std::abs(midValue) )
+                {
+                    filterPF_homodyne_E1(ii) = T(0.0);
+                }
+                else
+                {
+                    filterPF_homodyne_E1(ii) = midValue - filterPF_homodyne_E1(ii);
+                }
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterPF_homodyne_RO, "filterPF_homodyne_RO_homodyne");
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterPF_homodyne_E1, "filterPF_homodyne_E1_homodyne");
+
+            // scale factor computed from the sum of both filters, applied to the final kspace
+            T scaleFactor(1.0);
+            hoNDArray<T> filterPF;
+
+            // kspace is filtered with the PF filter, kspaceIter with the compensating filter
+            if ( workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.start_RO_==0 && workOrder2DT.end_RO_==RO-1) )
+            {
+                // RO is unset or fully sampled : filter along E1 only
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(kspace, filterPF_E1, kspace));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_homodyne_PF_Filter");
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(kspaceIter, filterPF_homodyne_E1, kspaceIter));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "kspaceIter_after_homodyne_PF_Filter");
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::add(filterPF_E1, filterPF_homodyne_E1, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactor));
+            }
+            else if ( workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.start_E1_==0 && workOrder2DT.end_E1_==E1-1) )
+            {
+                // E1 is unset or fully sampled : filter along RO only
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(kspace, filterPF_RO, kspace));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_homodyne_PF_Filter");
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(kspaceIter, filterPF_homodyne_RO, kspaceIter));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "kspaceIter_after_homodyne_PF_Filter");
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::add(filterPF_RO, filterPF_homodyne_RO, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactor));
+            }
+            else
+            {
+                // both directions are partially sampled : filter along RO and E1,
+                // the scale factor is the product of the RO and E1 factors
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspace, filterPF_RO, filterPF_E1, kspace));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_homodyne_PF_Filter");
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspaceIter, filterPF_homodyne_RO, filterPF_homodyne_E1, kspaceIter));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "kspaceIter_after_homodyne_PF_Filter");
+
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::add(filterPF_RO, filterPF_homodyne_RO, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactor));
+
+                T scaleFactorE1(1.0);
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::add(filterPF_E1, filterPF_homodyne_E1, filterPF));
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeFilterSNRUnitScaleFactor(filterPF, scaleFactorE1));
+
+                scaleFactor *= scaleFactorE1;
+            }
+
+            // final kspace = scaleFactor * (filtered acquired kspace + filtered iterated kspace)
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::add(kspace, kspaceIter, kspace));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(scaleFactor, kspace));
+        }
+        else
+        {
+            // no density compensation : presumably copies the acquired range of kspace into
+            // kspaceIter, which then becomes the result
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1(kspace, kspaceIter, startRO, endRO, startE1, endE1));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "kspaceIter_after_homodyne_afterCopy");
+            kspace = kspaceIter;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_homodyne");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performPartialFourierHomodyneRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Iterative POCS partial fourier reconstruction, applied in place to kspace.
+//
+// workOrder2DT : supplies the sampled range (start_RO_/end_RO_, start_E1_/end_E1_), the
+//                iteration limit (partialFourier_POCS_iters_), the stopping threshold
+//                (partialFourier_POCS_thres_) and the transition band width
+//                (partialFourier_POCS_transitBand_)
+// kspace       : input kspace, its first two dimensions are RO and E1; overwritten with the result
+//
+// Outline:
+//   1) return early when neither RO nor E1 shows partial fourier sampling
+//   2) estimate the image phase once from the symmetrically filtered kspace
+//   3) iterate : impose that phase on the current image magnitude, go to kspace, put the
+//      acquired range back, go to image space, stop once the relative change is below the threshold
+//   4) write the result to kspace, with or without a transition band
+//
+// The members buffer2DT_, buffer2DT_partial_fourier_ and buffer2DT_partial_fourier_kspaceIter_
+// are overwritten as scratch space.
+// Returns true on success (including the early return in step 1) and false if an
+// exception was caught; GADGET_CHECK_RETURN_FALSE presumably returns false on a failed call.
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::performPartialFourierPOCSRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        // check whether partial fourier is used
+        // a direction is treated as fully sampled if its range is unset (negative) or covers the whole dimension
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_POCS");
+
+        // create the symmetric kspace filter used for the POCS phase estimation
+        ISMRMRDKSPACEFILTER filter_ref_type_ = ISMRMRD_FILTER_HANNING;
+        double filter_ref_sigma_ = 1.5;
+        double filter_ref_width_ = 0.15;
+
+        // RO filter : built over the whole dimension if the RO range is unset, over [start_RO_ end_RO_] otherwise
+        int startRO(0), endRO(RO-1);
+        hoNDArray<T> filterRO(RO);
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, 0, RO-1, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*RO)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, workOrder2DT.start_RO_, workOrder2DT.end_RO_, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*RO)));
+
+            startRO = workOrder2DT.start_RO_;
+            endRO = workOrder2DT.end_RO_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterRO, "filterRO_POCS");
+
+        // E1 filter : same construction as for RO
+        int startE1(0), endE1(E1-1);
+        hoNDArray<T> filterE1(E1);
+        if ( (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, 0, E1-1, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E1)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, workOrder2DT.start_E1_, workOrder2DT.end_E1_, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E1)));
+
+            startE1 = workOrder2DT.start_E1_;
+            endE1 = workOrder2DT.end_E1_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterE1, "filterE1_POCS");
+
+        // kspace updated by every iteration, starts as a copy of the input
+        hoNDArray<T> kspaceIter(kspace);
+        // magnitude of complex images
+        hoNDArray<typename realType<T>::Type> mag(kspace.get_dimensions());
+        hoNDArray<T> magComplex(kspace.get_dimensions());
+
+        // kspace filter
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(kspaceIter, filterRO, filterE1, buffer2DT_partial_fourier_));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_partial_fourier_, "POCS_afterFiltered");
+
+        // go to image domain
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(buffer2DT_partial_fourier_));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_partial_fourier_, "POCS_afterFiltered_complexIm");
+
+        // get the complex image phase for the filtered kspace
+        // phase = image / magnitude, kept in buffer2DT_ for the whole iteration;
+        // addEpsilon presumably guards against a division by zero
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(buffer2DT_partial_fourier_, mag));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::addEpsilon(mag));
+        GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::divide(buffer2DT_partial_fourier_, magComplex, buffer2DT_));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_, "POCS_afterFiltered_complexIm_phase");
+
+        // complex images, initialized as not filtered complex image
+        hoNDArray<T> complexIm(kspaceIter);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspaceIter, complexIm));
+        // image updated by the iteration; complexIm keeps the previous one for the stopping criterion
+        hoNDArray<T> complexImPOCS(complexIm);
+
+        // the kspace during iteration is buffered here
+        buffer2DT_partial_fourier_kspaceIter_ = kspaceIter;
+
+        size_t ii;
+        for ( ii=0; ii<workOrder2DT.partialFourier_POCS_iters_; ii++ )
+        {
+            // combine the magnitude of the current image with the estimated phase
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(complexImPOCS, mag));
+            GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(magComplex, buffer2DT_, complexImPOCS));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImPOCS, "POCS_complexImPOCS");
+
+            // go back to kspace
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(complexImPOCS, kspaceIter));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "POCS_kspaceIter");
+
+            // buffer kspace during iteration
+            // this copy is taken before the acquired region is restored below
+            buffer2DT_partial_fourier_kspaceIter_ = kspaceIter;
+
+            // restore the acquired region
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1(kspace, kspaceIter, startRO, endRO, startE1, endE1));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "POCS_kspaceIter_copyOri");
+
+            // update complex image
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(kspaceIter, complexImPOCS));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImPOCS, "POCS_kspaceIter_copyOri_complexImPOCS");
+
+            // compute threshold to stop the iteration
+            // buffer2DT_partial_fourier_ is reused here to hold the difference to the previous image
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(complexImPOCS, complexIm, buffer2DT_partial_fourier_));
+            typename realType<T>::Type diff, prev;
+            Gadgetron::norm2(complexIm, prev);
+            Gadgetron::norm2(buffer2DT_partial_fourier_, diff);
+
+            // relative change; NOTE(review): prev is not checked against zero before the division
+            typename realType<T>::Type thres = diff/prev;
+
+            // the iteration message is only printed when a debug folder is set
+            if ( !debugFolder_.empty() )
+            {
+                GADGET_MSG("POCS iter : " << ii << " - thres : " << thres << " ... ");
+            }
+
+            if ( thres < workOrder2DT.partialFourier_POCS_thres_ )
+            {
+                break;
+            }
+
+            complexIm = complexImPOCS;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_partial_fourier_kspaceIter_, "kspaceIter_after_POCS");
+
+        if ( workOrder2DT.partialFourier_POCS_transitBand_ == 0 )
+        {
+            // no transition band : take the iterated kspace with the acquired region already restored
+            kspace = kspaceIter;
+        }
+        else
+        {
+            // transition band : merge the acquired kspace into the buffered kspace
+            // (the copy taken before the acquired region was restored), using the same band width for RO and E1
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1TransitionBand(kspace, buffer2DT_partial_fourier_kspaceIter_, startRO, endRO, startE1, endE1, workOrder2DT.partialFourier_POCS_transitBand_, workOrder2DT.partialFourier_POCS_transitBand_));
+            kspace = buffer2DT_partial_fourier_kspaceIter_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_POCS");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performPartialFourierPOCSRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Partial-Fourier fill-in: calibrate kernels from the conjugate-symmetric kspace to the acquired kspace, then use them to write the unsampled points of kspace (in place); returns false on any failure
+bool gtPlusReconWorker2DT<T>::performPartialFourierFengHuangRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        size_t RO = kspace.get_size(0); // kspace is read as [RO E1 CHA N S]
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        // check whether partial fourier is used: a dimension counts as fully sampled when its start/end is negative or the range spans the whole dimension
+        if ( (workOrder2DT.start_RO_<0 || workOrder2DT.end_RO_<0 || (workOrder2DT.end_RO_-workOrder2DT.start_RO_+1==RO) ) 
+            && (workOrder2DT.start_E1_<0 || workOrder2DT.end_E1_<0 || (workOrder2DT.end_E1_-workOrder2DT.start_E1_+1==E1) ) )
+        {
+            return true; // both RO and E1 are fully sampled, kspace is left untouched
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_FengHuang");
+
+        int startRO(0), endRO(RO-1); // sampled RO range, defaults to the whole dimension
+        if ( workOrder2DT.start_RO_>=0 && workOrder2DT.end_RO_<RO )
+        {
+            startRO = workOrder2DT.start_RO_;
+            endRO = workOrder2DT.end_RO_;
+        }
+
+        int startE1(0), endE1(E1-1); // sampled E1 range, defaults to the whole dimension
+        if ( workOrder2DT.start_E1_>=0 && workOrder2DT.end_E1_<E1 )
+        {
+            startE1 = workOrder2DT.start_E1_;
+            endE1 = workOrder2DT.end_E1_;
+        }
+
+        // compute the conjugate symmetric kspace (result is stored in buffer2DT_)
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("conjugateSymmetry2D"));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().conjugateSymmetry2D(kspace, buffer2DT_));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_, "kspaceConj_FengHuang");
+
+        // find the symmetric region in the kspace
+        size_t startSymRO, endSymRO;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startRO, endRO, RO/2, startSymRO, endSymRO));
+
+        size_t startSymE1, endSymE1;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startE1, endE1, E1/2, startSymE1, endSymE1));
+
+        // the reference kspace for kernel estimation: src is cropped from the conjugate symmetric kspace, dst from the acquired kspace
+        hoNDArray<T> src, dst;
+        std::vector<size_t> start(5), size(5);
+
+        start[0] = startSymRO; // crop only along RO and E1; CHA, N and S are kept whole
+        start[1] = startSymE1;
+        start[2] = 0;
+        start[3] = 0;
+        start[4] = 0;
+
+        size[0] = endSymRO-startSymRO+1;
+        size[1] = endSymE1-startSymE1+1;
+        size[2] = CHA;
+        size[3] = N;
+        size[4] = S;
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo10DArray(buffer2DT_, src, start, size));
+        GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(kspace, dst, start, size));
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, src, "src_FengHuang");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, dst, "dst_FengHuang");
+
+        if ( workOrder2DT.partialFourier_FengHuang_sameKernel_allN_ ) // calibrate on averaged data; presumably averageKSpace4D averages over N - confirm in gtPlus_util_
+        {
+            hoNDArray<T> ave4D;
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(src, ave4D));
+            src = ave4D;
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace4D(dst, ave4D));
+            dst = ave4D;
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, src, "src_ave4D_FengHuang");
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, dst, "dst_ave4D_FengHuang");
+        }
+
+        // estimate the kernels
+        ho6DArray<T> kernel; // [RO E1 srcCHA dstCHA N S]
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("calibFengHuang"));
+        GADGET_CHECK_RETURN_FALSE(this->calibFengHuang(workOrder2DT, src, dst, kernel));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        // perform the recon
+        if ( workOrder2DT.partialFourier_FengHuang_transitBand_==0 ) // no transition band: the recon writes straight into kspace
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performReconFangHuang"));
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder2DT, buffer2DT_, kspace, startRO, endRO, startE1, endE1, kernel));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+        else
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performReconFangHuang with transition band"));
+
+            int tb =  (int)workOrder2DT.partialFourier_FengHuang_transitBand_;
+
+            int sRO(startRO), eRO(endRO), sE1(startE1), eE1(endE1); // keep the original sampled range for the blending step below
+
+            if ( startRO > 0 ) // shrink the RO range by tb on each side that does not touch the array border
+            {
+                startRO += tb;
+                if ( startRO > RO ) startRO = 0;
+            }
+
+            if ( endRO < RO-1 )
+            {
+                endRO -= tb;
+                if ( endRO < 0 ) endRO = RO-1;
+            }
+
+            if ( startRO > endRO ) // range collapsed after shrinking: fall back to the whole dimension
+            {
+                startRO = 0;
+                endRO = RO-1;
+            }
+
+            if ( startE1 > 0 ) // same shrinking for E1
+            {
+                startE1 += tb;
+                if ( startE1 > E1 ) startE1 = 0;
+            }
+
+            if ( endE1 < E1-1 )
+            {
+                endE1 -= tb;
+                if ( endE1 < 0 ) endE1 = E1-1;
+            }
+
+            if ( startE1 > endE1 )
+            {
+                startE1 = 0;
+                endE1 = E1-1;
+            }
+
+            buffer2DT_partial_fourier_kspaceIter_ = kspace; // recon into a copy so that the acquired kspace stays available for blending
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder2DT, buffer2DT_, 
+                    buffer2DT_partial_fourier_kspaceIter_, startRO, endRO, startE1, endE1, kernel));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_partial_fourier_kspaceIter_, "kspace_FengHuang_recon");
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_FengHuang_original");
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1TransitionBand(kspace, buffer2DT_partial_fourier_kspaceIter_, 
+                    sRO, eRO, sE1, eE1, workOrder2DT.partialFourier_FengHuang_transitBand_, workOrder2DT.partialFourier_FengHuang_transitBand_)); // combine acquired and reconstructed kspace over the original range, same band width for RO and E1
+
+            kspace = buffer2DT_partial_fourier_kspaceIter_; // hand the combined result back to the caller
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_FengHuang");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performPartialFourierFengHuangRecon(gtPlusReconWorkOrder2DT<T>& workOrder2DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::calibFengHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel)
+{
+    // Estimate one kx*ky kernel per 2D slice (channel, N, S) that maps a src neighbourhood to the dst sample at the same position; returns false on any failure.
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(src.dimensions_equal(&dst));
+
+        size_t RO = src.get_size(0);
+        size_t E1 = src.get_size(1);
+        size_t srcCHA = src.get_size(2);
+        size_t N = src.get_size(3);
+        size_t S = src.get_size(4);
+
+        // kernel sizes are rounded up to odd numbers
+        size_t kx = workOrder2DT.partialFourier_FengHuang_kSize_RO_;
+        size_t ky = workOrder2DT.partialFourier_FengHuang_kSize_E1_;
+        if ( kx%2 == 0 ) kx++;
+        if ( ky%2 == 0 ) ky++;
+
+        int halfKx = (int)kx/2;
+        int halfKy = (int)ky/2;
+
+        // the slice must hold at least one complete kernel; otherwise the unsigned end indices below wrap around and the fitting loops read outside src/dst
+        GADGET_CHECK_RETURN_FALSE( (RO>=kx) && (E1>=ky) );
+
+        // the cross-channel kernel is not estimated
+        kernel.createArray(kx, ky, srcCHA, 1, N, S);
+
+        int ii=0;
+        int num = N*S*srcCHA;
+
+        // positions at which the whole kernel fits inside the slice
+        size_t startRO = halfKx;
+        size_t endRO = RO - halfKx - 1;
+        size_t startE1 = halfKy;
+        size_t endE1 = E1 - halfKy - 1;
+
+        // linear system A*K = B : one row per kernel position, one column per kernel tap
+        int rowA, colA, rowB, colB;
+        rowA = (endE1-startE1+1)*(endRO-startRO+1);
+        colA = kx*ky;
+        rowB = rowA;
+        colB = 1;
+
+        double thresReg = workOrder2DT.partialFourier_FengHuang_thresReg_;
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, srcCHA, N, S, kx, ky, rowA, colA, rowB, colB, startRO, endRO, startE1, endE1, halfKx, halfKy, thresReg)
+        #else
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, srcCHA, N, S, kx, ky, src, dst, kernel, rowA, colA, rowB, colB, startRO, endRO, startE1, endE1, halfKx, halfKy, thresReg)
+        #endif
+        {
+            // declared inside the parallel region, so every thread works on its own matrices
+            hoMatrix<T> A(rowA, colA);
+            T* pA = A.begin();
+            hoMatrix<T> B(rowB, colB);
+            T* pB = B.begin();
+            hoMatrix<T> K(colA, colB);
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                // ii-th 2D slice of src and dst
+                T* pSrc2D = const_cast<T*>(src.begin())+ii*RO*E1;
+                T* pDst2D = const_cast<T*>(dst.begin())+ii*RO*E1;
+
+                size_t ro, e1, row(0);
+                int x, y;
+
+                for ( e1=startE1; e1<=endE1; e1++ )
+                {
+                    for ( ro=startRO; ro<=endRO; ro++ )
+                    {
+                        size_t colInd(0);
+                        for ( y=-halfKy; y<=halfKy; y++ )
+                        {
+                            for ( x=-halfKx; x<=halfKx; x++ )
+                            {
+                                // A(row, colInd) = src2D(ro+x, e1+y); A is addressed column by column
+                                pA[row + colInd*rowA] = pSrc2D[ro+x + (e1+y)*RO];
+                                colInd++;
+                            }
+                        }
+
+                        // B(row, 0) = dst2D(ro, e1);
+                        pB[row] = pDst2D[ro + e1*RO];
+                        row++;
+                    }
+                }
+
+                // Tikhonov solve of A*K = B, then store K as the kernel of slice ii
+                Gadgetron::SolveLinearSystem_Tikhonov(A, B, K, thresReg);
+                memcpy(kernel.begin()+ii*kx*ky, K.begin(), sizeof(T)*kx*ky);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::calibFengHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Writes every kspace point outside [startRO,endRO]x[startE1,endE1] as (neighbourhood taken from kspaceConj) * kernel; kspace is modified in place, returns false on any failure
+bool gtPlusReconWorker2DT<T>::performReconFangHuang(gtPlusReconWorkOrder2DT<T>& workOrder2DT, 
+                                                const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, 
+                                                int startRO, int endRO, int startE1, int endE1, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspaceConj.dimensions_equal(&kspace));
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t CHA = kspace.get_size(2);
+        size_t N = kspace.get_size(3);
+        size_t S = kspace.get_size(4);
+
+        size_t kx = kernel.get_size(0);
+        size_t ky = kernel.get_size(1);
+
+        int halfKx = kx/2;
+        int halfKy = ky/2;
+        size_t kerN = kernel.get_size(4); // number of kernels along N: either one shared kernel or one per frame
+        GADGET_CHECK_RETURN_FALSE( (kerN==1) || (kerN==N) );
+
+        int num = CHA*N*S; // number of 2D slices to process
+
+        int rowD = RO*E1 - ( (endE1-startE1+1) * (endRO-startRO+1) ); // number of points outside the given range, i.e. points to be computed
+        int colD = kx*ky; // number of kernel taps
+
+        ho2DArray<size_t> coeffX(rowD, colD); // RO coordinate of every tap of every point to be computed
+        ho2DArray<size_t> coeffY(rowD, colD); // E1 coordinate of every tap of every point to be computed
+
+        size_t ro, e1, row(0);
+        int x, y, dx, dy;
+
+        for ( e1=0; e1<E1; e1++ ) // the coordinate tables are built once and shared by all slices
+        {
+            for ( ro=0; ro<RO; ro++ )
+            {
+                if ( (ro>=startRO) && (ro<=endRO) && (e1>=startE1) && (e1<=endE1) )
+                {
+                    continue; // point lies inside the given range, it is not recomputed
+                }
+
+                size_t colInd(0);
+                for ( y=-halfKy; y<=halfKy; y++ )
+                {
+                    dy = e1 + y;
+                    if ( dy < 0 ) dy += E1; // wrap around the E1 border (a single wrap is applied)
+                    if ( dy > E1-1 ) dy -= E1;
+
+                    for ( x=-halfKx; x<=halfKx; x++ )
+                    {
+                        dx = ro + x;
+                        if ( dx < 0 ) dx += RO; // wrap around the RO border (a single wrap is applied)
+                        if ( dx > RO-1 ) dx -= RO;
+
+                        coeffX(row, colInd) = dx;
+                        coeffY(row, colInd) = dy;
+                        colInd++;
+                    }
+                }
+
+                row++;
+            }
+        }
+
+        int ii;
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, CHA, N, S, kerN, rowD, colD, coeffX, coeffY)
+        #else
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, CHA, N, S, kerN, kspaceConj, kspace, kernel, rowD, colD, coeffX, coeffY)
+        #endif
+        {
+            hoMatrix<T> D(rowD, colD); // declared inside the parallel region: one set of work matrices per thread
+            hoMatrix<T> K(colD, 1);
+            hoMatrix<T> R(rowD, 1);
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                ho2DArray<T> src2D(RO, E1, const_cast<T*>(kspaceConj.begin())+ii*RO*E1);
+                ho2DArray<T> dst2D(RO, E1, kspace.begin()+ii*RO*E1);
+
+                size_t ro, e1, row, col;
+                for ( col=0; col<colD; col++ ) // gather the neighbourhood samples of slice ii into D
+                {
+                    for ( row=0; row<rowD; row++ )
+                    {
+                        D(row, col) = src2D(coeffX(row, col), coeffY(row, col));
+                    }
+                }
+
+                if ( kerN == 1 ) // one kernel shared by all N: select it by channel and S only
+                {
+                    int ind = ii;
+                    int currS = ind/(CHA*N);
+                    ind %= CHA*N;
+                    int currN = ind/CHA;
+                    ind %= CHA; // ind is now the channel index
+                    memcpy(K.begin(), kernel.begin()+(ind+currS*CHA)*colD, sizeof(T)*colD);
+                }
+                else
+                {
+                    memcpy(K.begin(), kernel.begin()+ii*colD, sizeof(T)*colD); // one kernel per slice
+                }
+
+                // R = D*K
+                Gadgetron::GeneralMatrixProduct_gemm(R, D, false, K, false);
+
+                for ( row=0; row<rowD; row++ )
+                {
+                    dst2D( coeffX(row, colD/2), coeffY(row, colD/2) ) = R(row, 0); // column colD/2 is the tap with x=0, y=0 when kx and ky are odd
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::performReconFangHuang(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DT<T>::
+estimateJobSize(gtPlusReconWorkOrder<T>* workOrder2DT, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize)
+{
+    // Choose jobSize (number of N frames per cloud job) from the node count, then double the node count until one job fits the byte limit; returns false on any failure.
+    try
+    {
+        size_t nodeN = numOfNodes;
+        GADGET_CHECK_RETURN_FALSE(this->computeEffectiveNodeNumberBasedOnComputingPowerIndex(workOrder2DT, nodeN));
+        if ( workOrder2DT->job_perform_on_control_node_ ) nodeN++;
+        GADGET_CHECK_RETURN_FALSE(nodeN>0); // nodeN is the divisor of every jobSize computation below
+
+        GADGET_MSG("GtPlus Cloud 2DT - job_perform_on_control_node is " << workOrder2DT->job_perform_on_control_node_  << " - nodeN is " << nodeN << " - overlapBetweenJobs is " << overlapBetweenJobs << " ... ");
+
+        // adjust jobN according to cloud size
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+
+        size_t totalJobNum = workOrder2DT->data_.get_size(3); // the work is split along N
+        jobSize = std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+
+        size_t numOfBytesPerJob = sizeof(T)*( RO*E1*srcCHA*dstCHA*jobSize + 2*RO*E1*srcCHA*jobSize );
+        // maxNumOfBytesPerJob is scaled by 1024^3 and a 64MB margin is kept free; computed in double so that the scaling cannot overflow
+        const double maxBytes = (double)maxNumOfBytesPerJob*1024*1024*1024 - 64.0*1024*1024;
+        // more nodes cannot shrink a job once nodeN has reached the number of frames; stop there instead of looping forever on an unreachable limit
+        while ( (numOfBytesPerJob > maxBytes) && (nodeN < totalJobNum) )
+        {
+            nodeN *= 2;
+            jobSize = std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+            numOfBytesPerJob = sizeof(T)*( RO*E1*srcCHA*dstCHA*jobSize + 2*RO*E1*srcCHA*jobSize );
+        }
+
+        GADGET_MSG("GtPlus Cloud 2DT - jobSize is " << jobSize << "; every job has " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DT<T>::estimateJobSize(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
new file mode 100644
index 0000000..aa36501
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTGRAPPA.h
@@ -0,0 +1,316 @@
+/** \file   gtPlusISMRMRDReconWorker2DTGRAPPA.h
+    \brief  Implement the 2DT GRAPPA reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker2DT.h"
+#include "gtPlusGRAPPA.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> // 2DT reconstruction worker that implements calibration and unwrapping with GRAPPA
+class gtPlusReconWorker2DTGRAPPA : public gtPlusReconWorker2DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker2DT<T> BaseClass;
+
+    gtPlusReconWorker2DTGRAPPA() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTGRAPPA() {}
+
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT); // allocates kernel_, kernelIm_, unmixingCoeffIm_ and gfactor_ of the work order
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS); // calibrates reference frame n of set usedS
+
+    virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data); // fills complexIm_ (and fullkspace_ when kspace is needed) of the work order
+
+    using BaseClass::gt_timer1_; // members of the dependent base class are re-declared so that they can be used unqualified in this template
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    gtPlusGRAPPA<T> grappa_; // GRAPPA engine used for the kernel pattern, the calibration and the image domain kernel
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTGRAPPA<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    // Allocate the calibration outputs of the work order (kernel_, kernelIm_, unmixingCoeffIm_, gfactor_); nothing is calibrated here. Returns false on any failure.
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        // channel counts and number of frames come from the reference data
+        size_t srcCHA = ref_src.get_size(2);
+        size_t dstCHA = ref_dst.get_size(2);
+        size_t refN = ref_dst.get_size(3);
+
+        // the kernel pattern is only queried for the number of output E1 offsets (oE1.size())
+        std::vector<int> kE1, oE1;
+        bool fitItself = true;
+        GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE1, oE1, workOrder2DT->acceFactorE1_, workOrder2DT->grappa_kSize_E1_, fitItself));
+
+        size_t kRO = workOrder2DT->grappa_kSize_RO_;
+        size_t kNE1 = workOrder2DT->grappa_kSize_E1_;
+        size_t oNE1 = oE1.size();
+
+        // one kernel, image domain kernel, unmixing coefficient set and g-factor map per reference frame and per S
+        workOrder2DT->kernel_->create(kRO, kNE1, srcCHA, dstCHA, oNE1, refN, S);
+        workOrder2DT->kernelIm_->create(RO, E1, srcCHA, dstCHA, refN, S);
+        workOrder2DT->unmixingCoeffIm_->create(RO, E1, srcCHA, refN, S);
+        workOrder2DT->gfactor_.create(RO, E1, refN, S);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTGRAPPA<T>::performCalibPrep(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTGRAPPA<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS)
+{
+    // Calibrate GRAPPA for reference frame n of set usedS: kspace kernel, image domain kernel, then unmixing coefficients and g-factor, all stored in the work order. Returns false on any failure.
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_dst.get_size(2);
+
+        std::vector<int> kE1, oE1;
+        bool fitItself = true;
+        GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE1, oE1, workOrder2DT->acceFactorE1_, workOrder2DT->grappa_kSize_E1_, fitItself));
+
+        size_t kRO = workOrder2DT->grappa_kSize_RO_;
+        size_t kNE1 = workOrder2DT->grappa_kSize_E1_;
+        size_t oNE1 = oE1.size();
+
+        // arrays placed on the memory of frame n / set usedS of the reference data
+        ho3DArray<T> acsSrc(refRO, refE1, srcCHA, const_cast<T*>(ref_src.begin()+n*refRO*refE1*srcCHA+usedS*refRO*refE1*srcCHA*refN));
+        ho3DArray<T> acsDst(refRO, refE1, dstCHA, const_cast<T*>(ref_dst.begin()+n*refRO*refE1*dstCHA+usedS*refRO*refE1*dstCHA*refN));
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsSrc, "acsSrc");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsDst, "acsDst");
+
+        grappa_.calib_use_gpu_  = workOrder2DT->grappa_use_gpu_;
+        // ker is placed on the (n, usedS) part of workOrder2DT->kernel_
+        ho5DArray<T> ker(kRO, kNE1, srcCHA, dstCHA, oNE1, workOrder2DT->kernel_->begin()+n*kRO*kNE1*srcCHA*dstCHA*oNE1+usedS*kRO*kNE1*srcCHA*dstCHA*oNE1*refN);
+        grappa_.calib(acsSrc, acsDst, workOrder2DT->grappa_reg_lamda_, kRO, kE1, oE1, ker);
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ker, "ker");
+        // kIm is placed on the (n, usedS) part of workOrder2DT->kernelIm_
+        hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+n*RO*E1*srcCHA*dstCHA+usedS*RO*E1*srcCHA*dstCHA*refN);
+        grappa_.imageDomainKernel(ker, kRO, kE1, oE1, RO, E1, kIm);
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kIm, "kIm");
+
+        hoNDArray<T> coilMap(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+n*RO*E1*dstCHA+usedS*RO*E1*dstCHA*refN);
+        hoNDArray<T> unmixC(RO, E1, srcCHA, workOrder2DT->unmixingCoeffIm_->begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*refN);
+        hoNDArray<T> gFactor(RO, E1, workOrder2DT->gfactor_.begin()+n*RO*E1+usedS*RO*E1*refN);
+
+        this->unmixCoeff(kIm, coilMap, unmixC, gFactor);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(1.0/workOrder2DT->acceFactorE1_, gFactor)); // g-factor is scaled by 1/acceFactorE1_
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unmixC, "unmixC");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, gFactor, "gFactor");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTGRAPPA<T>::performCalibImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> // Unwraps the undersampled data with the calibrated GRAPPA results: always fills workOrder2DT->complexIm_, and also workOrder2DT->fullkspace_ when kspace is needed; returns false on any failure
+bool gtPlusReconWorker2DTGRAPPA<T>::
+performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        int n; // loop index over N, declared here so that it can be named in the OpenMP private clause
+
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+
+        size_t refN = workOrder2DT->kernelIm_->get_size(4); // number of calibrated kernels along N
+
+        workOrder2DT->complexIm_.create(RO, E1, N, S);
+
+        if ( workOrder2DT->downstream_coil_compression_ ) // aliased images go to buffer2DT_; the input is data_ with downstream coil compression, data_dst otherwise
+        {
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->data_, buffer2DT_);
+        }
+        else
+        {
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(data_dst, buffer2DT_);
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer2DT_, "buffer2DT_");
+
+        bool recon_kspace = false; // set when the unwrapped multi-channel kspace has to be produced in addition to the combined images
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder2DT->embedded_fullres_coilmap_ || workOrder2DT->embedded_ref_fillback_ )
+            {
+                recon_kspace = true;
+            }
+        }
+
+        if ( workOrder2DT->CalibMode_ == ISMRMRD_separate )
+        {
+            if ( workOrder2DT->separate_fullres_coilmap_ )
+            {
+                recon_kspace = true;
+            }
+        }
+
+        if ( workOrder2DT->recon_kspace_needed_ )
+        {
+            recon_kspace = true;
+        }
+
+        // if kspace is actually needed
+        if ( recon_kspace )
+        {
+            workOrder2DT->fullkspace_ = data_dst; // presumably only used to size fullkspace_; the loops below write into it - confirm
+
+            buffer2DT_unwrapping_.create(RO, E1, srcCHA, dstCHA);
+
+            size_t usedS;
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                if ( (refN<N) || (refN==1) ) // fewer kernels than frames: the first kernel of this S is applied to all N frames at once
+                {
+                    hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+usedS*RO*E1*srcCHA*dstCHA*refN);
+                    hoNDArray<T> aliasedIm(RO, E1, srcCHA, N, buffer2DT_.begin()+usedS*RO*E1*srcCHA*N);
+                    hoNDArray<T> unwarppedIm(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+
+                    this->applyImageDomainKernelImage(aliasedIm, kIm, buffer2DT_unwrapping_, unwarppedIm);
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedIm, "unwarppedIm");
+                }
+                else // frame n is unwrapped with kernel n
+                {
+                    #pragma omp parallel private(n)
+                    {
+                        hoNDArray<T> complexIm(RO, E1, dstCHA); // declared inside the parallel region: one output buffer per thread
+
+                        #pragma omp for
+                        for ( n=0; n<(int)N; n++ )
+                        {
+                            hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+n*RO*E1*srcCHA*dstCHA+usedS*RO*E1*srcCHA*dstCHA*refN);
+
+                            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kIm, "kIm_n");
+
+                            T* pIm2D = buffer2DT_.begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*N;
+                            hoNDArray<T> aliasedIm(RO, E1, srcCHA, pIm2D);
+
+                            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aliasedIm, "aliasedIm_n");
+
+                            this->applyImageDomainKernelImage(aliasedIm, kIm, complexIm);
+                            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexIm, "complexIm_n");
+
+                            memcpy(workOrder2DT->fullkspace_.begin()+n*RO*E1*dstCHA+usedS*RO*E1*dstCHA*N, complexIm.begin(), sizeof(T)*RO*E1*dstCHA);
+                        }
+                    }
+                }
+
+                hoNDArray<T> unwarppedIm(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N); // coil combination of the unwrapped images of this S into complexIm_
+                hoNDArray<T> combined(RO, E1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+                if ( refN == N ) // one coil map per frame
+                {
+                    hoNDArray<T> coilMap(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+                }
+                else // a single coil map (the first of this S) for all frames
+                {
+                    hoNDArray<T> coilMap(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                    gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+                }
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, combined, "combined");
+            }
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft2c(workOrder2DT->fullkspace_); // fullkspace_ held the unwrapped images up to here; go back to kspace
+        }
+        else // only the combined images are needed: apply the unmixing coefficients to the aliased images
+        {
+            size_t usedS;
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                if ( (refN<N) || (refN==1) ) // the first coefficient set of this S is applied to all N frames at once
+                {
+                    hoNDArray<T> unmixCoeff(RO, E1, srcCHA, workOrder2DT->unmixingCoeffIm_->begin()+usedS*RO*E1*srcCHA*refN);
+                    hoNDArray<T> aliasedIm(RO, E1, srcCHA, N, buffer2DT_.begin()+usedS*RO*E1*srcCHA*N);
+                    hoNDArray<T> unwarppedIm(RO, E1, 1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+                    this->applyUnmixCoeffImage(aliasedIm, unmixCoeff, unwarppedIm);
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedIm, "unwarppedIm");
+                }
+                else // frame n uses coefficient set n
+                {
+                    // #pragma omp parallel for private(n)
+                    for ( n=0; n<(int)N; n++ )
+                    {
+                        hoNDArray<T> unmixCoeff(RO, E1, srcCHA, workOrder2DT->unmixingCoeffIm_->begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*refN);
+                        hoNDArray<T> aliasedIm(RO, E1, srcCHA, buffer2DT_.begin()+n*RO*E1*srcCHA+usedS*RO*E1*srcCHA*N);
+                        hoNDArray<T> unwarppedIm(RO, E1, 1, workOrder2DT->complexIm_.begin()+n*RO*E1+usedS*RO*E1*N);
+
+                        this->applyUnmixCoeffImage(aliasedIm, unmixCoeff, unwarppedIm);
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedIm, "unwarppedIm");
+                    }
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTGRAPPA<T>::performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
new file mode 100644
index 0000000..dc62a69
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
@@ -0,0 +1,348 @@
+/** \file   gtPlusISMRMRDReconWorker2DTL1SPIRITNCG.h
+    \brief  Implement the 2DT non-linear SPIRIT reconstruction using the non-linear CG solver
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker2DTSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusNCGSolver.h"
+#include "gtPlusWavelet2DOperator.h"
+#include "gtPlusWavelet3DOperator.h"
+#include "gtPlusWaveletNoNullSpace2DOperator.h"
+#include "gtPlusWaveletNoNullSpace3DOperator.h"
+#include "gtPlusDataFidelityOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTL1SPIRITNCG : public gtPlusReconWorker2DTSPIRIT<T>
+{
+public:
+    // 2DT worker that layers a non-linear CG (NCG) solve with wavelet operators on top of the linear SPIRIT worker it derives from.
+    typedef gtPlusReconWorker2DTSPIRIT<T> BaseClass;
+    typedef typename realType<T>::Type value_type; // type the scale factors are cast to before being passed to Gadgetron::scal
+
+    gtPlusReconWorker2DTL1SPIRITNCG() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTL1SPIRITNCG() {}
+    // Unwrapping entry points: the 5-argument form does the work; the job form unpacks a gtPlusReconJob2DT and forwards to it.
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s);
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+    // Not overridden in this class (declaration left commented out): virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder); // runs the base-class defaults, then sets spirit_image_reg_lamda_ / spirit_ncg_iter_thres_
+    // Base-class members re-exported so the member functions of this template can name them without this->.
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    using BaseClass::spirit_; // the gtPlusSPIRIT<T> member declared by gtPlusReconWorker2DTSPIRIT
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTL1SPIRITNCG<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    // Let the linear SPIRIT worker choose its own defaults first; its result
+    // is not inspected because the same cast is re-checked just below.
+    BaseClass::autoReconParameter(workOrder);
+
+    // The NCG-specific defaults live on the 2DT work order only.
+    gtPlusReconWorkOrder2DT<T>* const order2DT = dynamic_cast<gtPlusReconWorkOrder2DT<T>*>(workOrder);
+    if ( order2DT == NULL )
+    {
+        return false;
+    }
+
+    // The symmetric and non-symmetric SPIRIT solves share the same defaults,
+    // so spirit_solve_symmetric_ does not need to be consulted here.
+    if ( order2DT->spirit_perform_linear_ )
+    {
+        order2DT->spirit_image_reg_lamda_ = 0.0025;
+        order2DT->spirit_ncg_iter_thres_ = 0.0001;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s)
+{ // Reconstructs one chunk: optional linear SPIRIT solve (base class), then optional NCG solve; the result is written to res, and false is returned on a failed check or a caught exception.
+    try
+    {
+        hoNDArray<T> kspaceLinear(kspace); // copy of the input; handed to the base-class solve as its output array when the linear solve runs
+        res = kspace;
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace");
+        // The linear solve is forced on whenever the non-linear solve is disabled.
+        bool performLinear = workOrder2DT->spirit_perform_linear_;
+        if ( !workOrder2DT->spirit_perform_nonlinear_ ) performLinear = true;
+
+        if ( performLinear )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit linear solver for 2DT ... "));
+            GADGET_CHECK_RETURN_FALSE(BaseClass::performUnwarppingImpl(workOrder2DT, kspace, adj_forward_G_I, kspaceLinear, s));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceLinear, "kspaceLinear");
+        // Non-linear stage: the NCG solver is started from kspaceLinear (assigned to x0_ below).
+        if ( workOrder2DT->spirit_perform_nonlinear_ )
+        {
+            size_t refN = adj_forward_G_I.get_size(4);
+            // sizes come from fixed dimension indices: kspace (0, 1, 3) and the kernel (2, 3, 4)
+            size_t RO = kspace.get_size(0);
+            size_t E1 = kspace.get_size(1);
+            size_t N = kspace.get_size(3);
+
+            size_t srcCHA = adj_forward_G_I.get_size(2);
+            size_t dstCHA = adj_forward_G_I.get_size(3);
+            // per-chunk scale: the value norm2 writes into scaleFactor, divided by RO*sqrt(srcCHA)
+            if ( workOrder2DT->spirit_2D_scale_per_chunk_ )
+            {
+                typename realType<T>::Type scaleFactor = 1.0;
+                Gadgetron::norm2(kspace, scaleFactor);
+                scaleFactor /= (RO*std::sqrt(double(srcCHA)));
+
+                workOrder2DT->spirit_ncg_scale_factor_ = scaleFactor;
+            }
+            // NOTE(review): a zero spirit_ncg_scale_factor_ would make the 1.0/... factors below divide by zero — confirm it cannot be zero here.
+            // apply the scale (factor 1/spirit_ncg_scale_factor_) to the initial guess and to kspace itself; NOTE(review): kspace is the caller's array and no later line in this function rescales it — confirm callers do not reuse it
+            Gadgetron::scal( static_cast<value_type>(1.0/workOrder2DT->spirit_ncg_scale_factor_), kspaceLinear);
+            Gadgetron::scal( static_cast<value_type>(1.0/workOrder2DT->spirit_ncg_scale_factor_), kspace);
+
+            boost::shared_ptr< hoNDArray<T> > coilMapS;
+            // NOTE(review): the refN < N branch below starts at coilMap_->begin() and ignores s, while the else branch offsets by s — confirm this is intended.
+            if ( workOrder2DT->coilMap_ )
+            {
+                if ( refN < N )
+                {
+                    coilMapS = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()));
+                }
+                else
+                {
+                    coilMapS = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+s*RO*E1*dstCHA*refN));
+                }
+            }
+
+            if ( N > 1 )
+            {
+                // 2D+T: more than one frame, so the 2DT SPIRIT operator and the 3D wavelet operators are used
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, refN, adj_forward_G_I.begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(RO, E1, srcCHA, N, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder2DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder2DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 1;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b; // never filled in this function; passed as-is to ncgsolver.solve
+
+                if ( workOrder2DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DTOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setMemoryManager(gtPlus_mem_manager_);
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term (wavelet operator with per-dimension scale factors taken from the work order)
+                    gtPlusWavelet3DOperator<T> wavNullSpace3DOperator;
+                    wavNullSpace3DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNullSpace3DOperator.setAcquiredPoints(acq);
+                    wavNullSpace3DOperator.scale_factor_first_dimension_ = workOrder2DT->spirit_RO_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_second_dimension_ = workOrder2DT->spirit_E1_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_third_dimension_ = workOrder2DT->spirit_temporal_enhancement_ratio_;
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNullSpace3DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    // set operators, each with its weight from the work order
+                    ncgsolver.add(spirit, T(workOrder2DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace3DOperator, T(workOrder2DT->spirit_image_reg_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 2DT ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_2DT_res");
+
+                    spirit.restoreAcquiredKSpace(kspace, res); // presumably writes the acquired samples of kspace back into res — confirm in gtPlusSPIRIT2DTOperator
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_2DT_res_restored");
+                }
+                else // spirit_data_fidelity_lamda_ > 0: "no null space" operators plus a separate data fidelity term; restoreAcquiredKSpace is not called on this path
+                {
+                    gtPlusSPIRITNoNullSpace2DTOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setMemoryManager(gtPlus_mem_manager_);
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace3DOperator<T> wavNoNullSpace3DOperator;
+                    wavNoNullSpace3DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNoNullSpace3DOperator.setAcquiredPoints(acq);
+                    wavNoNullSpace3DOperator.scale_factor_first_dimension_ = workOrder2DT->spirit_RO_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_second_dimension_ = workOrder2DT->spirit_E1_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_third_dimension_ = workOrder2DT->spirit_temporal_enhancement_ratio_;
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNoNullSpace3DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T(workOrder2DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace3DOperator, T(workOrder2DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, T(workOrder2DT->spirit_data_fidelity_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 2DT without null space ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_2DT_res_noNullSpace");
+                }
+            }
+            else
+            {
+                // 2D: N is not greater than 1, so the 2D SPIRIT operator and the 2D wavelet operators are used
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, adj_forward_G_I.begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(RO, E1, srcCHA, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder2DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder2DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 1;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b; // never filled in this function; passed as-is to ncgsolver.solve
+
+                if ( workOrder2DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setMemoryManager(gtPlus_mem_manager_);
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term (2D wavelet operator; no per-dimension scale factors are set on this path)
+                    gtPlusWavelet2DOperator<T> wavNullSpace2DOperator;
+                    wavNullSpace2DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNullSpace2DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    // set operators, each with its weight from the work order
+                    ncgsolver.add(spirit, T(workOrder2DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace2DOperator, T(workOrder2DT->spirit_image_reg_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 2D ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_2D_res");
+
+                    spirit.restoreAcquiredKSpace(kspace, res); // presumably writes the acquired samples of kspace back into res — confirm in gtPlusSPIRIT2DOperator
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_2D_res_restored");
+                }
+                else // spirit_data_fidelity_lamda_ > 0: "no null space" operators plus a separate data fidelity term; restoreAcquiredKSpace is not called on this path
+                {
+                    gtPlusSPIRITNoNullSpace2DOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setMemoryManager(gtPlus_mem_manager_);
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace2DOperator<T> wavNoNullSpace2DOperator;
+                    wavNoNullSpace2DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNoNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder2DT->spirit_use_coil_sen_map_ && workOrder2DT->coilMap_ )
+                    {
+                        wavNoNullSpace2DOperator.setCoilSenMap(coilMapS);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T(workOrder2DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace2DOperator, T(workOrder2DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, T(workOrder2DT->spirit_data_fidelity_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 2D without null space ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_2D_res_noNullSpace");
+                }
+            }
+            // undo the scaling on the result only (kspace and kspaceLinear are left as scaled above)
+            Gadgetron::scal(T(workOrder2DT->spirit_ncg_scale_factor_), res);
+        }
+        else
+        {
+            res = kspaceLinear;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTL1SPIRITNCG<T>::performUnwarppingImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{ // Job-based entry point: unpacks the job's arrays and work order and forwards to the 5-argument overload; returns false if that call fails or an exception is caught.
+    try
+    {
+        hoNDArray<T>& kspace = job.kspace;
+        hoNDArray<T>& ker = job.ker; // forwarded as the adj_forward_G_I argument
+        hoNDArray<T>& res = job.res; // the result is written straight into the job
+        gtPlusReconWorkOrder<T>* workOrder2DT = &(job.workOrder2DT);
+        // job_index_S_ is forwarded as the s argument
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder2DT, kspace, ker, res, job.job_index_S_));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTL1SPIRITNCG<T>::performUnwarppingImpl(job) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
new file mode 100644
index 0000000..68e815f
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTNoAcceleration.h
@@ -0,0 +1,146 @@
+/** \file   gtPlusISMRMRDReconWorker2DTNoAcceleration.h
+    \brief  Implement the 2DT reconstruction without the k-space undersampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+
+#include "GadgetronTimer.h"
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker2DT.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTNoAcceleration : public gtPlusReconWorker2DT<T>
+{
+public:
+    // 2DT worker for data reconstructed without unwrapping: its only override is performRecon.
+    typedef gtPlusReconWorker2DT<T> BaseClass;
+
+    gtPlusReconWorker2DTNoAcceleration() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTNoAcceleration() {}
+
+    virtual bool performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT); // fills workOrder2DT->complexIm_; returns false on failure
+    // Base-class members re-exported so the member functions of this template can name them without this->.
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{ // Reconstruction without unwrapping: estimate coil maps (unless a matching buffered map exists), handle partial Fourier, then coil-combine every S set into complexIm_. Returns false on a failed check or a caught exception.
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder2DT!=NULL);
+        // prepRef lives in the dependent base class, so it is reached through this-> (as performPartialFourierHandling is below).
+        if ( !workOrder2DT->workFlow_use_BufferedKernel_ )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("prepRef"));
+            GADGET_CHECK_RETURN_FALSE(this->prepRef(workOrder2DT, workOrder2DT->ref_, workOrder2DT->ref_recon_, workOrder2DT->ref_coil_map_, 
+                        workOrder2DT->start_RO_, workOrder2DT->end_RO_, workOrder2DT->start_E1_, workOrder2DT->end_E1_, workOrder2DT->data_.get_size(1)));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+        // data_ is indexed as [RO E1 CHA N S] throughout this function
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t CHA = workOrder2DT->data_.get_size(2);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t refN = workOrder2DT->ref_recon_.get_size(3);
+        size_t usedS;
+
+        // (re)estimate the coil sensitivity maps unless a buffered map with matching RO, E1 and S is being reused
+        if ( !workOrder2DT->workFlow_use_BufferedKernel_ 
+                    || (workOrder2DT->coilMap_->get_size(0)!=RO) 
+                    || (workOrder2DT->coilMap_->get_size(1)!=E1)
+                    || (workOrder2DT->coilMap_->get_size(4)!=S) )
+        {
+            workOrder2DT->coilMap_->create(RO, E1, CHA, refN, S);
+
+            // estimate the coil sensitivity
+            if ( workOrder2DT->no_acceleration_same_combinationcoeff_allS_ )
+            {
+                usedS = workOrder2DT->no_acceleration_whichS_combinationcoeff_;
+                if ( usedS >= S ) usedS = S-1;
+                // view of the selected S set; it must keep the CHA dimension so that it lines up with coilMapS below
+                hoNDArray<T> refCoilMapS(RO, E1, CHA, refN, workOrder2DT->ref_coil_map_.begin()+usedS*RO*E1*CHA*refN);
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(refCoilMapS, buffer2DT_));
+
+                hoNDArray<T> coilMapS(RO, E1, CHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*CHA*refN);
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        coilMapS, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+                // presumably replicates the map of set usedS across the last (S) dimension - confirm in repmatLastDimension
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder2DT->coilMap_, usedS));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->ref_coil_map_, buffer2DT_));
+
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap2DNIH(buffer2DT_, 
+                        *workOrder2DT->coilMap_, workOrder2DT->coil_map_algorithm_, workOrder2DT->csm_kSize_, workOrder2DT->csm_powermethod_num_, workOrder2DT->csm_iter_num_, workOrder2DT->csm_iter_thres_, workOrder2DT->csm_use_gpu_));
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder2DT->coilMap_, "coilMap_");
+        }
+
+        // partial fourier handling
+        GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder2DT));
+
+        workOrder2DT->complexIm_.create(RO, E1, N, S);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("perform coil combination"));
+        // the transform result is checked like the other ifft2c calls in this function
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(workOrder2DT->data_, buffer2DT_));
+        for ( usedS=0; usedS<S; usedS++ )
+        {
+            hoNDArray<T> unwarppedIm(RO, E1, CHA, N, buffer2DT_.begin()+usedS*RO*E1*CHA*N);
+            hoNDArray<T> combined(RO, E1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+            // one coil map per frame when refN == N, otherwise the first map of this S set is used for every frame
+            if ( refN == N )
+            {
+                hoNDArray<T> coilMap(RO, E1, CHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*CHA*refN);
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+            }
+            else
+            {
+                hoNDArray<T> coilMap(RO, E1, CHA, workOrder2DT->coilMap_->begin()+usedS*RO*E1*CHA*refN);
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine(unwarppedIm, coilMap, combined);
+            }
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->complexIm_, "combined");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder2DT<T>* workOrder2DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h
new file mode 100644
index 0000000..05e2052
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker2DTSPIRIT.h
@@ -0,0 +1,685 @@
+/** \file   gtPlusISMRMRDReconWorker2DTSPIRIT.h
+    \brief  Implement the 2DT linear SPIRIT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker2DT.h"
+#include "gtPlusSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusLSQRSolver.h"
+
+#include "GadgetCloudController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+template <typename T> 
+class gtPlusReconWorker2DTSPIRIT : public gtPlusReconWorker2DT<T>
+{
+public:
+    // 2DT worker for the linear SPIRIT reconstruction: kernel calibration plus an LSQR-based unwrapping step.
+    typedef gtPlusReconWorker2DT<T> BaseClass;
+    typedef typename realType<T>::Type value_type;
+
+    gtPlusReconWorker2DTSPIRIT() : BaseClass() {}
+    virtual ~gtPlusReconWorker2DTSPIRIT() {}
+    // Calibration: performCalibPrep allocates kernel_ / kernelIm_ on the work order; performCalibImpl fills the slot selected by (n, usedS).
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS);
+    // Unwrapping entry points.
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s);
+    virtual bool performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data);
+
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder); // picks spirit_iter_max_ / spirit_iter_thres_ / spirit_reg_lamda_ from acceFactorE1_
+    // Base-class members re-exported so the member functions of this template can name them without this->.
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+    using BaseClass::buffer2DT_;
+    using BaseClass::buffer2DT_unwrapping_;
+    using BaseClass::buffer2DT_partial_fourier_;
+    using BaseClass::buffer2DT_partial_fourier_kspaceIter_;
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+    // NOTE(review): performCalibImpl builds its own local gtPlusSPIRIT2DOperator rather than using this member — confirm where spirit_ is used.
+    gtPlusSPIRIT<T> spirit_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT)
+{
+    // Allocate the k-space kernel (kernel_) and image-domain kernel (kernelIm_) buffers on the work order; returns false on a NULL work order or a caught exception.
+    try
+    {
+        // a NULL work order is reported as a failure instead of being dereferenced
+        GADGET_CHECK_RETURN_FALSE(workOrder2DT!=NULL);
+
+        const size_t RO = workOrder2DT->data_.get_size(0);
+        const size_t E1 = workOrder2DT->data_.get_size(1);
+        const size_t S = workOrder2DT->data_.get_size(4);
+
+        const size_t srcCHA = ref_src.get_size(2); // channel counts and frame count come from the reference arrays
+        const size_t dstCHA = ref_dst.get_size(2);
+        const size_t refN = ref_dst.get_size(3);
+
+        const size_t kRO = workOrder2DT->spirit_kSize_RO_;
+        const size_t kE1 = workOrder2DT->spirit_kSize_E1_;
+
+        workOrder2DT->kernel_->create(kRO, kE1, srcCHA, dstCHA, 1, 1, refN, S);
+        workOrder2DT->kernelIm_->create(RO, E1, srcCHA, dstCHA, refN, S);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTSPIRIT<T>::performCalibPrep(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    // Fill in default linear SPIRIT solver settings on a 2DT work order.
+    //
+    // workOrder : must actually point to a gtPlusReconWorkOrder2DT<T>;
+    //             anything else (including NULL) makes the function
+    //             return false without writing any field.
+    // returns   : true once spirit_iter_max_, spirit_iter_thres_ and
+    //             spirit_reg_lamda_ have been written.
+
+    gtPlusReconWorkOrder2DT<T>* const order2DT = dynamic_cast<gtPlusReconWorkOrder2DT<T>*>(workOrder);
+    if ( order2DT == NULL )
+    {
+        return false;
+    }
+
+    // Only the E1 acceleration factor drives the choice.
+    const double acceE1 = order2DT->acceFactorE1_;
+
+    // The iteration cap is the only setting that varies with acceleration:
+    //     acceE1 >= 6  -> 150
+    //     acceE1 >= 5  -> 120
+    //     acceE1 >= 4  -> 100
+    //     acceE1 >= 3  ->  60
+    //     otherwise    ->  50
+    int iterMax = 50;
+    if ( acceE1 >= 6 )      iterMax = 150;
+    else if ( acceE1 >= 5 ) iterMax = 120;
+    else if ( acceE1 >= 4 ) iterMax = 100;
+    else if ( acceE1 >= 3 ) iterMax = 60;
+
+    order2DT->spirit_iter_max_ = iterMax;
+
+    // Threshold and regularization are the same for every
+    // acceleration factor.
+    order2DT->spirit_iter_thres_ = 0.0015;
+    order2DT->spirit_reg_lamda_ = 0.005;
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, gtPlusReconWorkOrder2DT<T>* workOrder2DT, size_t n, size_t usedS)
+{ // Calibrates the SPIRIT kernel for the (n, usedS) slot of the reference data and stores it in kernel_ and, as an image-domain kernel, in kernelIm_. Returns false on a failed check or a caught exception.
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3); // not used below
+        size_t S = workOrder2DT->data_.get_size(4); // not used below
+
+        size_t srcCHA = ref_src.get_size(2);
+
+        size_t refRO = ref_dst.get_size(0);
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refN = ref_dst.get_size(3);
+        size_t dstCHA = ref_dst.get_size(2);
+
+        size_t kRO = workOrder2DT->spirit_kSize_RO_;
+        size_t kE1 = workOrder2DT->spirit_kSize_E1_;
+        // 3D arrays built on a pointer into the reference data at slot (n, usedS); const_cast is used because the inputs are const references
+        ho3DArray<T> acsSrc(refRO, refE1, srcCHA, const_cast<T*>(ref_src.begin()+n*refRO*refE1*srcCHA+usedS*refRO*refE1*srcCHA*refN));
+        ho3DArray<T> acsDst(refRO, refE1, dstCHA, const_cast<T*>(ref_dst.begin()+n*refRO*refE1*dstCHA+usedS*refRO*refE1*dstCHA*refN));
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsSrc, "acsSrc");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsDst, "acsDst");
+        // kernel array built on the (n, usedS) slot of workOrder2DT->kernel_
+        ho6DArray<T> ker(kRO, kE1, srcCHA, dstCHA, 1, 1, 
+                            workOrder2DT->kernel_->begin()
+                            +n*kRO*kE1*srcCHA*dstCHA
+                            +usedS*kRO*kE1*srcCHA*dstCHA*refN);
+
+        gtPlusSPIRIT2DOperator<T> spirit;
+        spirit.setMemoryManager(gtPlus_mem_manager_);
+
+        spirit.calib_use_gpu_ = workOrder2DT->spirit_use_gpu_;
+        // NOTE(review): the result of calib (if it returns one) is not checked, unlike imageDomainKernel below — confirm.
+        spirit.calib(acsSrc, acsDst, workOrder2DT->spirit_reg_lamda_, kRO, kE1, 1, 1, ker);
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ker, "ker");
+
+        bool minusI = true; // forwarded to imageDomainKernel; presumably requests the (G - I) form of the kernel — confirm in gtPlusSPIRIT
+        // image-domain kernel array built on the (n, usedS) slot of workOrder2DT->kernelIm_
+        hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, workOrder2DT->kernelIm_->begin()+n*RO*E1*srcCHA*dstCHA+usedS*RO*E1*srcCHA*dstCHA*refN);
+        GADGET_CHECK_RETURN_FALSE(spirit.imageDomainKernel(ker, kRO, kE1, 1, 1, RO, E1, kIm, minusI));
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kIm, "kIm");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTSPIRIT<T>::performCalibImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder2DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t s)
+{
+    try
+    {
+        size_t refN = adj_forward_G_I.get_size(4);
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t N = kspace.get_size(3);
+
+        size_t srcCHA = adj_forward_G_I.get_size(2);
+        size_t dstCHA = adj_forward_G_I.get_size(3);
+
+        res.create(kspace.get_dimensions());
+
+        int n;
+
+        #ifdef USE_OMP
+            int numThreads = (N<64) ? N : 64;
+
+            int numOpenMPProcs = omp_get_num_procs();
+            GADGET_MSG("gtPlusReconWorker2DTSPIRIT, numOpenMPProcs : " << numOpenMPProcs);
+
+            int maxOpenMPThreads = omp_get_max_threads();
+            GADGET_MSG("gtPlusReconWorker2DTSPIRIT, maxOpenMPThreads : " << maxOpenMPThreads);
+
+            int allowOpenMPNested = omp_get_nested();
+
+            if ( N < numOpenMPProcs-2 )
+            {
+                omp_set_nested(1);
+                allowOpenMPNested = 1;
+            }
+            else
+            {
+                omp_set_nested(0);
+                allowOpenMPNested = 0;
+            }
+
+            GADGET_MSG("gtPlusReconWorker2DTSPIRIT, allowOpenMPNested : " << allowOpenMPNested);
+            GADGET_MSG("gtPlusReconWorker2DTSPIRIT, numThreads : " << numThreads);
+        #endif
+
+        GADGET_MSG("gtPlusReconWorker2DTSPIRIT, processing starts ... ");
+
+        hoNDArray<T> ker_Shifted(adj_forward_G_I);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(adj_forward_G_I, ker_Shifted);
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ker_Shifted, "ker_Shifted");
+
+        hoNDArray<T> kspace_Shifted(kspace);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(kspace, kspace_Shifted);
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace_Shifted, "kspace_Shifted");
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(n) shared(RO, E1, srcCHA, dstCHA, kspace_Shifted, ker_Shifted, workOrder2DT, refN, N) num_threads(numThreads)
+        #else
+            #pragma omp parallel default(none) private(n) shared(RO, E1, srcCHA, dstCHA, kspace_Shifted, ker_Shifted, workOrder2DT, res, refN, N) num_threads(numThreads)
+        #endif
+        {
+            gtPlusSPIRIT2DOperator<T> spirit;
+            // spirit.setMemoryManager(gtPlus_mem_manager_);
+            spirit.use_symmetric_spirit_ = false;
+            spirit.use_non_centered_fft_ = true;
+
+            if ( refN == 1 )
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, ker_Shifted.begin()));
+                spirit.setForwardKernel(ker, false);
+            }
+
+            gtPlusLSQRSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> > cgSolver;
+
+            cgSolver.iterMax_ = workOrder2DT->spirit_iter_max_;
+            cgSolver.thres_ = workOrder2DT->spirit_iter_thres_;
+            cgSolver.printIter_ = workOrder2DT->spirit_print_iter_;
+
+            cgSolver.set(spirit);
+
+            hoNDArray<T> b(RO, E1, srcCHA);
+
+            #pragma omp for
+            for ( n=0; n<(int)N; n++ )
+            {
+                hoNDArray<T> unwarppedKSpace(RO, E1, dstCHA, res.begin()+n*RO*E1*dstCHA);
+
+                int kernelN = n;
+                if ( kernelN >= refN ) kernelN = refN-1;
+
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(RO, E1, srcCHA, kspace_Shifted.begin()+n*RO*E1*srcCHA));
+                spirit.setAcquiredPoints(acq);
+
+                cgSolver.x0_ = acq.get();
+
+                if ( refN > 1 )
+                {
+                    boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(RO, E1, srcCHA, dstCHA, ker_Shifted.begin()+kernelN*RO*E1*srcCHA*dstCHA));
+                    spirit.setForwardKernel(ker, false);
+
+                    // compute rhs
+                    spirit.computeRighHandSide(*acq, b);
+
+                    // solve
+                    cgSolver.solve(b, unwarppedKSpace);
+                }
+                else
+                {
+                    // compute rhs
+                    spirit.computeRighHandSide(*acq, b);
+
+                    // solve
+                    cgSolver.solve(b, unwarppedKSpace);
+                }
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedKSpace, "unwarppedKSpace_n");
+
+                // restore the acquired points
+                spirit.restoreAcquiredKSpace(*acq, unwarppedKSpace);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedKSpace, "unwarppedKSpace_n_setAcq");
+            }
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "res_Shifted");
+
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fftshift2D(res, kspace_Shifted);
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace_Shifted, "res");
+        res = kspace_Shifted;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTSPIRIT<T>::performUnwarppingImpl(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/** \brief  Unwrap the undersampled 2DT kspace with SPIRIT and perform the coil combination
+
+    The dimensions are read from the work order: RO, E1, N and S are dimensions 0, 1, 3 and 4 of data_;
+    srcCHA, dstCHA and refN are dimensions 2, 3 and 4 of kernelIm_.
+
+    Processing steps:
+    1) compute the scaling factor from the norm of at most the first 100 N of data_dst and
+       store it in spirit_ncg_scale_factor_
+    2) decide whether the recon is split into jobs (job_split_by_S_, job_num_of_N_ or job_max_Megabytes_)
+    3) split recon  : the jobs are run on the cloud and/or on the local computer and merged by combineReconJob
+       single recon : every S is unwrapped directly into fullkspace_
+    4) for every S, fullkspace_ is transformed to image domain and coil combined into complexIm_
+
+    NOTE(review): complexIm_ is only allocated here in the single recon branch; presumably
+    combineReconJob allocates it for the split branch -- to be confirmed.
+
+    \param  workOrder2DT : work order supplying the parameters, the kernel (kernelIm_) and the coil map (coilMap_);
+                           receives fullkspace_ and complexIm_
+    \param  data_dst     : undersampled kspace to unwrap, indexed here as [RO E1 srcCHA N S]
+    \return true if succeeded; false if any step failed or an exception was caught
+*/
+template <typename T>
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        size_t RO = workOrder2DT->data_.get_size(0);
+        size_t E1 = workOrder2DT->data_.get_size(1);
+        size_t N = workOrder2DT->data_.get_size(3);
+        size_t S = workOrder2DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder2DT->kernelIm_->get_size(2);
+        size_t dstCHA = workOrder2DT->kernelIm_->get_size(3);
+
+        size_t refN = workOrder2DT->kernelIm_->get_size(4);
+
+        size_t usedS;
+
+        // compute the scaling factor
+        // only the first numOfNForScaling frames are used if the series is longer than that
+        typename realType<T>::Type scaleFactor = 1.0;
+        const size_t numOfNForScaling = 100;
+        if ( N > numOfNForScaling )
+        {
+            hoNDArray<T> kspaceForScaleFactor(RO, E1, srcCHA, numOfNForScaling, const_cast<T*>(data_dst.begin()));
+            Gadgetron::norm2(kspaceForScaleFactor, scaleFactor);
+            scaleFactor /= (numOfNForScaling*std::sqrt(double(srcCHA)));
+        }
+        else
+        {
+            Gadgetron::norm2(data_dst, scaleFactor);
+            scaleFactor /= (N*std::sqrt(double(srcCHA)));
+        }
+
+        workOrder2DT->spirit_ncg_scale_factor_ = scaleFactor;
+
+        // split the jobs
+        bool splitByS = workOrder2DT->job_split_by_S_;
+        size_t jobN = workOrder2DT->job_num_of_N_;
+        size_t jobMegaBytes = workOrder2DT->job_max_Megabytes_;
+        size_t overlapN = workOrder2DT->job_overlap_;
+        size_t maxNumOfBytesPerJob = jobMegaBytes*1024*1024;
+
+        // no overlap between jobs is used for the linear SPIRIT
+        if ( workOrder2DT->recon_algorithm_==ISMRMRD_SPIRIT )
+        {
+            overlapN = 0;
+        }
+
+        bool splitJobs = (splitByS==true || jobN>0);
+        if ( !splitJobs )
+        {
+            if ( jobMegaBytes>0 )
+            {
+                // size of the kernel for one N in whole megabytes
+                size_t kernelMegaBytesPerN = RO*E1*srcCHA*dstCHA*sizeof(T)/1024/1024;
+
+                // the integer division gives 0 for kernels smaller than one megabyte, which would cause a
+                // division by zero below; count such a kernel as one megabyte (errs on the side of splitting)
+                if ( kernelMegaBytesPerN == 0 ) kernelMegaBytesPerN = 1;
+
+                // this estimate only decides whether to split; jobN itself is left unchanged
+                size_t largestJobN = jobMegaBytes/kernelMegaBytesPerN;
+                if ( largestJobN < N ) splitJobs = true;
+                GADGET_MSG("SPIRIT - 2DT - size of largest job : " << largestJobN);
+            }
+        }
+
+        if ( !workOrder2DT->CloudComputing_ )
+        {
+            // without the cloud, one job covering all N is the same as no split
+            if ( jobN >= N ) splitJobs = false;
+        }
+
+        if ( splitJobs )
+        {
+            bool runJobsOnCloud = workOrder2DT->CloudComputing_;
+            unsigned int cloudSize = workOrder2DT->CloudSize_;
+            bool runJobsOnLocalNode = workOrder2DT->job_perform_on_control_node_;
+
+            std::vector<gtPlusReconJob2DT<T> > jobList;
+
+            if ( runJobsOnCloud )
+            {
+                unsigned int j;
+
+                // adjust jobN according to the cloud size and the maximal job size
+                GADGET_CHECK_RETURN_FALSE(this->estimateJobSize(workOrder2DT, maxNumOfBytesPerJob, overlapN, cloudSize, jobN));
+
+                // split the job
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(workOrder2DT, const_cast<hoNDArray<T>&>(data_dst), *(workOrder2DT->kernelIm_), splitByS, jobN, jobMegaBytes, overlapN, jobList));
+
+                if ( runJobsOnLocalNode )
+                {
+                    // shrink the jobs until there are more jobs than cloud nodes
+                    // (presumably so that some jobs are left over for the local computer)
+                    // jobN is unsigned : stop at 1, decrementing further would wrap around and never terminate
+                    while ( jobList.size() <= cloudSize && jobN > 1 )
+                    {
+                        jobN--;
+                        jobList.clear();
+                        GADGET_CHECK_RETURN_FALSE(this->splitReconJob(workOrder2DT, const_cast<hoNDArray<T>&>(data_dst), *(workOrder2DT->kernelIm_), splitByS, jobN, jobMegaBytes, overlapN, jobList));
+                    }
+                }
+
+                // the completed jobs get the same work order and index range as the jobs to be sent
+                std::vector<gtPlusReconJob2DT<T> > completedJobList(jobList.size());
+
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    jobList[j].workOrder2DT.duplicate(completedJobList[j].workOrder2DT);
+                    completedJobList[j].job_index_startN_ = jobList[j].job_index_startN_;
+                    completedJobList[j].job_index_endN_ = jobList[j].job_index_endN_;
+                    completedJobList[j].job_index_S_ = jobList[j].job_index_S_;
+                }
+
+                GADGET_MSG("SPIRIT - 2DT - total job : " << jobList.size() << " - job N : " << jobN << " - cloud size : " << cloudSize);
+
+                // if the local computer takes part, it keeps a share of 1/(cloudSize+1) of the jobs
+                unsigned int numOfJobRunOnCloud = jobList.size() - jobList.size()/(cloudSize+1);
+                if ( !runJobsOnLocalNode ) numOfJobRunOnCloud = jobList.size();
+                GADGET_MSG("SPIRIT - 2DT - numOfJobRunOnCloud : " << numOfJobRunOnCloud << " ... ");
+
+                typedef Gadgetron::GadgetCloudController< gtPlusReconJob2DT<T> > GTCloudControllerType;
+                GTCloudControllerType controller;
+
+                // every failure of the cloud below resets runJobsOnCloud, so all jobs are processed locally instead
+                if (controller.open () == -1)
+                {
+                    GADGET_ERROR_MSG("Cloud controller cannot open the cloud ...");
+                    controller.handle_close (ACE_INVALID_HANDLE, 0);
+                    runJobsOnCloud = false;
+                }
+                else
+                {
+                    std::vector<gtPlusReconJob2DT<T>* > jobListCloud(numOfJobRunOnCloud);
+                    std::vector<gtPlusReconJob2DT<T>* > completedJobListCloud(numOfJobRunOnCloud);
+                    std::vector<int> node_ids(numOfJobRunOnCloud);
+
+                    GADGET_CHECK_RETURN_FALSE(this->scheduleJobForNodes(workOrder2DT, numOfJobRunOnCloud, node_ids));
+
+                    for ( j=0; j<numOfJobRunOnCloud; j++ )
+                    {
+                        jobListCloud[j] = &jobList[j];
+                        completedJobListCloud[j] = &completedJobList[j];
+                        GADGET_MSG("--> job " << j << " runs on node " << node_ids[j] << " ... ");
+                    }
+
+                    std::vector<GadgetMessageReader*> readers(cloudSize, NULL);
+                    std::vector<GadgetMessageWriter*> writers(cloudSize, NULL);
+
+                    // NOTE(review): the readers/writers are never deleted in this function; presumably
+                    // createConnector takes the ownership -- to be confirmed
+                    for ( j=0; j<cloudSize; j++ )
+                    {
+                        readers[j] = new GtPlusCloudJobMessageReaderCPFL();
+                        writers[j] = new GtPlusCloudJobMessageWriterCPFL();
+                    }
+
+                    if ( controller.createConnector(workOrder2DT->gt_cloud_, GADGET_MESSAGE_CLOUD_JOB, readers, GADGET_MESSAGE_CLOUD_JOB, writers) != 0 )
+                    {
+                        GADGET_ERROR_MSG("Cloud controller creates connectors failed ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else if ( controller.connectToCloud(workOrder2DT->gt_cloud_) != 0 )
+                    {
+                        GADGET_ERROR_MSG("Cloud controller cannot connect to the cloud ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else
+                    {
+                        if ( controller.runJobsOnCloud(jobListCloud, completedJobListCloud, node_ids) != 0 )
+                        {
+                            GADGET_ERROR_MSG("Cloud controller runs jobs on the cloud failed ...");
+                            controller.closeCloudNode();
+                            controller.handle_close (ACE_INVALID_HANDLE, 0);
+                            runJobsOnCloud = false;
+                        }
+                        else
+                        {
+                            controller.closeCloudNode();
+
+                            // run the left over jobs on the local computer
+                            for ( j=numOfJobRunOnCloud; j<jobList.size(); j++ )
+                            {
+                                GADGET_MSG("SPIRIT - 2DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 2DT ... "));
+                                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                std::ostringstream ostr;
+                                ostr << "job_fullkspace" << "_" << j;
+                                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, jobList[j].res, ostr.str());
+                            }
+
+                            // wait the cloud job to complete
+                            controller.waitForJobToComplete();
+
+                            // combine results from cloud and local run
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                jobList[j].res = controller.completed_job_list_[j]->res;
+                                jobList[j].complexIm = controller.completed_job_list_[j]->complexIm;
+                            }
+
+                            // if some jobs are not actually completed, process them
+                            // a job counts as uncompleted if neither res nor complexIm has the size expected from kspace
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                if ( 
+                                    !jobList[j].res.dimensions_equal(&jobList[j].kspace) 
+                                        && 
+                                    ( jobList[j].complexIm.get_size(0)!= jobList[j].kspace.get_size(0) 
+                                    || jobList[j].complexIm.get_size(1)!= jobList[j].kspace.get_size(1) 
+                                    || jobList[j].complexIm.get_size(2)!= jobList[j].kspace.get_size(2) ) 
+                                   )
+                                {
+                                    GADGET_MSG("SPIRIT - 2DT - uncompleted cloud job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 2DT ... "));
+                                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                    std::ostringstream ostr;
+                                    ostr << "job_fullkspace" << "_" << j;
+                                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, jobList[j].res, ostr.str());
+                                }
+                            }
+
+                            // combine the job
+                            GADGET_CHECK_RETURN_FALSE(this->combineReconJob(workOrder2DT, jobList, N, S));
+
+                            // clear the memory
+                            jobList.clear();
+                        }
+                    }
+                }
+            }
+
+            // local processing : either the cloud is not used or the cloud run failed above
+            if ( !runJobsOnCloud )
+            {
+                // a failed cloud attempt may have left its jobs in the list; start from an empty list
+                jobList.clear();
+
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(workOrder2DT, const_cast<hoNDArray<T>&>(data_dst), *(workOrder2DT->kernelIm_), splitByS, jobN, jobMegaBytes, overlapN, jobList));
+
+                GADGET_MSG("SPIRIT - 2DT - total job : " << jobList.size());
+
+                size_t j;
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    GADGET_MSG("SPIRIT - 2DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("L1 SPIRIT NCG 2DT ... "));
+                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, jobList[j].res, "job_fullkspace");
+                }
+
+                // combine the job
+                GADGET_CHECK_RETURN_FALSE(this->combineReconJob(workOrder2DT, jobList, N, S));
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder2DT->fullkspace_, "fullkspace");
+
+                // clear the memory
+                jobList.clear();
+            }
+        }
+        else
+        {
+            workOrder2DT->complexIm_.create(RO, E1, N, S);
+
+            // downstream coil compression is not supported here
+            // kspace is always reconed
+
+            // fullkspace_ starts as a copy of the input and is overwritten S by S with the unwrapped kspace
+            workOrder2DT->fullkspace_ = data_dst;
+
+            for ( usedS=0; usedS<S; usedS++ )
+            {
+                // the arrays below are created on existing memory of the kernel, the input and fullkspace_ for this S
+                hoNDArray<T> kIm(RO, E1, srcCHA, dstCHA, refN, workOrder2DT->kernelIm_->begin()+usedS*RO*E1*srcCHA*dstCHA*refN);
+
+                hoNDArray<T> aliasedKSpace(RO, E1, srcCHA, N, const_cast<T*>(data_dst.begin())+usedS*RO*E1*srcCHA*N);
+
+                hoNDArray<T> unwarppedKSpace(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+
+                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder2DT, aliasedKSpace, kIm, unwarppedKSpace, usedS));
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedKSpace, "unwarppedKSpace");
+            }
+        }
+
+        // buffer for the multi-channel images of one S, reused for every S
+        hoNDArrayMemoryManaged<T> complexImMultiChannel(RO, E1, dstCHA, N, gtPlus_mem_manager_);
+
+        // perform coil combination
+        for ( usedS=0; usedS<S; usedS++ )
+        {
+            hoNDArray<T> unwarppedKSpace(RO, E1, dstCHA, N, workOrder2DT->fullkspace_.begin()+usedS*RO*E1*dstCHA*N);
+
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft2c(unwarppedKSpace, complexImMultiChannel);
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImMultiChannel, "unwarppedComplexIm");
+
+            hoNDArray<T> combined(RO, E1, N, workOrder2DT->complexIm_.begin()+usedS*RO*E1*N);
+
+            if ( refN == N )
+            {
+                // one coil map for every N
+                hoNDArray<T> coilMap(RO, E1, dstCHA, refN, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImMultiChannel, coilMap, combined);
+            }
+            else
+            {
+                // the first coil map of this S is used for all N
+                hoNDArray<T> coilMap(RO, E1, dstCHA, workOrder2DT->coilMap_->begin()+usedS*RO*E1*dstCHA*refN);
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine(complexImMultiChannel, coilMap, combined);
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, combined, "combined");
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTSPIRIT<T>::performUnwrapping(gtPlusReconWorkOrder2DT<T>* workOrder2DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Unwrap one recon job
+// the kspace, kernel, result array, work order and S index stored in the job are
+// forwarded to the array based performUnwarppingImpl(...)
+// returns false if the forwarded call fails or if any exception is caught
+template <typename T>
+bool gtPlusReconWorker2DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        // the work order stored in the job is handed over as a pointer to the base work order type
+        gtPlusReconWorkOrder<T>* jobWorkOrder = &(job.workOrder2DT);
+
+        hoNDArray<T>& aliasedKSpace = job.kspace;
+        hoNDArray<T>& kernel = job.ker;
+        hoNDArray<T>& unwrapped = job.res;
+
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobWorkOrder, aliasedKSpace, kernel, unwrapped, job.job_index_S_));
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker2DTSPIRIT<T>::performUnwarppingImpl(...) ... ");
+    }
+
+    return false;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h
new file mode 100644
index 0000000..b999e21
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DT.h
@@ -0,0 +1,2749 @@
+/** \file   gtPlusISMRMRDReconWorker3DT.h
+    \brief  Define the base class for the GtPlus worker for 3D or 3DT reconstruction cases
+
+            Four different strategies were implemented for partial fourier or asymmetric echo acquisition, including:
+
+            ISMRMRD_PF_ZEROFILLING          : only zero filling the unacquired k-space
+
+            ISMRMRD_PF_ZEROFILLING_FILTER   : zero filling the unacquired k-space and apply a transition filter on the edges between
+                                              acquired and unacquired regions
+
+            ISMRMRD_PF_POCS                 : perform the iterative POCS reconstruction
+                                              Magnetic Resonance Imaging: Physical Principles and Sequence Design. Page 296-297.
+                                              E. Mark Haacke, Robert W. Brown, Michael R. Thompson, Ramesh Venkatesan. 
+                                              Wiley-Liss, ISBN-10: 0471351288.
+
+            ISMRMRD_PF_FENGHUANG            : perform a k-space convolution based partial fourier reconstruction. 
+
+                                              Feng Huang, Wei Lin, and Yu Li. 
+                                              Partial Fourier Reconstruction Through Data Fitting and Convolution in k-Space.
+                                              Magnetic Resonance in Medicine, Vol 62, page 1261-1269, 2009.
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/** \brief  Base class of the GtPlus workers for 3D or 3DT reconstruction
+
+    The reconstruction is performed on the 5D array [RO E1 E2 CHA N].
+    Derived workers have to implement computeKSpace(...); the other virtual functions can be overridden.
+*/
+template <typename T>
+class gtPlusReconWorker3DT : public gtPlusReconWorker<T>
+{
+public:
+
+    typedef gtPlusReconWorker<T> BaseClass;
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+    typedef typename BaseClass::value_type value_type;
+
+    gtPlusReconWorker3DT() : BaseClass(), startE1_(0), endE1_(0), startE2_(0), endE2_(0) {}
+    virtual ~gtPlusReconWorker3DT() {}
+
+    // entry point of the recon for a generic work order
+    // if the incoming data is all zeros, a zero filled complexIm_ is created and true is returned without any recon
+    // otherwise the work order must be a gtPlusReconWorkOrder3DT (false is returned if it is not) and
+    // the 3DT recon is performed, after the automatic parameter selection if recon_auto_parameters_ is set
+    virtual bool performRecon(gtPlusReconWorkOrder<T>* workOrder)
+    {
+        // check whether we have all-zeros input
+        value_type v(1);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::norm2(workOrder->data_, v));
+        if ( v <= 0 )
+        {
+            GADGET_WARN_MSG("gtPlusReconWorker3DT, performRecon(workOrder) : incoming data contains all-zeros ... ");
+
+            // the result has the size of the input, except that dimension 3 is set to the number of result channels
+            boost::shared_ptr< std::vector<size_t> > dims = workOrder->data_.get_dimensions();
+
+            // dimension 3 must exist before it is overwritten
+            GADGET_CHECK_RETURN_FALSE(dims->size() > 3);
+
+            (*dims)[3] = workOrder->num_channels_res_;
+            workOrder->complexIm_.create(dims);
+            Gadgetron::clear(workOrder->complexIm_);
+
+            return true;
+        }
+
+        gtPlusReconWorkOrder3DT<T>* workOrder3DT = dynamic_cast<gtPlusReconWorkOrder3DT<T>*>(workOrder);
+        if ( workOrder3DT == NULL ) return false;
+
+        if ( workOrder3DT->recon_auto_parameters_ )
+        {
+            this->autoReconParameter(workOrder3DT);
+            GADGET_MSG("Gt Plus 3DT -- automatic paramter selection ---");
+            workOrder3DT->print(std::cout);
+        }
+
+        return this->performRecon(workOrder3DT);
+    }
+
+    // the common functionalities are performed here for 3DT recon
+    // compute the coil compression coefficients
+    // prepare the ref data array
+    virtual bool performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+
+    // coil map estimation and kernel calibration from the prepared ref data
+    // ref_src/ref_dst : ref data for the source/destination channels
+    // ref_coil_map_dst : ref data used for the coil map estimation
+    virtual bool estimateCoilMap(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalib(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst);
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN);
+
+    // unwrap the undersampled data
+    virtual bool performUnwrapping(WorkOrderType* workOrder3DT, const hoNDArray<T>& data);
+
+    // the partial fourier handling for the 3DT reconstruction
+    // the computation is performed on the reconstructed full kspace
+    virtual bool performPartialFourierHandling(WorkOrderType* workOrder3DT);
+
+    // perform the kspace filter on ref data for coil map estimation
+    virtual bool performRefFilter(gtPlusReconWorkOrder3DT<T>* workOrder3DT, 
+                                        const hoNDArray<T>& ref, hoNDArray<T>& refFiltered, 
+                                        int startRO, int endRO, int startE1, int endE1, int startE2, int endE2);
+
+    // for interleave, compute mean ref
+    // for embedded and separate, squeeze out the zero lines
+    virtual bool prepRef(WorkOrderType* workOrder3DT, 
+                        const hoNDArray<T>& ref, 
+                        hoNDArray<T>& refRecon, 
+                        hoNDArray<T>& refCoilMap, 
+                        int startRO, int endRO, 
+                        int startE1, int endE1, 
+                        int startE2, int endE2, 
+                        size_t dataE1, 
+                        size_t dataE2);
+
+    // implement reference data preparation
+    virtual bool prepRefByAveragingCrossN(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon);
+
+    // compute coil compression coefficients
+    virtual bool coilCompression(WorkOrderType* workOrder3DT);
+
+    // after unwrapping, for embedded and separate, the full res coil map may be estimated
+    // for embedded, the ref may be filled back to fullkspace
+    virtual bool afterUnwrapping(WorkOrderType* workOrder3DT);
+
+    // whether to recon kspace, if true, the coil combination may not be performed, only the fullkspace is computed
+    // pure virtual, to be implemented by the derived workers
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT) = 0;
+
+    // ----------------------------------------------------
+    // common functions for 3DT reconstruction
+    // ----------------------------------------------------
+    // image domain kernel with coil sensitivity
+    // kerIm: [RO E1 E2 srcCHA dstCHA]
+    // coilMap: [RO E1 E2 dstCHA]
+    // unmixCoeff: [RO E1 E2 srcCHA]
+    // gFactor: [RO E1 E2]
+    bool unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor);
+
+    // apply image domain kernel
+    // kspace: [RO E1 E2 srcCHA ...]
+    // complexIm : [RO E1 E2 dstCHA ...]
+    bool applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 E2 srcCHA ...]
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm);
+    // for speed, a buffer can be provided
+    bool applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm);
+
+    // apply unmixCoeff
+    // kspace: [RO E1 E2 srcCHA ...]
+    // unmixCoeff : [RO E1 E2 srcCHA]
+    // complexIm : [RO E1 E2 ...]
+    bool applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+    // aliasedIm : [RO E1 E2 srcCHA ...]
+    bool applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm);
+
+    // ----------------------------------------------------
+    // Partial fourier handling for 3DT reconstruction
+    // ----------------------------------------------------
+    // apply the partial fourier filter along the edges
+    bool performPartialFourierFilter(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace);
+    // apply the iterative POCS for partial fourier reconstruction
+    bool performPartialFourierPOCSRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace);
+    // apply the Feng Huang partial fourier reconstruction
+    bool performPartialFourierFengHuangRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace);
+
+    // compute Feng Huang kernel and perform recon
+    bool calibFengHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel);
+    bool performReconFangHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, int startRO, int endRO, int startE1, int endE1, int startE2, int endE2, ho6DArray<T>& kernel);
+
+    // estimate job size for 3DT recon
+    // the estimated size is returned in jobSize
+    virtual bool estimateJobSize(gtPlusReconWorkOrder<T>* workOrder3DT, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize);
+
+    // members of the templated base class, made visible for unqualified use
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+protected:
+
+    // helper memory for computation
+    hoNDArray<T> ref_src_;
+    hoNDArray<T> ref_dst_;
+    hoNDArray<T> data_dst_;
+    hoNDArray<T> ref_coil_map_dst_;
+
+    // sampled region along E1/E2
+    size_t startE1_;
+    size_t endE1_;
+
+    size_t startE2_;
+    size_t endE2_;
+};
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performRecon(WorkOrderType* workOrder3DT)
+{
+    // the 3DT recon on 5D array [RO E1 E2 CHA N]
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        if ( !workOrder3DT->workFlow_use_BufferedKernel_ )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("prepRef"));
+            GADGET_CHECK_RETURN_FALSE(prepRef(workOrder3DT, workOrder3DT->ref_, 
+                                            workOrder3DT->ref_recon_, 
+                                            workOrder3DT->ref_coil_map_, 
+                                            workOrder3DT->start_RO_, workOrder3DT->end_RO_, 
+                                            workOrder3DT->start_E1_, workOrder3DT->end_E1_, 
+                                            workOrder3DT->start_E2_, workOrder3DT->end_E2_, 
+                                            workOrder3DT->data_.get_size(1), workOrder3DT->data_.get_size(2)));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("coilCompression"));
+            GADGET_CHECK_RETURN_FALSE(coilCompression(workOrder3DT));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+
+        // apply coil compression coefficients
+        if ( workOrder3DT->workFlow_use_BufferedKernel_ )
+        {
+            if ( workOrder3DT->coil_compression_ && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT )
+            {
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->data_, "data_");
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, *workOrder3DT->coilCompressionCoef_, data_dst_, true));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, data_dst_, "data_dst_");
+            }
+            else
+            {
+                data_dst_ = workOrder3DT->data_;
+            }
+        }
+        else
+        {
+            if ( workOrder3DT->coil_compression_ 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT 
+                && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT 
+                && (workOrder3DT->acceFactorE1_>1 || workOrder3DT->acceFactorE2_>1) )
+            {
+                ref_src_ = workOrder3DT->ref_recon_;
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("apply coil compression ... "));
+
+                #pragma omp parallel sections default(shared)
+                {
+                    #pragma omp section
+                    {
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_src_, "ref_src_");
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(ref_src_, *workOrder3DT->coilCompressionCoef_, ref_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(ref_src_, *workOrder3DT->coilCompressionCoef_, ref_dst_, true);
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_dst_, "ref_dst_");
+                    }
+
+                    #pragma omp section
+                    {
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->data_, "data_");
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, *workOrder3DT->coilCompressionCoef_, data_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, *workOrder3DT->coilCompressionCoef_, data_dst_, true);
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, data_dst_, "data_dst_");
+                    }
+
+                    #pragma omp section
+                    {
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->ref_coil_map_, "ref_coil_map_");
+                        //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_coil_map_, *workOrder3DT->coilCompressionCoef_, ref_coil_map_dst_, true));
+                        gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_coil_map_, *workOrder3DT->coilCompressionCoef_, ref_coil_map_dst_, true);
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_coil_map_dst_, "ref_coil_map_dst_");
+                    }
+                }
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                if ( !workOrder3DT->downstream_coil_compression_ || workOrder3DT->recon_algorithm_==ISMRMRD_SPIRIT || workOrder3DT->recon_algorithm_==ISMRMRD_L1SPIRIT )
+                {
+                    ref_src_ = ref_dst_;
+                }
+            }
+            else
+            {
+                ref_src_ = workOrder3DT->ref_recon_;
+                ref_dst_ = workOrder3DT->ref_recon_;
+                data_dst_ = workOrder3DT->data_;
+                ref_coil_map_dst_ = workOrder3DT->ref_coil_map_;
+            }
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("estimate coil map"));
+            GADGET_CHECK_RETURN_FALSE(this->estimateCoilMap(workOrder3DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+            if ( workOrder3DT->acceFactorE1_>1 || workOrder3DT->acceFactorE2_>1 )
+            {
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performCalib"));
+                GADGET_CHECK_RETURN_FALSE(this->performCalib(workOrder3DT, ref_src_, ref_dst_, ref_coil_map_dst_));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+            }
+        }
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performUnwrapping"));
+        GADGET_CHECK_RETURN_FALSE(this->performUnwrapping(workOrder3DT, data_dst_));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("afterUnwrapping"));
+        GADGET_CHECK_RETURN_FALSE(this->afterUnwrapping(workOrder3DT));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performRecon(WorkOrderType* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+/// Fills workOrder3DT->coilMap_ from ref_coil_map_dst unless computeKSpace() returns true or coilMap_ already matches the data RO/E1/E2; ref_src is not read and ref_dst only supplies sizes. Returns false on failure.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+estimateCoilMap(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4); // not referenced again in this function
+        size_t srcCHA = workOrder3DT->data_.get_size(3); // not referenced again in this function
+
+        size_t refRO = ref_dst.get_size(0); // refRO, refE1 and refE2 are not referenced again in this function
+        size_t refE1 = ref_dst.get_size(1);
+        size_t refE2 = ref_dst.get_size(2);
+        size_t refN = ref_dst.get_size(4);
+        size_t dstCHA = ref_coil_map_dst.get_size(3); // channel count of the coil-map reference
+        // choose, per calibration mode, whether one N slice is shared by all N and which slice that is (other modes keep false / 0)
+        bool same_combinationcoeff_allN = false;
+        size_t whichN_combinationcoeff = 0;
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allN = true;
+            whichN_combinationcoeff = 0;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allN = workOrder3DT->embedded_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->embedded_whichN_combinationcoeff_;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allN = workOrder3DT->separate_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->separate_whichN_combinationcoeff_;
+        }
+        // keep the index inside ref_dst's N range; NOTE(review): refN-1 wraps around if refN can be 0 -- confirm
+        if ( whichN_combinationcoeff >= refN ) whichN_combinationcoeff=refN-1;
+
+        bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+        // nothing is done when computeKSpace() returns true; otherwise a coilMap_ whose RO/E1/E2 already match the data is kept as preset
+        if ( !reconKSpace )
+        {
+            if ( (workOrder3DT->coilMap_->get_size(0)!=RO) 
+                || (workOrder3DT->coilMap_->get_size(1)!=E1)
+                || (workOrder3DT->coilMap_->get_size(2)!=E2) )
+            {
+                if ( same_combinationcoeff_allN )
+                {
+                    size_t usedN = whichN_combinationcoeff;
+                    // array over slice usedN of the reference (presumably wrapping the memory without a copy -- confirm against hoNDArray)
+                    hoNDArray<T> refCoilMapN(RO, E1, E2, dstCHA, const_cast<T*>(ref_coil_map_dst.begin()+usedN*RO*E1*E2*dstCHA));
+
+                    workOrder3DT->coilMap_->create(RO, E1, E2, dstCHA, refN);
+                    //Gadgetron::clear(workOrder3DT->coilMap_.get());
+
+                    // hoNDArray<T> coilMapN(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*dstCHA);
+                    // hoNDArray<T> coilMapN(RO, E1, E2, dstCHA);
+                    hoNDArray<T> coilMapN(RO, E1, E2, dstCHA);
+
+                    hoNDArray<T> buffer3DT(RO, E1, E2, dstCHA);
+                    // ifft3c output of the reference slice is the input of the coil map estimation below
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(refCoilMapN, buffer3DT));
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("coil map estimation ... "));
+                    if ( workOrder3DT->csm_use_gpu_ )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(buffer3DT, 
+                                coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, 
+                                workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                                coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, 
+                                workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                    }
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+                    // store the estimate in slot usedN of coilMap_; repmatLastDimension then presumably copies that slot to every N
+                    memcpy(workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*dstCHA, coilMapN.begin(), coilMapN.get_number_of_bytes());
+                    GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->coilMap_, usedN));
+                }
+                else
+                {
+                    hoNDArray<T> buffer3DT(ref_coil_map_dst.get_dimensions());
+                    // no shared slice: the whole ref_coil_map_dst is transformed and estimated in one call
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(ref_coil_map_dst, buffer3DT));
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("coil map estimation ... "));
+                    if ( workOrder3DT->csm_use_gpu_ )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(buffer3DT, 
+                                *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, 
+                                workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, 
+                                workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                                *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, 
+                                workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, 
+                                workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                    }
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+                }
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->coilMap_, "coilMap_");
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::estimateCoilMap(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Runs performCalibPrep()/performCalibImpl() unless kernelIm_ already holds srcCHA x dstCHA channels (dimensions 3 and 4).
+/// Either one N is calibrated and replicated via repmatLastDimension(), or every N of the data is calibrated; returns false if any step fails or throws.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+performCalib(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, const hoNDArray<T>& ref_coil_map_dst)
+{
+    try
+    {
+        size_t N = workOrder3DT->data_.get_size(4);
+        size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+        size_t refN = ref_dst.get_size(4);
+        size_t dstCHA = ref_coil_map_dst.get_size(3);
+
+        bool same_combinationcoeff_allN = false;
+        size_t whichN_combinationcoeff = 0;
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            same_combinationcoeff_allN = true;
+            whichN_combinationcoeff = 0;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            same_combinationcoeff_allN = workOrder3DT->embedded_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->embedded_whichN_combinationcoeff_;
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+        {
+            same_combinationcoeff_allN = workOrder3DT->separate_same_combinationcoeff_allN_;
+            whichN_combinationcoeff = workOrder3DT->separate_whichN_combinationcoeff_;
+        }
+
+        if ( whichN_combinationcoeff >= refN ) whichN_combinationcoeff=refN-1;
+
+        bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+        // calibration
+        if ( (workOrder3DT->kernelIm_->get_size(3)!=srcCHA) || (workOrder3DT->kernelIm_->get_size(4)!=dstCHA) )
+        {
+            GADGET_CHECK_RETURN_FALSE(this->performCalibPrep(ref_src, ref_dst, workOrder3DT));
+
+            // perform calibration
+            if ( same_combinationcoeff_allN )
+            {
+                size_t usedN = whichN_combinationcoeff;
+
+                // the result is replicated to every N below, so a failed calibration must stop here instead of being ignored
+                GADGET_CHECK_RETURN_FALSE(this->performCalibImpl(ref_src, ref_dst, workOrder3DT, usedN));
+
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->kernel_, usedN));
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->kernelIm_, usedN));
+
+                if ( !reconKSpace )
+                {
+                    if ( workOrder3DT->unmixingCoeffIm_ && (workOrder3DT->unmixingCoeffIm_->get_number_of_elements()>0) )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->unmixingCoeffIm_, usedN));
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(workOrder3DT->gfactor_, usedN));
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->unmixingCoeffIm_, "unmixingCoeffIm_");
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->gfactor_, "gfactor_");
+                    }
+                }
+            }
+            else
+            {
+                int usedN;
+                // AND-reduced over all iterations, because returning from inside the parallel loop is not allowed
+                bool calibOK = true;
+                #ifdef USE_OMP
+                    omp_set_nested(1);
+                #endif // USE_OMP
+
+                #ifdef GCC_OLD_FLAG
+                    #pragma omp parallel for default(none) private(usedN) shared(N, workOrder3DT, reconKSpace) reduction(&&:calibOK)
+                #else
+                    #pragma omp parallel for default(none) private(usedN) shared(N, ref_src, ref_dst, workOrder3DT, reconKSpace) reduction(&&:calibOK)
+                #endif
+                for ( usedN=0; usedN<(int)N; usedN++ )
+                {
+                    calibOK = this->performCalibImpl(ref_src, ref_dst, workOrder3DT, usedN) && calibOK;
+                }
+
+                #ifdef USE_OMP
+                    omp_set_nested(0);
+                #endif // USE_OMP
+
+                // report a failed N only after nested parallelism has been switched off again
+                GADGET_CHECK_RETURN_FALSE(calibOK);
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performCalib(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Calibration set-up hook invoked by performCalib() before performCalibImpl(); this implementation prepares nothing and reports success.
+template <typename T>
+bool gtPlusReconWorker3DT<T>::performCalibPrep(const hoNDArray<T>& /*ref_src*/, const hoNDArray<T>& /*ref_dst*/, WorkOrderType* /*workOrder3DT*/)
+{
+    return true;
+}
+
+/// Calibration hook invoked by performCalib() for one N index (usedN); this implementation calibrates nothing and reports success.
+template <typename T>
+bool gtPlusReconWorker3DT<T>::performCalibImpl(const hoNDArray<T>& /*ref_src*/, const hoNDArray<T>& /*ref_dst*/, WorkOrderType* /*workOrder3DT*/, size_t /*usedN*/)
+{
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performUnwrapping(WorkOrderType* , const hoNDArray<T>& ) // called by performRecon() with the work order and data_dst_; both are ignored here
+{
+    return true; // this implementation unwraps nothing and always reports success
+}
+
+/// Copies ref into refFiltered and applies the reference k-space filter(s) of the work order whose
+/// lengths match the RO/E1/E2 sizes of ref. The 3D filter filterROE1E2_ref_ is preferred; otherwise
+/// all matching per-dimension filters (filterRO_ref_, filterE1_ref_, filterE2_ref_) go into one call.
+/// With no matching filter refFiltered stays an unfiltered copy of ref.
+/// startRO/endRO, startE1/endE1 and startE2/endE2 are accepted but not read here.
+/// Returns false if a filter call fails or anything throws, true otherwise.
+template <typename T>
+bool gtPlusReconWorker3DT<T>::performRefFilter(WorkOrderType* workOrder3DT,
+                                            const hoNDArray<T>& ref,
+                                            hoNDArray<T>& refFiltered,
+                                            int startRO, int endRO,
+                                            int startE1, int endE1,
+                                            int startE2, int endE2)
+{
+    try
+    {
+        // start from a copy so that the output is defined even if no filter is applied
+        refFiltered = ref;
+
+        const size_t lenRO = ref.get_size(0);
+        const size_t lenE1 = ref.get_size(1);
+        const size_t lenE2 = ref.get_size(2);
+
+        // decide once which of the stored filters fit this reference
+        const bool fits3D = (workOrder3DT->filterROE1E2_ref_.get_size(0)==lenRO)
+                         && (workOrder3DT->filterROE1E2_ref_.get_size(1)==lenE1)
+                         && (workOrder3DT->filterROE1E2_ref_.get_size(2)==lenE2);
+        const bool fitsRO = (workOrder3DT->filterRO_ref_.get_number_of_elements()==lenRO);
+        const bool fitsE1 = (workOrder3DT->filterE1_ref_.get_number_of_elements()==lenE1);
+        const bool fitsE2 = (workOrder3DT->filterE2_ref_.get_number_of_elements()==lenE2);
+
+        if ( fits3D )
+        {
+            // one combined RO x E1 x E2 filter
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(ref, workOrder3DT->filterROE1E2_ref_, refFiltered));
+        }
+        else if ( fitsRO && fitsE1 && fitsE2 )
+        {
+            // all three per-dimension filters fit
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(ref, workOrder3DT->filterRO_ref_, workOrder3DT->filterE1_ref_, workOrder3DT->filterE2_ref_, refFiltered));
+        }
+        else if ( fitsRO && fitsE1 )
+        {
+            // RO and E1 fit, E2 does not
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterROE1(ref, workOrder3DT->filterRO_ref_, workOrder3DT->filterE1_ref_, refFiltered));
+        }
+        else if ( fitsRO && fitsE2 )
+        {
+            // RO and E2 fit, E1 does not
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE2(ref, workOrder3DT->filterRO_ref_, workOrder3DT->filterE2_ref_, refFiltered));
+        }
+        else if ( fitsE1 && fitsE2 )
+        {
+            // E1 and E2 fit, RO does not
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterE1E2(ref, workOrder3DT->filterE1_ref_, workOrder3DT->filterE2_ref_, refFiltered));
+        }
+        else if ( fitsRO )
+        {
+            // only RO fits
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterRO(ref, workOrder3DT->filterRO_ref_, refFiltered));
+        }
+        else if ( fitsE1 )
+        {
+            // only E1 fits
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspacefilterE1(ref, workOrder3DT->filterE1_ref_, refFiltered));
+        }
+        else if ( fitsE2 )
+        {
+            // only E2 fits
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterE2(ref, workOrder3DT->filterE2_ref_, refFiltered));
+        }
+        // no branch taken: refFiltered remains the plain copy made above
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performRefFilter(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Builds refRecon from ref across N (dimension 4): KL filtering via computeKLFilter() when 1 <= numOfModes <= N-1, averaging via averageKSpace5D()
+/// when averageAllRef is set, and a plain copy of ref when neither applies. workOrder3DT is not read. Returns false if a helper fails or anything throws.
+template <typename T>
+bool gtPlusReconWorker3DT<T>::prepRefByAveragingCrossN(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref, bool averageAllRef, int numOfModes, hoNDArray<T>& refRecon)
+{
+    try
+    {
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t E2 = ref.get_size(2);
+        size_t CHA = ref.get_size(3);
+        size_t N = ref.get_size(4);
+
+        // KL filtering along N is only attempted for a mode count inside [1, N-1]
+        const bool modesOutOfRange = (numOfModes<1) || (numOfModes>N-1);
+
+        if ( modesOutOfRange && averageAllRef )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(ref, refRecon));
+        }
+        else if ( modesOutOfRange )
+        {
+            refRecon = ref;
+        }
+        else if ( averageAllRef )
+        {
+            hoNDArray<T> klFiltered(RO, E1, E2, CHA, N);
+            Gadgetron::clear(klFiltered);
+
+            hoMatrix<T> refAcrossN(RO*E1*E2*CHA, N, const_cast<T*>(ref.begin()));
+            hoMatrix<T> filteredAcrossN(RO*E1*E2*CHA, N, klFiltered.begin());
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(refAcrossN, numOfModes, filteredAcrossN));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, klFiltered, "refKLF");
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(klFiltered, refRecon));
+        }
+        else
+        {
+            refRecon.create(RO, E1, E2, CHA, N);
+            Gadgetron::clear(refRecon);
+
+            hoMatrix<T> refAcrossN(RO*E1*E2*CHA, N, const_cast<T*>(ref.begin()));
+            hoMatrix<T> filteredAcrossN(RO*E1*E2*CHA, N, refRecon.begin());
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLFilter(refAcrossN, numOfModes, filteredAcrossN));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::prepRefByAveragingCrossN(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::prepRef(WorkOrderType* workOrder3DT, const hoNDArray<T>& ref, 
+                                hoNDArray<T>& refRecon, hoNDArray<T>& refCoilMap, 
+                                int startRO, int endRO, 
+                                int startE1, int endE1, 
+                                int startE2, int endE2, 
+                                size_t dataE1, size_t dataE2)
+{
+    try
+    {
+        size_t dataRO = workOrder3DT->data_.get_size(0);
+        size_t dataN = workOrder3DT->data_.get_size(4);
+
+        size_t RO = ref.get_size(0);
+        size_t E1 = ref.get_size(1);
+        size_t E2 = ref.get_size(2);
+        size_t srcCHA = ref.get_size(3);
+        size_t N = ref.get_size(4);
+
+        if ( workOrder3DT->acceFactorE1_ == 1 && workOrder3DT->acceFactorE2_ == 1 )
+        {
+            if ( workOrder3DT->no_acceleration_averageall_ref_ )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, ref, workOrder3DT->no_acceleration_averageall_ref_, 0, refRecon));
+            }
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+        }
+        else if ( workOrder3DT->CalibMode_ == ISMRMRD_interleaved )
+        {
+            GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, ref, true, 0, refRecon));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon_interleaved");
+
+            GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refRecon, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+
+            if ( (startRO>=0 && endRO>0 && endRO>startRO) || (startE1>=0 && endE1>0 && endE1>startE1) || (startE2>=0 && endE2>0 && endE2>startE2) )
+            {
+                std::vector<size_t> crop_offset(5), crop_size(5);
+
+                crop_offset[0] = 0;
+                crop_offset[1] = 0;
+                crop_offset[2] = 0;
+                crop_offset[3] = 0;
+                crop_offset[4] = 0;
+
+                crop_size[0] = RO;
+                crop_size[1] = E1;
+                crop_size[2] = refRecon.get_size(2);
+                crop_size[3] = refRecon.get_size(3);
+                crop_size[4] = refRecon.get_size(4);
+
+                if (startRO>=0 && endRO>0 && endRO>startRO)
+                {
+                    crop_offset[0] = startRO;
+                    crop_size[0] = endRO-startRO+1;
+                }
+
+                if (startE1>=0 && endE1>0 && endE1>startE1)
+                {
+                    crop_offset[1] = startE1;
+                    crop_size[1] = endE1-startE1+1;
+                }
+
+                if (startE2>=0 && endE2>0 && endE2>startE2)
+                {
+                    crop_offset[2] = startE2;
+                    crop_size[2] = endE2-startE2+1;
+                }
+
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+        }
+        else if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded 
+                || workOrder3DT->CalibMode_ == ISMRMRD_separate 
+                || workOrder3DT->CalibMode_ == ISMRMRD_external )
+        {
+            if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                refRecon = ref;
+            }
+
+            if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+            {
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, ref, workOrder3DT->separate_averageall_ref_, 0, refRecon));
+            }
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("detectSampledRegionE1E2 ... "));
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.detectSampledRegionE1E2(refRecon, startE1_, endE1_, startE2_, endE2_));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+            std::vector<size_t> crop_offset(5);
+            crop_offset[0] = 0;
+            crop_offset[1] = startE1_;
+            crop_offset[2] = startE2_;
+            crop_offset[3] = 0;
+            crop_offset[4] = 0;
+
+            std::vector<size_t> crop_size(5);
+            crop_size[0] = refRecon.get_size(0);
+            crop_size[1] = endE1_-startE1_+1;
+            crop_size[2] = endE2_-startE2_+1;
+            crop_size[3] = srcCHA;
+            crop_size[4] = refRecon.get_size(4);
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon_beforeCrop");
+
+            if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+            {
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("crop sampled region ... "));
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, croppedRef, "refRecon_afterCrop");
+
+                if ( workOrder3DT->recon_algorithm_ == ISMRMRD_SPIRIT || workOrder3DT->recon_algorithm_ == ISMRMRD_L1SPIRIT )
+                {
+                    // copy the ref into the data
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2(refRecon, workOrder3DT->data_, 0, refRecon.get_size(0)-1, startE1_, endE1_, startE2_, endE2_));
+                }
+
+                GADGET_CHECK_RETURN_FALSE(prepRefByAveragingCrossN(workOrder3DT, croppedRef, workOrder3DT->embedded_averageall_ref_, 0, refRecon));
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon_afterCrop_prepCrossN");
+
+                crop_size[4] = refRecon.get_size(4);
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("set up ref for coil map ... "));
+                refCoilMap.create(RO, E1, E2, srcCHA, refRecon.get_size(4));
+                GADGET_CHECK_RETURN_FALSE(setSubArrayUpTo10DArray(refRecon, refCoilMap, crop_offset, crop_size));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap");
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("perform ref coil map filter ... "));
+                // hoNDArray<T> refCoilMapTmp(refCoilMap);
+
+                // GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refCoilMapTmp, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, refCoilMap, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap_filtered");
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+
+                        crop_offset[2] = 0;
+                        crop_size[2] = refRecon.get_size(2);
+                    }
+                }
+
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                refRecon = croppedRef;
+            }
+            else
+            {
+                hoNDArray<T> croppedRef;
+                GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, croppedRef, "croppedRef");
+
+                GADGET_CHECK_RETURN_FALSE(performRefFilter(workOrder3DT, croppedRef, refCoilMap, startRO, endRO, startE1, endE1, startE2, endE2));
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "croppedRef_filtered");
+
+                refRecon = croppedRef;
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.zeropad3D(refCoilMap, dataRO, dataE1, dataE2, croppedRef));
+                refCoilMap = croppedRef;
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap");
+
+                if ( refRecon.get_size(0) == RO )
+                {
+                    if ( startRO>=0 && endRO>0 && endRO>startRO && startRO<RO && endRO<RO )
+                    {
+                        crop_offset[0] = startRO;
+                        crop_size[0] = endRO-startRO+1;
+
+                        crop_offset[1] = 0;
+                        crop_size[1] = refRecon.get_size(1);
+
+                        crop_offset[2] = 0;
+                        crop_size[2] = refRecon.get_size(2);
+
+                        GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(refRecon, croppedRef, crop_offset, crop_size));
+                        refRecon = croppedRef;
+                    }
+                }
+            }
+
+            // if the ref N is smaller than the data N, e.g. in some cases with the separate mode
+            // make sure every data N have its ref data
+            if ( N < dataN )
+            {
+                hoNDArray<T> refReconDataN(refRecon.get_size(0), refRecon.get_size(1), refRecon.get_size(2), refRecon.get_size(3), dataN);
+                hoNDArray<T> refCoilMapDataN(refCoilMap.get_size(0), refCoilMap.get_size(1), refCoilMap.get_size(2), refCoilMap.get_size(3), dataN);
+
+                memcpy(refReconDataN.begin(), refRecon.begin(), refRecon.get_number_of_bytes());
+                memcpy(refCoilMapDataN.begin(), refCoilMap.begin(), refCoilMap.get_number_of_bytes());
+
+                size_t refReconN4D = refRecon.get_size(0)*refRecon.get_size(1)*refRecon.get_size(2)*refRecon.get_size(3);
+                size_t refCoilMapN4D = refCoilMap.get_size(0)*refCoilMap.get_size(1)*refCoilMap.get_size(2)*refCoilMap.get_size(3);
+
+                size_t n;
+                for ( n=N; n<dataN; n++ )
+                {
+                    memcpy(refReconDataN.begin()+n*refReconN4D, refRecon.begin()+(N-1)*refReconN4D, sizeof(T)*refReconN4D);
+                    memcpy(refCoilMapDataN.begin()+n*refCoilMapN4D, refCoilMap.begin()+(N-1)*refCoilMapN4D, sizeof(T)*refCoilMapN4D);
+                }
+
+                refRecon = refReconDataN;
+                refCoilMap = refCoilMapDataN;
+            }
+        }
+        else
+        {
+            GADGET_ERROR_MSG("CalibMode is not supported in gtPlusReconWorker3DT<T>::prepRef(...) : " << workOrder3DT->CalibMode_);
+            return false;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refRecon, "refRecon");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, refCoilMap, "refCoilMap");
+
+        // if the upstream coil compression is needed
+        if ( workOrder3DT->upstream_coil_compression_ )
+        {
+            GADGET_CHECK_PERFORM(!debugFolder_.empty(), "Upstream coil compression ... ");
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("average along N ... "));
+            hoNDArray<T> aveAll;
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(refRecon, aveAll));
+            aveAll.squeeze();
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("compute coil compression coefficients ... "));
+            hoMatrix<T> coeff, eigenValues;
+            if ( workOrder3DT->upstream_coil_compression_num_modesKept_ > 0 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                            workOrder3DT->upstream_coil_compression_num_modesKept_, coeff, eigenValues, true));
+            }
+            else
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                            workOrder3DT->upstream_coil_compression_thres_, coeff, eigenValues, true));
+            }
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+            // GADGET_CHECK_PERFORM(!debugFolder_.empty(), eigenValues.print(std::cout));
+            eigenValues.print(std::cout);
+            GADGET_MSG("Upstream coil compression, number of channel kept is " << coeff.cols());
+
+            size_t n;
+            std::vector<hoMatrix<T> > upstreamCoilCoeffRef(workOrder3DT->ref_.get_size(4)), upstreamCoilCoeffRefRecon(refRecon.get_size(4)), upstreamCoilCoeffData(workOrder3DT->data_.get_size(4));
+            for ( n=0; n<upstreamCoilCoeffRef.size(); n++ )
+            {
+                upstreamCoilCoeffRef[n] = coeff;
+            }
+
+            for ( n=0; n<upstreamCoilCoeffRefRecon.size(); n++ )
+            {
+                upstreamCoilCoeffRefRecon[n] = coeff;
+            }
+
+            for ( n=0; n<upstreamCoilCoeffData.size(); n++ )
+            {
+                upstreamCoilCoeffData[n] = coeff;
+            }
+
+            // apply the coil compression
+            #ifdef USE_OMP
+                omp_set_nested(1);
+            #endif // USE_OMP
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("apply upstream coil compression ... "));
+            #pragma omp parallel sections default(shared)
+            {
+
+                #pragma omp section
+                {
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("apply the coil compression on data ... "));
+                    // GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, upstreamCoilCoeffData, data_dst_, true));
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("applyKLCoilCompressionCoeff ... "));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->data_, upstreamCoilCoeffData, data_dst_, true);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("copy data ... "));
+                    workOrder3DT->data_ = data_dst_;
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                }
+
+                #pragma omp section
+                {
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("apply the coil compression on ref ... "));
+                    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_, upstreamCoilCoeff, ref_dst_, true));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(workOrder3DT->ref_, upstreamCoilCoeffRef, ref_dst_, true);
+                    workOrder3DT->ref_ = ref_dst_;
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                }
+
+                #pragma omp section
+                {
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("apply the coil compression on refRecon ... "));
+                    hoNDArray<T> refRecon_upstream;
+                    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refRecon, upstreamCoilCoeff, refRecon_upstream, true));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refRecon, upstreamCoilCoeffRefRecon, refRecon_upstream, true);
+                    refRecon = refRecon_upstream;
+                    refRecon_upstream.clear();
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                }
+
+                #pragma omp section
+                {
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("apply the coil compression on ref for coil map ... "));
+                    hoNDArray<T> refCoilMap_upstream;
+                    //GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refCoilMap, upstreamCoilCoeff, refCoilMap_upstream, true));
+                    gtPlusISMRMRDReconUtil<T>().applyKLCoilCompressionCoeff(refCoilMap, upstreamCoilCoeffRefRecon, refCoilMap_upstream, true);
+                    refCoilMap = refCoilMap_upstream;
+                    refCoilMap_upstream.clear();
+                    //GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                }
+            }
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+            #ifdef USE_OMP
+                omp_set_nested(0);
+            #endif // USE_OMP
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::prepRef(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Compute the KL coil compression coefficients for a 3DT work order and store them in
+// workOrder3DT->coilCompressionCoef_ (one hoMatrix per entry of the 5th dimension of data_).
+//
+// ref_recon_ is read as a 5D array [RO E1 E2 CHA N]; dataN is the size of the 5th dimension of data_.
+// Nothing is done (and true is returned) when both acceleration factors are 1, when coil compression
+// is disabled, or when the recon algorithm is ISMRMRD_SPIRIT / ISMRMRD_L1SPIRIT.
+//
+// If the coefficient vector does not already hold dataN entries it is recomputed:
+//   - same_coil_compression_coeff_allN_ : one coefficient matrix is computed from the average of ref_recon_
+//     and replicated dataN times;
+//   - otherwise : one coefficient matrix is computed for every n in [0, N).
+// Finally, if N < dataN, the vector is padded up to dataN entries by repeating entry N-1.
+//
+// Returns false if any helper fails or an exception is caught, true otherwise.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::coilCompression(WorkOrderType* workOrder3DT)
+{
+    // the 3DT recon on 5D array [RO E1 E2 CHA N]
+    try
+    {
+        size_t RO = workOrder3DT->ref_recon_.get_size(0);
+        size_t E1 = workOrder3DT->ref_recon_.get_size(1);
+        size_t E2 = workOrder3DT->ref_recon_.get_size(2);
+        size_t srcCHA = workOrder3DT->ref_recon_.get_size(3);
+        size_t N = workOrder3DT->ref_recon_.get_size(4);
+
+        size_t dataN = workOrder3DT->data_.get_size(4);
+
+        size_t n;
+
+        // no undersampling, no coil compression is computed
+        if ( workOrder3DT->acceFactorE1_==1 && workOrder3DT->acceFactorE2_==1 ) return true;
+
+        // compute coil compression coeff
+        if ( workOrder3DT->coil_compression_ && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT )
+        {
+            // check whether coil compression coeff has been preset
+            if ( workOrder3DT->coilCompressionCoef_->size()!=dataN )
+            {
+                if ( workOrder3DT->same_coil_compression_coeff_allN_ )
+                {
+                    hoNDArray<T> aveAll;
+                    GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(workOrder3DT->ref_recon_, aveAll));
+                    aveAll.squeeze();
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aveAll, "aveAll");
+
+                    // a positive number of modes takes precedence over the threshold
+                    hoMatrix<T> coeff, eigenValues;
+                    if ( workOrder3DT->coil_compression_num_modesKept_ > 0 )
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                                    workOrder3DT->coil_compression_num_modesKept_, coeff, eigenValues, true));
+                    }
+                    else
+                    {
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(aveAll, 
+                                    workOrder3DT->coil_compression_thres_, coeff, eigenValues, true));
+                    }
+
+                    workOrder3DT->coilCompressionCoef_->resize(dataN);
+
+                    for ( n=0; n<dataN; n++ )
+                    {
+                        (*workOrder3DT->coilCompressionCoef_)[n] = coeff;
+                    }
+
+                    GADGET_CHECK_PERFORM(!debugFolder_.empty(), eigenValues.print(std::cout));
+                    GADGET_MSG("Coil compression, number of channel kept is " << coeff.cols());
+                }
+                else
+                {
+                    std::vector<size_t> allNDim(4);
+                    allNDim[0] = RO;
+                    allNDim[1] = E1;
+                    allNDim[2] = E2;
+                    allNDim[3] = srcCHA;
+
+                    size_t num_modesKept = srcCHA;
+
+                    // The vector has the wrong size and is rebuilt with push_back below;
+                    // drop any stale entries first so that entry n really belongs to ref_recon_ slice n.
+                    workOrder3DT->coilCompressionCoef_->clear();
+
+                    for ( n=0; n<N; n++ )
+                    {
+                        // 4D view [RO E1 E2 CHA] on the n-th slice of ref_recon_
+                        hoNDArray<T> dataCurrN(&allNDim, workOrder3DT->ref_recon_.begin()+n*RO*E1*E2*srcCHA, false);
+
+                        hoMatrix<T> coeff, eigenValues;
+
+                        if ( n == 0 )
+                        {
+                            if ( workOrder3DT->coil_compression_num_modesKept_ > 0 )
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrN, 
+                                            workOrder3DT->coil_compression_num_modesKept_, coeff, eigenValues, true));
+                            }
+                            else
+                            {
+                                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrN, 
+                                            workOrder3DT->coil_compression_thres_, coeff, eigenValues, true));
+                            }
+
+                            // the size found for n==0 is reused for all other n
+                            // NOTE(review): the message below reports coeff.cols() as the number of kept channels
+                            // while get_size(0) is used here; confirm both refer to the same dimension of coeff.
+                            num_modesKept = coeff.get_size(0);
+                            workOrder3DT->coilCompressionCoef_->push_back(coeff);
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.computeKLCoilCompressionCoeff(dataCurrN, 
+                                            (int)num_modesKept, coeff, eigenValues, true));
+
+                            workOrder3DT->coilCompressionCoef_->push_back(coeff);
+                        }
+
+                        GADGET_CHECK_PERFORM(!debugFolder_.empty(), eigenValues.print(std::cout));
+                        GADGET_MSG("Coil compression, number of channel kept is " << coeff.cols());
+                    }
+                }
+            }
+
+            // fewer reference repetitions than data repetitions : repeat the last coefficient set
+            if ( N < dataN )
+            {
+                // entry N-1 is replicated below; without any reference repetition there is nothing to replicate
+                GADGET_CHECK_RETURN_FALSE(N>0);
+
+                std::vector<hoMatrix<T> > coilCompressionCoef(dataN);
+                for ( n=0; n<N; n++ )
+                {
+                    coilCompressionCoef[n] = (*workOrder3DT->coilCompressionCoef_)[n];
+                }
+
+                for ( n=N; n<dataN; n++ )
+                {
+                    coilCompressionCoef[n] = (*workOrder3DT->coilCompressionCoef_)[N-1];
+                }
+
+                *(workOrder3DT->coilCompressionCoef_) = coilCompressionCoef;
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::coilCompression(WorkOrderType* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Compute the unmixing coefficients and the g-factor map from an image domain kernel and a coil map.
+//
+// kerIm      : read as [RO E1 E2 srcCHA dstCHA]
+// coilMap    : must have sizes [RO E1 E2 dstCHA] in its first four dimensions
+// unmixCoeff : output, created as [RO E1 E2 srcCHA]; for every source channel it accumulates
+//              multiplyConj(kerIm slice, coilMap slice) over all destination channels
+// gFactor    : output, created as [RO E1 E2]; sqrt of the sum over the last dimension of
+//              multiplyConj(unmixCoeff, unmixCoeff)
+//
+// Returns false if a size check or a checked helper fails, or if an exception is caught; true otherwise.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t E2 = kerIm.get_size(2);
+        size_t srcCHA = kerIm.get_size(3);
+        size_t dstCHA = kerIm.get_size(4);
+
+        // the coil map must match the kernel in RO/E1/E2 and have one map per destination channel
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(2)==E2);
+        GADGET_CHECK_RETURN_FALSE(coilMap.get_size(3)==dstCHA);
+
+        // outputs are (re)allocated and zeroed; unmixCoeff is used as an accumulator below
+        unmixCoeff.create(RO, E1, E2, srcCHA);
+        Gadgetron::clear(&unmixCoeff);
+        gFactor.create(RO, E1, E2);
+        Gadgetron::clear(&gFactor);
+
+        // signed loop index for the omp for loop
+        int src;
+
+        // raw pointers are taken so that they can be listed in the shared(...) clause below
+        T* pKerIm = const_cast<T*>(kerIm.begin());
+        T* pCoilMap = const_cast<T*>(coilMap.begin());
+        T* pCoeff = unmixCoeff.begin();
+
+        std::vector<size_t> dim(3);
+        dim[0] = RO;
+        dim[1] = E1;
+        dim[2] = E2;
+
+        #pragma omp parallel default(none) private(src) shared(RO, E1, E2, srcCHA, dstCHA, pKerIm, pCoilMap, pCoeff, dim)
+        {
+            // declared inside the parallel region, so every thread has its own set of arrays
+            hoNDArray<T> coeff2D, coeffTmp(&dim);
+            hoNDArray<T> coilMap2D;
+            hoNDArray<T> kerIm2D;
+
+            #pragma omp for
+            for ( src=0; src<(int)srcCHA; src++ )
+            {
+                // [RO E1 E2] slice of unmixCoeff for this source channel
+                // (presumably create() with a pointer wraps the existing memory -- confirm in hoNDArray)
+                coeff2D.create(&dim, pCoeff+src*RO*E1*E2);
+
+                for ( size_t dst=0; dst<dstCHA; dst++ )
+                {
+                    kerIm2D.create(&dim, pKerIm+src*RO*E1*E2+dst*RO*E1*E2*srcCHA);
+                    coilMap2D.create(&dim, pCoilMap+dst*RO*E1*E2);
+                    // return values of multiplyConj/add are not checked inside the parallel region
+                    Gadgetron::multiplyConj(kerIm2D, coilMap2D, coeffTmp);
+                    Gadgetron::add(coeff2D, coeffTmp, coeff2D);
+                }
+            }
+        }
+
+        // gFactor = sqrt( sum over srcCHA of multiplyConj(unmixCoeff, unmixCoeff) )
+        hoNDArray<T> conjUnmixCoeff(unmixCoeff);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multiplyConj(unmixCoeff, conjUnmixCoeff, conjUnmixCoeff));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOverLastDimension(conjUnmixCoeff, gFactor));
+        Gadgetron::sqrt_inplace(&gFactor);
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::unmixCoeff(const hoNDArray<T>& kerIm, const hoNDArray<T>& coilMap, hoNDArray<T>& unmixCoeff, hoNDArray<T>& gFactor) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Apply an image domain kernel to k-space data.
+//
+// kspace    : first four dimensions must be [RO E1 E2 srcCHA] as given by kerIm
+// kerIm     : read as [RO E1 E2 srcCHA dstCHA]
+// complexIm : output, filled by applyImageDomainKernelImage(...)
+//
+// The k-space is first transformed with ifft3c into a temporary buffer, which is then passed to
+// applyImageDomainKernelImage(...).
+// Returns false if a size check or a helper fails, or if an exception is caught; true otherwise.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t E2 = kerIm.get_size(2);
+        size_t srcCHA = kerIm.get_size(3);
+        // dstCHA is read but not used in this function
+        size_t dstCHA = kerIm.get_size(4);
+
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==E2);
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(3)==srcCHA);
+
+        // buffer3DT_unwrapping_ = kspace;
+
+        // temporary array with the same dimensions as kspace, receiving the ifft3c result
+        hoNDArray<T> buffer3DT(kspace.get_dimensions());
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspace, buffer3DT));
+
+        GADGET_CHECK_RETURN_FALSE(applyImageDomainKernelImage(buffer3DT, kerIm, complexIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::applyImageDomainKernel(const hoNDArray<T>& kspace, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Convenience overload of applyImageDomainKernelImage(...) that supplies its own kernel buffer.
+//
+// A scratch array with the dimensions of kerIm is allocated through gtPlus_mem_manager_ and forwarded
+// as the kerImBuffer argument of the four-argument overload; its result is returned unchanged.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& complexIm)
+{
+    hoNDArrayMemoryManaged<T> buf5D(kerIm.get_dimensions(), gtPlus_mem_manager_);
+    // forward the locally allocated buffer (it was previously allocated but never used)
+    return applyImageDomainKernelImage(aliasedIm, kerIm, buf5D, complexIm);
+}
+
+// Apply an image domain kernel to aliased images.
+//
+// aliasedIm   : first four dimensions must be [RO E1 E2 srcCHA] as given by kerIm
+// kerIm       : read as [RO E1 E2 srcCHA dstCHA]
+// kerImBuffer : only referenced by the commented-out variants below; the active code does not use it
+// complexIm   : output; created with the dimensions of aliasedIm, except that dimension 3 is dstCHA,
+//               and cleared before use
+//
+// For every 4D volume n of aliasedIm and every destination channel dCha, the volume is multiplied with
+// the [RO E1 E2 srcCHA] kernel of that channel (multipleMultiply) and summed over the last dimension
+// into the [RO E1 E2] image of complexIm for (dCha, n).
+//
+// Returns false if a size check fails or an exception is caught; true otherwise. The return values of
+// the helpers called inside the parallel region are not checked.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        size_t RO = kerIm.get_size(0);
+        size_t E1 = kerIm.get_size(1);
+        size_t E2 = kerIm.get_size(2);
+        size_t srcCHA = kerIm.get_size(3);
+        size_t dstCHA = kerIm.get_size(4);
+
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==RO);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==E1);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==E2);
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(3)==srcCHA);
+
+        // output dimensions : same as the input, with the channel dimension replaced by dstCHA
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+        std::vector<size_t> dimIm(*dim);
+        dimIm[3] = dstCHA;
+
+        // only reallocate the output if its dimensions differ; it is zeroed in either case
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        std::vector<size_t> dim4D(4);
+        dim4D[0] = RO;
+        dim4D[1] = E1;
+        dim4D[2] = E2;
+        dim4D[3] = srcCHA;
+
+        // dimIm4D is only referenced by the commented-out variants below
+        std::vector<size_t> dimIm4D(4);
+        dimIm4D[0] = RO;
+        dimIm4D[1] = E1;
+        dimIm4D[2] = E2;
+        dimIm4D[3] = dstCHA;
+
+        // number of [RO E1 E2 srcCHA] volumes in aliasedIm
+        size_t num = aliasedIm.get_number_of_elements()/ (RO*E1*E2*srcCHA);
+
+        // signed loop index for the omp for loop
+        int n;
+
+        //if ( num <= 16 )
+        //{
+            //#ifdef USE_OMP
+            //    omp_set_nested(1);
+            //#endif // USE_OMP
+
+            //// #pragma omp parallel default(none) private(n) shared(kerIm, num, dim4D, aliasedIm, RO, E1, E2, srcCHA, dimIm4D, dstCHA, complexIm) if ( num >= 16 )
+            //for ( n=0; n<(int)num; n++ )
+            //{
+            //    hoNDArray<T> buf4D(&dim4D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*E2*srcCHA));
+            //    // hoNDArray<T> bufIm4D(&dimIm4D, complexIm.begin()+n*RO*E1*E2*dstCHA);
+
+            //    int dCha;
+
+            //    #pragma omp parallel default(none) private(dCha) shared(n, buf4D, kerIm, kerImBuffer, num, dim4D, aliasedIm, RO, E1, E2, srcCHA, dimIm4D, dstCHA, complexIm)
+            //    {
+            //        // hoNDArrayMemoryManaged<T> unwarpped4D(RO, E1, E1, srcCHA, gtPlus_mem_manager_);
+
+            //        #pragma omp for
+            //        for ( dCha=0; dCha<(int)dstCHA; dCha++ )
+            //        {
+            //            hoNDArray<T> kerIm4D(RO, E1, E2, srcCHA, const_cast<T*>(kerIm.begin()+dCha*RO*E1*E2*srcCHA));
+            //            hoNDArray<T> complexIm3D(RO, E1, E2, complexIm.begin()+n*RO*E1*E2*dstCHA+dCha*RO*E1*E2);
+            //            hoNDArray<T> unwrapped4D(RO, E1, E2, srcCHA, kerImBuffer.begin()+dCha*RO*E1*E2*srcCHA);
+            //            Gadgetron::multipleMultiply(buf4D, kerIm4D, unwrapped4D);
+            //            Gadgetron::sumOverLastDimension(unwrapped4D, complexIm3D);
+            //        }
+            //    }
+            //}
+
+            // nested parallelism is switched on here and is not switched off again in this function
+            #ifdef USE_OMP
+                omp_set_nested(1);
+            #endif // USE_OMP
+
+            //for ( n=0; n<(int)num; n++ )
+            //{
+            //    hoNDArray<T> buf4D(&dim4D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*E2*srcCHA));
+
+            //    int dCha;
+
+            //    //hoNDArrayMemoryManaged<T> unwarpped4D(RO, E1, E2, srcCHA, gtPlus_mem_manager_);
+
+            //    hoNDArray<T> unwrapped4D(RO, E1, E2, srcCHA, kerImBuffer.begin());
+
+            //    for ( dCha=0; dCha<(int)dstCHA; dCha++ )
+            //    {
+            //        hoNDArray<T> kerIm4D(RO, E1, E2, srcCHA, const_cast<T*>(kerIm.begin()+dCha*RO*E1*E2*srcCHA));
+            //        hoNDArray<T> complexIm3D(RO, E1, E2, complexIm.begin()+n*RO*E1*E2*dstCHA+dCha*RO*E1*E2);
+            //        Gadgetron::multipleMultiply(buf4D, kerIm4D, unwrapped4D);
+            //        Gadgetron::sumOverLastDimension(unwrapped4D, complexIm3D);
+            //    }
+            //}
+
+            // at most 16 threads are requested, one volume n per loop iteration
+            // NOTE(review): the GCC_OLD_FLAG variant leaves the reference parameters out of the shared list,
+            // presumably as a workaround for older GCC versions -- confirm
+            #ifdef GCC_OLD_FLAG
+                #pragma omp parallel default(none) private(n) shared(num, dim4D, RO, E1, E2, srcCHA, dstCHA) num_threads( ((num<16) ? num : 16) )
+            #else
+                #pragma omp parallel default(none) private(n) shared(num, dim4D, aliasedIm, RO, E1, E2, srcCHA, dstCHA, kerIm, complexIm) num_threads( ((num<16) ? num : 16) )
+            #endif
+            {
+                // declared inside the parallel region : one scratch array per thread, taken from gtPlus_mem_manager_
+                hoNDArrayMemoryManaged<T> unwrapped4D(RO, E1, E2, srcCHA, gtPlus_mem_manager_);
+
+                #pragma omp for
+                for ( n=0; n<(int)num; n++ )
+                {
+                    // n-th [RO E1 E2 srcCHA] volume of the aliased images
+                    hoNDArray<T> buf4D(&dim4D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*E2*srcCHA));
+
+                    int dCha;
+
+                    //hoNDArrayMemoryManaged<T> unwarpped4D(RO, E1, E2, srcCHA, gtPlus_mem_manager_);
+
+                    //hoNDArray<T> unwrapped4D(RO, E1, E2, srcCHA, kerImBuffer.begin());
+
+                    for ( dCha=0; dCha<(int)dstCHA; dCha++ )
+                    {
+                        // kernel of destination channel dCha and the matching output image for volume n
+                        hoNDArray<T> kerIm4D(RO, E1, E2, srcCHA, const_cast<T*>(kerIm.begin()+dCha*RO*E1*E2*srcCHA));
+                        hoNDArray<T> complexIm3D(RO, E1, E2, complexIm.begin()+n*RO*E1*E2*dstCHA+dCha*RO*E1*E2);
+                        Gadgetron::multipleMultiply(buf4D, kerIm4D, unwrapped4D);
+                        Gadgetron::sumOverLastDimension(unwrapped4D, complexIm3D);
+                    }
+                }
+            }
+        //}
+        //else
+        //{
+        //    #pragma omp parallel default(none) private(n) shared(kerIm, num, dim4D, aliasedIm, RO, E1, E2, srcCHA, dimIm4D, dstCHA, complexIm) 
+        //    {
+        //        hoNDArray<T> buf4D;
+        //        hoNDArray<T> bufIm4D;
+        //        hoNDArray<T> buf5D(kerIm.get_dimensions());
+
+        //        #pragma omp for
+        //        for ( n=0; n<(int)num; n++ )
+        //        {
+        //            buf4D.create(&dim4D, const_cast<T*>(aliasedIm.begin()+n*RO*E1*E2*srcCHA));
+        //            bufIm4D.create(&dimIm4D, complexIm.begin()+n*RO*E1*E2*dstCHA);
+
+        //            Gadgetron::multipleMultiply(buf4D, kerIm, buf5D);
+        //            Gadgetron::sumOverSecondLastDimension(buf5D, bufIm4D);
+        //        }
+        //    }
+        //}
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::applyImageDomainKernelImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& kerIm, hoNDArray<T>& kerImBuffer, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Apply unmixing coefficients to k-space data.
+//
+// kspace     : its first four dimensions must have the same sizes as those of unmixCoeff
+// unmixCoeff : unmixing coefficients
+// complexIm  : output, filled by applyUnmixCoeffImage(...)
+//
+// The k-space is transformed with ifft3c into a buffer taken from gtPlus_mem_manager_, which is then
+// passed to applyUnmixCoeffImage(...).
+// Returns false if a size check or a helper fails, or if an exception is caught; true otherwise.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(2)==unmixCoeff.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(kspace.get_size(3)==unmixCoeff.get_size(3));
+
+        // buffer3DT_unwrapping_ = kspace;
+        // image domain buffer with the same dimensions as kspace
+        hoNDArrayMemoryManaged<T> buffer3DT(kspace.get_dimensions(), gtPlus_mem_manager_);
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspace, buffer3DT));
+        GADGET_CHECK_RETURN_FALSE(applyUnmixCoeffImage(buffer3DT, unmixCoeff, complexIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::applyUnmixCoeff(const hoNDArray<T>& kspace, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+// Apply unmixing coefficients to aliased images.
+//
+// aliasedIm  : its first four dimensions must have the same sizes as those of unmixCoeff
+// unmixCoeff : unmixing coefficients
+// complexIm  : output; created with the dimensions of aliasedIm, except that dimension 3 is 1,
+//              and cleared before use
+//
+// The result is sumOver4thDimension( multipleMultiply(unmixCoeff, aliasedIm) ).
+// Returns false if a size check or a helper fails, or if an exception is caught; true otherwise.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(0)==unmixCoeff.get_size(0));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(1)==unmixCoeff.get_size(1));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(2)==unmixCoeff.get_size(2));
+        GADGET_CHECK_RETURN_FALSE(aliasedIm.get_size(3)==unmixCoeff.get_size(3));
+
+        // output dimensions : same as the input, with the channel dimension collapsed to 1
+        boost::shared_ptr< std::vector<size_t> > dim = aliasedIm.get_dimensions();
+        std::vector<size_t> dimIm(*dim);
+        dimIm[3] = 1;
+
+        // only reallocate the output if its dimensions differ; it is zeroed in either case
+        if ( !complexIm.dimensions_equal(&dimIm) )
+        {
+            complexIm.create(&dimIm);
+        }
+        Gadgetron::clear(&complexIm);
+
+        // hoNDArray<T> tmp(aliasedIm);
+        // buffer3DT_unwrapping_ = aliasedIm;
+
+        // scratch buffer for the channel-wise product, taken from gtPlus_mem_manager_
+        hoNDArrayMemoryManaged<T> buffer3DT(aliasedIm.get_dimensions(), gtPlus_mem_manager_);
+
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::multipleMultiply(unmixCoeff, aliasedIm, buffer3DT));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::sumOver4thDimension(buffer3DT, complexIm));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::applyUnmixCoeffImage(const hoNDArray<T>& aliasedIm, const hoNDArray<T>& unmixCoeff, hoNDArray<T>& complexIm) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::afterUnwrapping(WorkOrderType* workOrder3DT)
+{
+    try
+    {
+        bool fullres_coilmap = false;
+        bool ref_fillback = false;
+        bool averageallN_coilmap = false;
+        bool same_coilmap_allN = false;
+        size_t whichN_coilmap = 0;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_embedded )
+        {
+            if ( workOrder3DT->embedded_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+            }
+
+            if ( workOrder3DT->embedded_ref_fillback_ )
+            {
+                ref_fillback = true;
+            }
+
+            if ( workOrder3DT->embedded_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder3DT->embedded_same_combinationcoeff_allN_ )
+            {
+                same_coilmap_allN = true;
+                whichN_coilmap = workOrder3DT->embedded_whichN_combinationcoeff_;
+            }
+        }
+
+        if ( workOrder3DT->CalibMode_ == ISMRMRD_separate )
+        {
+            if ( workOrder3DT->separate_fullres_coilmap_ )
+            {
+                fullres_coilmap = true;
+            }
+
+            if ( workOrder3DT->separate_averageall_ref_ )
+            {
+                averageallN_coilmap = true;
+            }
+
+            if ( workOrder3DT->separate_same_combinationcoeff_allN_ )
+            {
+                same_coilmap_allN = true;
+                whichN_coilmap = workOrder3DT->separate_whichN_combinationcoeff_;
+            }
+        }
+
+        if ( whichN_coilmap >= N ) whichN_coilmap = N-1;
+
+        if ( ref_fillback )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("ref fill back ... "));
+
+            hoNDArray<T> ref_dst;
+            if ( workOrder3DT->coil_compression_ && workOrder3DT->recon_algorithm_!=ISMRMRD_SPIRIT && workOrder3DT->recon_algorithm_!=ISMRMRD_L1SPIRIT )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.applyKLCoilCompressionCoeff(workOrder3DT->ref_, *workOrder3DT->coilCompressionCoef_, ref_dst, true));
+            }
+            else
+            {
+                ref_dst = workOrder3DT->ref_;
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ref_dst, "ref_dst");
+
+            if ( (ref_dst.get_size(3)==dstCHA) && (ref_dst.get_size(4)==N) )
+            {
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->fullkspace_, "fullkspace_");
+
+                GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2(ref_dst, workOrder3DT->fullkspace_, 0, RO-1, startE1_, endE1_, startE2_, endE2_));
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->fullkspace_, "fullkspace_After");
+            }
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+        }
+
+        // partial fourier handling
+        GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder3DT));
+
+        if ( this->computeKSpace(workOrder3DT) || fullres_coilmap )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : allocate buffer 3DT ...  "));
+            hoNDArrayMemoryManaged<T> buffer3DT(workOrder3DT->fullkspace_.get_dimensions(), gtPlus_mem_manager_);
+            hoNDArrayMemoryManaged<T> buffer3DT_Two(workOrder3DT->fullkspace_.get_dimensions(), gtPlus_mem_manager_);
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : go to image domain ...  "));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->fullkspace_, buffer3DT, buffer3DT_Two));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT, "ComplexIm_afterRefFill");
+
+            if ( averageallN_coilmap )
+            {
+                if ( workOrder3DT->workFlow_use_BufferedKernel_ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "complexImCombined");
+                }
+                else
+                {
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : allocate coil map ...  "));
+                    workOrder3DT->coilMap_->create(RO, E1, E2, dstCHA, 1);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                    if ( N > 1 )
+                    {
+                        hoNDArray<T> aveComplexIm(RO, E1, E2, dstCHA, 1);
+                        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(buffer3DT, aveComplexIm));
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aveComplexIm, "aveComplexIm");
+
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : compute 3D coil map ...  "));
+                        if ( workOrder3DT->csm_use_gpu_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(aveComplexIm, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(aveComplexIm, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->coilMap_, "coilMap_fullres");
+                    }
+                    else
+                    {
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : compute 3D coil map ...  "));
+                        if ( workOrder3DT->csm_use_gpu_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->coilMap_, "coilMap_fullres");
+                    }
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("full res coil map : coil combine 3D ...  "));
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "complexImCombined");
+                }
+            }
+            else
+            {
+                if ( workOrder3DT->workFlow_use_BufferedKernel_ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "complexIm_");
+                }
+                else
+                {
+                    workOrder3DT->coilMap_->create(RO, E1, E2, dstCHA, N);
+
+                    if ( same_coilmap_allN )
+                    {
+                        hoNDArray<T> complexImN(RO, E1, E2, dstCHA, buffer3DT.begin()+whichN_coilmap*RO*E1*E2*dstCHA);
+                        hoNDArray<T> coilMapN(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+whichN_coilmap*RO*E1*E2*dstCHA);
+
+                        if ( workOrder3DT->csm_use_gpu_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(complexImN, coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(complexImN, coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->coilMap_, whichN_coilmap));
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->coilMap_, "coilMap_fullres");
+                    }
+                    else
+                    {
+                        if ( workOrder3DT->csm_use_gpu_ )
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        else
+                        {
+                            GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                        }
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->coilMap_, "coilMap_fullres");
+                    }
+
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_));
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "complexIm_");
+                }
+            }
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(workOrder3DT->complexIm_.get_size(0)==RO);
+            GADGET_CHECK_RETURN_FALSE(workOrder3DT->complexIm_.get_size(1)==E1);
+            GADGET_CHECK_RETURN_FALSE(workOrder3DT->complexIm_.get_size(2)==E2);
+        }
+
+        // flip along E2
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("flip along E2 ...  "));
+
+        size_t imRO = workOrder3DT->complexIm_.get_size(0);
+        size_t imE1 = workOrder3DT->complexIm_.get_size(1);
+        size_t imE2 = workOrder3DT->complexIm_.get_size(2);
+        size_t imCHA = workOrder3DT->complexIm_.get_size(3);
+
+        hoNDArrayMemoryManaged<T> complexIm(workOrder3DT->complexIm_, gtPlus_mem_manager_);
+
+        T* pSrc = workOrder3DT->complexIm_.begin();
+        T* pDst = complexIm.begin();
+
+        size_t mid_RO = imRO/2;
+        size_t mid_E1 = imE1/2;
+        size_t mid_E2 = imE2/2;
+
+        size_t n, cha;
+        for ( n=0; n<workOrder3DT->complexIm_.get_size(4); n++ )
+        {
+            for ( cha=0; cha<imCHA; cha++ )
+            {
+                int offset = n*imRO*imE1*imE2*imCHA+cha*imRO*imE1*imE2;
+
+                for ( int e2=0; e2<(int)imE2; e2++ )
+                {
+                    int e2_from = 2*mid_E2-e2;
+                    if ( e2_from >= imE2 ) e2_from -= imE2;
+
+                    memcpy(pDst+offset+e2*imRO*imE1, pSrc+offset+e2_from*imRO*imE1, sizeof(T)*imRO*imE1);
+
+                    //for ( int e1=0; e1<(int)imE1; e1++ )
+                    //{
+                    //    int e1_from = 2*mid_E1-e1;
+                    //    if ( e1_from >= imE1 ) e1_from -= imE1;
+
+                    //    for ( int ro=0; ro<(int)imRO; ro++ )
+                    //    {
+                    //        int ro_from = 2*mid_RO-ro;
+                    //        if ( ro_from >= imRO ) ro_from -= imRO;
+
+                    //        pDst[offset+e2*imRO*imE1+e1*imRO+ro] = pSrc[offset+e2_from*imRO*imE1+e1_from*imRO+ro_from];
+                    //    }
+                    //}
+                }
+            }
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+        workOrder3DT->complexIm_ = complexIm;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::afterUnwrapping(WorkOrderType* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+// Runs the partial-Fourier step chosen by partialFourier_algo_; returns true when done or nothing applies, false if a step fails or throws.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierHandling(WorkOrderType* workOrder3DT)
+{
+    try
+    {
+        if ( workOrder3DT->partialFourier_algo_ == ISMRMRD_PF_ZEROFILLING ) return true;
+
+        const bool noAcceleration = ( workOrder3DT->acceFactorE1_==1 && workOrder3DT->acceFactorE2_==1 );
+        if ( noAcceleration )
+        {
+            switch ( workOrder3DT->partialFourier_algo_ )   // both acceleration factors are 1: operate on data_
+            {
+                case ISMRMRD_PF_ZEROFILLING_FILTER:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder3DT, workOrder3DT->data_));
+                    break;
+                case ISMRMRD_PF_POCS:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder3DT, workOrder3DT->data_));
+                    break;
+                case ISMRMRD_PF_FENGHUANG:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder3DT, workOrder3DT->data_));
+                    break;
+                default:
+                    break;
+            }
+        }
+        else if ( workOrder3DT->fullkspace_.get_number_of_elements() > 0 )
+        {
+            switch ( workOrder3DT->partialFourier_algo_ )   // fullkspace_ is not empty: operate on it
+            {
+                case ISMRMRD_PF_ZEROFILLING_FILTER:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder3DT, workOrder3DT->fullkspace_));
+                    break;
+                case ISMRMRD_PF_POCS:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder3DT, workOrder3DT->fullkspace_));
+                    break;
+                case ISMRMRD_PF_FENGHUANG:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder3DT, workOrder3DT->fullkspace_));
+                    break;
+                default:
+                    break;
+            }
+        }
+        else
+        {
+            // otherwise work on the complex images after coil combination: transform to k-space, process, transform back
+            hoNDArrayMemoryManaged<T> kspace(workOrder3DT->complexIm_.get_dimensions(), gtPlus_mem_manager_);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(workOrder3DT->complexIm_, kspace));
+            switch ( workOrder3DT->partialFourier_algo_ )
+            {
+                case ISMRMRD_PF_ZEROFILLING_FILTER:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFilter(*workOrder3DT, kspace));
+                    break;
+                case ISMRMRD_PF_POCS:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierPOCSRecon(*workOrder3DT, kspace));
+                    break;
+                case ISMRMRD_PF_FENGHUANG:
+                    GADGET_CHECK_RETURN_FALSE(performPartialFourierFengHuangRecon(*workOrder3DT, kspace));
+                    break;
+                default:
+                    break;
+            }
+
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspace, workOrder3DT->complexIm_));
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performPartialFourierHandling(gtPlusReconworkOrder3DT<T>* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+// Applies the partial-Fourier filter(s) held by the work order to kspace in place; returns false when a filter call fails or anything throws.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder3DT<T>& workOrder3DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        GADGET_MSG("--> Into gt Plus 3DT partial fourier filter ... ");
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        // nothing to filter when, for every dimension, a start/end index is negative or the range covers the whole dimension
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0 || (workOrder3DT.end_RO_-workOrder3DT.start_RO_+1==RO) ) 
+            && (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0 || (workOrder3DT.end_E1_-workOrder3DT.start_E1_+1==E1) )
+            && (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0 || (workOrder3DT.end_E2_-workOrder3DT.start_E2_+1==E2) ) )
+        {
+            return true;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_PF_Filter");
+        // scratch array with the dimensions of kspace; receives the filter output
+        hoNDArrayMemoryManaged<T> buffer3DT_partial_fourier(kspace.get_dimensions(), gtPlus_mem_manager_);
+        // case 1: a combined filter whose first three sizes equal RO, E1 and E2
+        if ( workOrder3DT.filterROE1E2_partialfourier_.get_size(0)==RO 
+                && workOrder3DT.filterROE1E2_partialfourier_.get_size(1)==E1
+                && workOrder3DT.filterROE1E2_partialfourier_.get_size(2)==E2 )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(kspace, workOrder3DT.filterROE1E2_partialfourier_, buffer3DT_partial_fourier));
+            kspace = buffer3DT_partial_fourier;
+        }
+        // case 2: three 1-D filters holding RO, E1 and E2 elements respectively
+        else if ( (workOrder3DT.filterRO_partialfourier_.get_number_of_elements() == RO) 
+                && (workOrder3DT.filterE1_partialfourier_.get_number_of_elements() == E1) 
+                && (workOrder3DT.filterE2_partialfourier_.get_number_of_elements() == E2) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(kspace, workOrder3DT.filterRO_partialfourier_, 
+                    workOrder3DT.filterE1_partialfourier_, workOrder3DT.filterE2_partialfourier_, buffer3DT_partial_fourier));
+
+            kspace = buffer3DT_partial_fourier;
+        }
+        // case 3: apply whichever 1-D filters have a matching length, one after another
+        else
+        {
+            hoNDArray<T>* pSrc = &kspace;                       // input of the next filter call
+            hoNDArray<T>* pDst = &buffer3DT_partial_fourier;    // output of the next filter call
+
+            bool filterPerformed = false;
+            // each call reads *pSrc and writes *pDst; the swap makes that output the input of the next call
+            if ( workOrder3DT.filterRO_partialfourier_.get_number_of_elements() == RO )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterRO(*pSrc, workOrder3DT.filterRO_partialfourier_, *pDst));
+                std::swap(pSrc, pDst);
+                filterPerformed = true;
+            }
+
+            if ( workOrder3DT.filterE1_partialfourier_.get_number_of_elements() == E1 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspacefilterE1(*pSrc, workOrder3DT.filterE1_partialfourier_, *pDst));
+                std::swap(pSrc, pDst);
+                filterPerformed = true;
+            }
+
+            if ( workOrder3DT.filterE2_partialfourier_.get_number_of_elements() == E2 )
+            {
+                GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtil<T>().kspace3DfilterE2(*pSrc, workOrder3DT.filterE2_partialfourier_, *pDst));
+                std::swap(pSrc, pDst);
+                filterPerformed = true;
+            }
+            // after the swaps the newest result is *pSrc (not *pDst); copy it back unless it already lives in kspace
+            if ( filterPerformed && pSrc != &kspace )
+            {
+                kspace = *pSrc;
+            }
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_PF_Filter");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performPartialFourierFilter(gtPlusReconWorkOrder3DT<T>& workOrder3DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+// POCS partial-Fourier reconstruction of kspace, in place: a filtered copy supplies the image phase, then the phase is imposed and the sampled region restored iteratively; returns false on a failed step or exception.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierPOCSRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        GADGET_MSG("--> Into gt Plus 3DT partial fourier POCS ... ");
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        // check whether partial fourier is used:
+        // nothing to do when, for every dimension, a start/end index is negative
+        // or the [start, end] range already covers the whole dimension
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0 || (workOrder3DT.end_RO_-workOrder3DT.start_RO_+1==RO) ) 
+            && (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0 || (workOrder3DT.end_E1_-workOrder3DT.start_E1_+1==E1) )
+            && (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0 || (workOrder3DT.end_E2_-workOrder3DT.start_E2_+1==E2) ) )
+        {
+            return true;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_POCS");
+
+        // create kspace filter for homodyne phase estimation
+        ISMRMRDKSPACEFILTER filter_ref_type_ = ISMRMRD_FILTER_HANNING;
+        double filter_ref_sigma_ = 1.5;
+        double filter_ref_width_ = 0.15;
+
+        int startRO(0), endRO(RO-1);    // sampled range along RO; the full dimension unless the work order sets one
+        hoNDArray<T> filterRO(RO);
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, 0, RO-1, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*RO)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(RO, workOrder3DT.start_RO_, workOrder3DT.end_RO_, 
+                filterRO, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*RO)));
+
+            startRO = workOrder3DT.start_RO_;
+            endRO = workOrder3DT.end_RO_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterRO, "filterRO_POCS");
+
+        int startE1(0), endE1(E1-1);    // sampled range along E1
+        hoNDArray<T> filterE1(E1);
+        if ( (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, 0, E1-1, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E1)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E1, workOrder3DT.start_E1_, workOrder3DT.end_E1_, 
+                filterE1, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E1)));
+
+            startE1 = workOrder3DT.start_E1_;
+            endE1 = workOrder3DT.end_E1_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterE1, "filterE1_POCS");
+
+        int startE2(0), endE2(E2-1);    // sampled range along E2
+        hoNDArray<T> filterE2(E2);      // the E2 filter must hold E2 elements, not E1
+        if ( (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0) )
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E2, 0, E2-1, 
+                filterE2, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E2)));
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.generateSymmetricFilterForRef(E2, workOrder3DT.start_E2_, workOrder3DT.end_E2_, 
+                filterE2, filter_ref_type_, filter_ref_sigma_, std::ceil(filter_ref_width_*E2)));
+
+            startE2 = workOrder3DT.start_E2_;
+            endE2 = workOrder3DT.end_E2_;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, filterE2, "filterE2_POCS");
+
+        hoNDArrayMemoryManaged<T> kspaceIter(kspace.get_dimensions(), gtPlus_mem_manager_);
+        kspaceIter = kspace;    // working copy updated by the iteration; kspace keeps the input data
+
+        // magnitude of complex images
+        hoNDArrayMemoryManaged<typename realType<T>::Type> mag(kspace.get_dimensions(), gtPlus_mem_manager_);
+        hoNDArrayMemoryManaged<T> magComplex(kspace.get_dimensions(), gtPlus_mem_manager_);
+
+        hoNDArrayMemoryManaged<T> buffer3DT(kspace.get_dimensions(), gtPlus_mem_manager_);
+        hoNDArrayMemoryManaged<T> buffer3DT_partial_fourier(kspace.get_dimensions(), gtPlus_mem_manager_);
+
+        // kspace filter
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.kspace3DfilterROE1E2(kspaceIter, filterRO, filterE1, filterE2, buffer3DT_partial_fourier));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT_partial_fourier, "POCS_afterFiltered");
+
+        // go to image domain
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(buffer3DT_partial_fourier));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT_partial_fourier, "POCS_afterFiltered_complexIm");
+
+        // get the complex image phase for the filtered kspace: buffer3DT = image / magnitude
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(buffer3DT_partial_fourier, mag));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::addEpsilon(mag));    // presumably keeps the division below away from zero
+        GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::divide(buffer3DT_partial_fourier, magComplex, buffer3DT));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT, "POCS_afterFiltered_complexIm_phase");
+
+        // complex images, initialized as not filtered complex image
+        hoNDArrayMemoryManaged<T> complexIm(kspaceIter, gtPlus_mem_manager_);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspaceIter, complexIm));
+        hoNDArrayMemoryManaged<T> complexImPOCS(complexIm, gtPlus_mem_manager_);
+
+        // the kspace during iteration is buffered here
+        hoNDArrayMemoryManaged<T> buffer3DT_partial_fourierkspaceIter(kspaceIter, gtPlus_mem_manager_);
+
+        size_t ii;
+        for ( ii=0; ii<workOrder3DT.partialFourier_POCS_iters_; ii++ )
+        {
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::absolute(complexImPOCS, mag));
+            GADGET_CHECK_RETURN_FALSE(magComplex.copyFrom(mag));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::multiply(magComplex, buffer3DT, complexImPOCS));    // current magnitude times reference phase
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImPOCS, "POCS_complexImPOCS");
+
+            // go back to kspace
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(complexImPOCS, kspaceIter));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "POCS_kspaceIter");
+
+            // buffer kspace during iteration (taken before the acquired region is restored)
+            buffer3DT_partial_fourierkspaceIter = kspaceIter;
+
+            // restore the acquired region
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2(kspace, kspaceIter, startRO, endRO, startE1, endE1, startE2, endE2));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIter, "POCS_kspaceIter_copyOri");
+
+            // update complex image
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(kspaceIter, complexImPOCS));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImPOCS, "POCS_kspaceIter_copyOri_complexImPOCS");
+
+            // compute threshold to stop the iteration: norm of the image change over norm of the previous image
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::subtract(complexImPOCS, complexIm, buffer3DT_partial_fourier));
+            typename realType<T>::Type diff, prev;
+            Gadgetron::norm2(complexIm, prev);
+            Gadgetron::norm2(buffer3DT_partial_fourier, diff);
+
+            typename realType<T>::Type thres = diff/prev;
+
+            if ( !debugFolder_.empty() )
+            {
+                GADGET_MSG("POCS iter : " << ii << " - thres : " << thres << " ... ");
+            }
+
+            if ( thres < workOrder3DT.partialFourier_POCS_thres_ )
+            {
+                break;
+            }
+
+            complexIm = complexImPOCS;    // becomes the reference image of the next iteration
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT_partial_fourierkspaceIter, "kspaceIter_after_POCS");
+        // no transition band: take the iterate with the acquired region restored; otherwise blend kspace into the buffered iterate
+        if ( workOrder3DT.partialFourier_POCS_transitBand_ == 0 )
+        {
+            kspace = kspaceIter;
+        }
+        else
+        {
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2TransitionBand(kspace, buffer3DT_partial_fourierkspaceIter, startRO, endRO, startE1, endE1, startE2, endE2, 
+                workOrder3DT.partialFourier_POCS_transitBand_, workOrder3DT.partialFourier_POCS_transitBand_, workOrder3DT.partialFourier_POCS_transitBand_E2_));
+
+            kspace = buffer3DT_partial_fourierkspaceIter;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_POCS");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performPartialFourierPOCSRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+// Feng Huang partial-Fourier reconstruction of kspace, in place: kernels are calibrated between the conjugate-symmetric copy and kspace, then applied; returns false on a failed step or exception.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performPartialFourierFengHuangRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace)
+{
+    try
+    {
+        GADGET_MSG("--> Into gt Plus 3DT partial fourier FengHuang ... ");
+
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+        size_t CHA = kspace.get_size(3);
+        size_t N = kspace.get_size(4);
+
+        // check whether partial fourier is used: return early when every dimension is unset (negative index) or fully covered
+        if ( (workOrder3DT.start_RO_<0 || workOrder3DT.end_RO_<0 || (workOrder3DT.end_RO_-workOrder3DT.start_RO_+1==RO) ) 
+            && (workOrder3DT.start_E1_<0 || workOrder3DT.end_E1_<0 || (workOrder3DT.end_E1_-workOrder3DT.start_E1_+1==E1) )
+            && (workOrder3DT.start_E2_<0 || workOrder3DT.end_E2_<0 || (workOrder3DT.end_E2_-workOrder3DT.start_E2_+1==E2) ) )
+        {
+            return true;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_before_FengHuang");
+        // sampled range per dimension: the full dimension unless the work order gives a start >= 0 and an end below the size
+        int startRO(0), endRO(RO-1);
+        if ( workOrder3DT.start_RO_>=0 && workOrder3DT.end_RO_<RO )
+        {
+            startRO = workOrder3DT.start_RO_;
+            endRO = workOrder3DT.end_RO_;
+        }
+
+        int startE1(0), endE1(E1-1);
+        if ( workOrder3DT.start_E1_>=0 && workOrder3DT.end_E1_<E1 )
+        {
+            startE1 = workOrder3DT.start_E1_;
+            endE1 = workOrder3DT.end_E1_;
+        }
+
+        int startE2(0), endE2(E2-1);
+        if ( workOrder3DT.start_E2_>=0 && workOrder3DT.end_E2_<E2 )
+        {
+            startE2 = workOrder3DT.start_E2_;
+            endE2 = workOrder3DT.end_E2_;
+        }
+
+        // compute the conjugate symmetric kspace
+        hoNDArrayMemoryManaged<T> buffer3DT(kspace.get_dimensions(), gtPlus_mem_manager_);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("conjugateSymmetry3D"));
+        GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().conjugateSymmetry3D(kspace, buffer3DT));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT, "kspaceConj_FengHuang");
+
+        // find the symmetric region in the kspace (the dimension midpoint RO/2, E1/2, E2/2 is passed as the centre)
+        size_t startSymRO, endSymRO;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startRO, endRO, RO/2, startSymRO, endSymRO));
+
+        size_t startSymE1, endSymE1;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startE1, endE1, E1/2, startSymE1, endSymE1));
+
+        size_t startSymE2, endSymE2;
+        GADGET_CHECK_RETURN_FALSE(gtPlus_util_.findSymmetricSampledRegion(startE2, endE2, E2/2, startSymE2, endSymE2));
+
+        // the reference kspace for kernel estimation: the symmetric region, all channels and all N
+        hoNDArray<T> src, dst;
+        std::vector<size_t> start(5), size(5);
+
+        start[0] = startSymRO;
+        start[1] = startSymE1;
+        start[2] = startSymE2;
+        start[3] = 0;
+        start[4] = 0;
+
+        size[0] = endSymRO-startSymRO+1;
+        size[1] = endSymE1-startSymE1+1;
+        size[2] = endSymE2-startSymE2+1;;
+        size[3] = CHA;
+        size[4] = N;
+        // src is cropped from the conjugate-symmetric copy, dst from kspace itself
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::cropUpTo10DArray(buffer3DT, src, start, size));
+        GADGET_CHECK_RETURN_FALSE(cropUpTo10DArray(kspace, dst, start, size));
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, src, "src_FengHuang");
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, dst, "dst_FengHuang");
+        // optionally replace src and dst by the output of averageKSpace5D so that one kernel set serves all N
+        if ( workOrder3DT.partialFourier_FengHuang_sameKernel_allN_ )
+        {
+            hoNDArray<T> ave4D;
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(src, ave4D));
+            src = ave4D;
+
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.averageKSpace5D(dst, ave4D));
+            dst = ave4D;
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, src, "src_ave4D_FengHuang");
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, dst, "dst_ave4D_FengHuang");
+        }
+
+        // estimate the kernels
+        ho6DArray<T> kernel; // [RO E1 E2 srcCHA 1 N]
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("calibFengHuang"));
+        GADGET_CHECK_RETURN_FALSE(this->calibFengHuang(workOrder3DT, src, dst, kernel));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        // perform the recon
+        if ( workOrder3DT.partialFourier_FengHuang_transitBand_==0 )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performReconFangHuang"));
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder3DT, buffer3DT, kspace, startRO, endRO, startE1, endE1, startE2, endE2, kernel));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+        else
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("performReconFangHuang with transition band"));
+
+            int tb =  (int)workOrder3DT.partialFourier_FengHuang_transitBand_;
+            // keep the unmodified ranges for the transition-band copy; start*/end* are shrunk by tb below
+            int sRO(startRO), eRO(endRO), sE1(startE1), eE1(endE1), sE2(startE2), eE2(endE2);
+            // a bound is moved inwards by tb only where it is not already at the edge of the dimension
+            if ( startRO > 0 )
+            {
+                startRO += tb;
+                if ( startRO > RO ) startRO = 0;
+            }
+
+            if ( endRO < RO-1 )
+            {
+                endRO -= tb;
+                if ( endRO < 0 ) endRO = RO-1;
+            }
+
+            if ( startRO > endRO )    // shrinking emptied the range: fall back to the full dimension
+            {
+                startRO = 0;
+                endRO = RO-1;
+            }
+
+            if ( startE1 > 0 )
+            {
+                startE1 += tb;
+                if ( startE1 > E1 ) startE1 = 0;
+            }
+
+            if ( endE1 < E1-1 )
+            {
+                endE1 -= tb;
+                if ( endE1 < 0 ) endE1 = E1-1;
+            }
+
+            if ( startE1 > endE1 )
+            {
+                startE1 = 0;
+                endE1 = E1-1;
+            }
+            // NOTE(review): E2 is shrunk by tb (transitBand_) although the copy below passes transitBand_E2_ for E2 - confirm this is intended
+            if ( startE2 > 0 )
+            {
+                startE2 += tb;
+                if ( startE2 > E2 ) startE2 = 0;
+            }
+
+            if ( endE2 < E2-1 )
+            {
+                endE2 -= tb;
+                if ( endE2 < 0 ) endE2 = E2-1;
+            }
+
+            if ( startE2 > endE2 )
+            {
+                startE2 = 0;
+                endE2 = E2-1;
+            }
+            // reconstruct into a separate array using the shrunk ranges
+            hoNDArrayMemoryManaged<T> buffer3DT_partial_fourier_kspaceIter(kspace.get_dimensions(), gtPlus_mem_manager_);
+            GADGET_CHECK_RETURN_FALSE(this->performReconFangHuang(workOrder3DT, buffer3DT, 
+                    buffer3DT_partial_fourier_kspaceIter, startRO, endRO, startE1, endE1, startE2, endE2, kernel));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, buffer3DT_partial_fourier_kspaceIter, "kspace_FengHuang_recon");
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_FengHuang_original");
+            // combine kspace with the reconstruction over the original ranges and the configured band widths
+            GADGET_CHECK_RETURN_FALSE(gtPlus_util_.copyAlongROE1E2TransitionBand(kspace, buffer3DT_partial_fourier_kspaceIter, 
+                    sRO, eRO, sE1, eE1, sE2, eE2, workOrder3DT.partialFourier_FengHuang_transitBand_, 
+                    workOrder3DT.partialFourier_FengHuang_transitBand_, workOrder3DT.partialFourier_FengHuang_transitBand_E2_));
+
+            kspace = buffer3DT_partial_fourier_kspaceIter;
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace_after_FengHuang");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performPartialFourierFengHuangRecon(WorkOrderType& workOrder3DT, hoNDArray<T>& kspace) ... ");
+        return false;
+    }
+
+    return true;
+}
+// Estimates one kx*ky*kz kernel per 3-D volume (srcCHA*N of them) mapping src neighbourhoods to dst samples via Tikhonov-regularised least squares; returns false on size mismatch, a too-small region, or an exception.
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::calibFengHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(src.dimensions_equal(&dst));
+
+        size_t RO = src.get_size(0);
+        size_t E1 = src.get_size(1);
+        size_t E2 = src.get_size(2);
+        size_t srcCHA = src.get_size(3);
+        size_t N = src.get_size(4);
+
+        size_t kx = workOrder3DT.partialFourier_FengHuang_kSize_RO_;
+        size_t ky = workOrder3DT.partialFourier_FengHuang_kSize_E1_;
+        size_t kz = workOrder3DT.partialFourier_FengHuang_kSize_E2_;
+        // kernel sizes are made odd so that every kernel has a centre sample
+        if ( kx%2 == 0 ) kx++;
+        if ( ky%2 == 0 ) ky++;
+        if ( kz%2 == 0 ) kz++;
+        GADGET_CHECK_RETURN_FALSE( (RO>=kx) && (E1>=ky) && (E2>=kz) );    // otherwise end* falls below start* and the loops below index out of range
+        int halfKx = (int)kx/2;
+        int halfKy = (int)ky/2;
+        int halfKz = (int)kz/2;
+
+        // the cross-channel kernel is not estimated
+        kernel.createArray(kx, ky, kz, srcCHA, 1, N);
+
+        int ii=0;
+        int num = N*srcCHA;    // number of 3-D volumes, one kernel each
+        // centre positions are limited to points whose whole kernel neighbourhood lies inside the volume
+        int startRO = halfKx;
+        int endRO = RO - halfKx - 1;
+
+        int startE1 = halfKy;
+        int endE1 = E1 - halfKy - 1;
+
+        int startE2 = halfKz;
+        int endE2 = E2 - halfKz - 1;
+        // A: one row per centre position, one column per kernel tap; B: the matching dst sample
+        int rowA, colA, rowB, colB;
+        rowA = (endE2-startE2+1)*(endE1-startE1+1)*(endRO-startRO+1); 
+        colA = kx*ky*kz;
+
+        rowB = rowA;
+        colB = 1;
+
+        double thresReg = workOrder3DT.partialFourier_FengHuang_thresReg_;
+
+        #ifdef USE_OMP
+            omp_set_nested(1);
+        #endif // USE_OMP
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, E2, srcCHA, N, kx, ky, kz, rowA, colA, rowB, colB, startRO, endRO, startE1, endE1, startE2, endE2, halfKx, halfKy, halfKz, thresReg) if ( num > 1 ) num_threads( (num<16 ? num : 16) )
+        #else
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, E2, srcCHA, N, kx, ky, kz, src, dst, kernel, rowA, colA, rowB, colB, startRO, endRO, startE1, endE1, startE2, endE2, halfKx, halfKy, halfKz, thresReg) if ( num > 1 ) num_threads( (num<16 ? num : 16) )
+        #endif
+        {
+           /* hoNDArrayMemoryManaged<T> A_mem(colA, rowA, gtPlus_mem_manager_);
+            hoNDArrayMemoryManaged<T> B_mem(colB, rowB, gtPlus_mem_manager_);
+            hoNDArrayMemoryManaged<T> K_mem(colB, colA, gtPlus_mem_manager_);*/
+            // declared inside the parallel block, so each thread works on its own matrices
+            hoNDArray<T> A_mem(rowA, colA);
+            hoNDArray<T> B_mem(rowB, colB);
+            hoNDArray<T> K_mem(colA, colB);
+
+            hoMatrix<T> A(rowA, colA, A_mem.begin());
+            hoMatrix<T> B(rowB, colB, B_mem.begin());
+            hoMatrix<T> K(colA, colB, K_mem.begin());
+
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                ho3DArray<T> src3D(RO, E1, E2, const_cast<T*>(src.begin())+ii*RO*E1*E2);    // 3-D volume number ii of src
+                ho3DArray<T> dst3D(RO, E1, E2, const_cast<T*>(dst.begin())+ii*RO*E1*E2);    // 3-D volume number ii of dst
+
+                size_t ro, e1, e2, row(0);
+                int x, y, z;
+
+                for ( e2=startE2; e2<=endE2; e2++ )
+                {
+                    for ( e1=startE1; e1<=endE1; e1++ )
+                    {
+                        for ( ro=startRO; ro<=endRO; ro++ )
+                        {
+                            // gather the kernel neighbourhood of (ro, e1, e2) into row 'row' of A
+                            size_t colInd(0);
+                            for ( z=-halfKz; z<=halfKz; z++ )
+                            {
+                                for ( y=-halfKy; y<=halfKy; y++ )
+                                {
+                                    for ( x=-halfKx; x<=halfKx; x++ )
+                                    {
+                                        A(row, colInd++) = src3D(ro+x, e1+y, e2+z);
+                                    }
+                                }
+                            }
+
+                            B(row, 0) = dst3D(ro, e1, e2);
+
+                            row++;
+                        }
+                    }
+                }
+                // solve for K from A and B with regularisation threshold thresReg
+                Gadgetron::SolveLinearSystem_Tikhonov(A, B, K, thresReg);
+                // store the kx*ky*kz coefficients as kernel number ii
+                memcpy(kernel.begin()+ii*kx*ky*kz, K.begin(), sizeof(T)*kx*ky*kz);
+            }
+        }
+
+        #ifdef USE_OMP
+            omp_set_nested(0);
+        #endif // USE_OMP
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::calibFengHuang(WorkOrderType& workOrder3DT, const hoNDArray<T>& src, const hoNDArray<T>& dst, ho6DArray<T>& kernel) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/**
+    Apply a previously calibrated kernel to fill the k-space samples lying OUTSIDE the box
+    [startRO, endRO] x [startE1, endE1] x [startE2, endE2].
+
+    For every such sample, the kx*ky*kz neighbourhood (with periodic wrap-around at the array
+    borders) is read from kspaceConj, multiplied with the kernel of the corresponding channel,
+    and the result is written to the same location of kspace. Samples inside the box are left untouched.
+
+    @param workOrder3DT  work order (not read by this function)
+    @param kspaceConj    source array [RO E1 E2 CHA N]; must have the same dimensions as kspace
+    @param kspace        destination array [RO E1 E2 CHA N], modified in place
+    @param startRO,endRO,startE1,endE1,startE2,endE2  inclusive bounds of the region that is NOT filled
+    @param kernel        kernel array; sizes 0..2 give kx, ky, kz and size 5 must be 1 or N
+    @return true on success; false if a check fails or an exception is caught
+*/
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::performReconFangHuang(WorkOrderType& workOrder3DT, 
+                                                const hoNDArray<T>& kspaceConj, hoNDArray<T>& kspace, 
+                                                int startRO, int endRO, int startE1, int endE1, 
+                                                int startE2, int endE2, ho6DArray<T>& kernel)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(kspaceConj.dimensions_equal(&kspace));
+
+        // coordinates are stored as short to keep the coefficient tables small
+        short RO = (short)kspace.get_size(0);
+        short E1 = (short)kspace.get_size(1);
+        short E2 = (short)kspace.get_size(2);
+        size_t CHA = kspace.get_size(3);
+        size_t N = kspace.get_size(4);
+
+        size_t kx = kernel.get_size(0);
+        size_t ky = kernel.get_size(1);
+        size_t kz = kernel.get_size(2);
+
+        int halfKx = kx/2;
+        int halfKy = ky/2;
+        int halfKz = kz/2;
+
+        size_t kerN = kernel.get_size(5);
+        GADGET_CHECK_RETURN_FALSE( (kerN==1) || (kerN==N) );
+
+        int num = CHA*N;
+
+        // number of samples to fill = all samples minus the samples inside the box;
+        // widen to long long BEFORE multiplying so the products cannot overflow int
+        long long rowD = (long long)RO*E1*E2 
+                        - ( (long long)(endE2-startE2+1) * (endE1-startE1+1) * (endRO-startRO+1) );
+        int colD = kx*ky*kz;
+
+        // coeffX/Y/Z(col, row) : wrapped RO/E1/E2 coordinate of the col-th kernel tap for the row-th sample
+        ho2DArray<short> coeffX(colD, rowD);
+        short* pCx = coeffX.begin();
+
+        ho2DArray<short> coeffY(colD, rowD);
+        short* pCy = coeffY.begin();
+
+        ho2DArray<short> coeffZ(colD, rowD);
+        short* pCz = coeffZ.begin();
+
+        short ro, e1, e2;
+        long long row(0);
+        int x, y, z;
+
+        // rowInd(:, row) : (ro, e1, e2) coordinate of the row-th sample to be filled
+        ho2DArray<short> rowInd(3, rowD);
+        short* pRowInd = rowInd.begin();
+
+        // offsetX/Y/Z(col) : relative displacement of the col-th kernel tap
+        hoNDArray<short> offsetX(colD);
+        short* pOffsetX = offsetX.begin();
+
+        hoNDArray<short> offsetY(colD);
+        short* pOffsetY = offsetY.begin();
+
+        hoNDArray<short> offsetZ(colD);
+        short* pOffsetZ = offsetZ.begin();
+
+        int colInd(0);
+        for ( z=-halfKz; z<=halfKz; z++ )
+        {
+            for ( y=-halfKy; y<=halfKy; y++ )
+            {
+                for ( x=-halfKx; x<=halfKx; x++ )
+                {
+                    offsetX(colInd) = x;
+                    offsetY(colInd) = y;
+                    offsetZ(colInd) = z;
+                    colInd++;
+                }
+            }
+        }
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("performReconFangHuang - compute coeff array"));
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("performReconFangHuang - compute coeff array - internal"));
+
+        // enumerate, in storage order, every sample outside the box
+        for ( e2=0; e2<E2; e2++ )
+        {
+            for ( e1=0; e1<E1; e1++ )
+            {
+                for ( ro=0; ro<RO; ro++ )
+                {
+                    if ( (ro>=startRO) && (ro<=endRO) && (e1>=startE1) && (e1<=endE1) && (e2>=startE2) && (e2<=endE2) )
+                    {
+                        continue;
+                    }
+
+                    short* pRowIndCurr = pRowInd + row*3;
+
+                    pRowIndCurr[0] = ro;
+                    pRowIndCurr[1] = e1;
+                    pRowIndCurr[2] = e2;
+
+                    row++;
+                }
+            }
+        }
+
+        // unwrapped tap coordinates = sample coordinate + tap displacement.
+        // The per-row pointer is declared inside the loop so that every thread owns its copy;
+        // sharing one pointer between the threads is a data race.
+        long long r;
+        #pragma omp parallel for default(none) private(r) shared(rowD, colD, pCx, pCy, pCz, pRowInd, pOffsetX, pOffsetY, pOffsetZ)
+        for ( r=0; r<rowD; r++ )
+        {
+            long long offsetC = r*colD;
+            const short* pRowIndCurr = pRowInd + r*3;
+
+            for ( int colInd=0; colInd<colD; colInd++ )
+            {
+                pCx[offsetC+colInd] = pRowIndCurr[0]+pOffsetX[colInd];
+                pCy[offsetC+colInd] = pRowIndCurr[1]+pOffsetY[colInd];
+                pCz[offsetC+colInd] = pRowIndCurr[2]+pOffsetZ[colInd];
+            }
+        }
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+        // periodic boundary condition : wrap coordinates that fall outside the array once
+        #pragma omp parallel for default(none) private(r) shared(rowD, colD, pCx, pCy, pCz, RO, E1, E2)
+        for ( r=0; r<rowD; r++ )
+        {
+            for ( int c=0; c<colD; c++ )
+            {
+                long long offset = c + r*colD;
+
+                if ( pCx[offset] < 0 )
+                {
+                    pCx[offset] += RO;
+                }
+                else if ( pCx[offset] > RO-1 )
+                {
+                    pCx[offset] -= RO;
+                }
+
+                if ( pCy[offset] < 0 )
+                {
+                    pCy[offset] += E1;
+                }
+                else if ( pCy[offset] > E1-1 )
+                {
+                    pCy[offset] -= E1;
+                }
+
+                if ( pCz[offset] < 0 )
+                {
+                    pCz[offset] += E2;
+                }
+                else if ( pCz[offset] > E2-1 )
+                {
+                    pCz[offset] -= E2;
+                }
+            }
+        }
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        int ii;
+        int numOfThreads = ((num>4) ? 4 : num);
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, E2, CHA, N, kerN, rowD, colD, coeffX, coeffY, coeffZ, pCx, pCy, pCz) if ( num > 1 ) num_threads( numOfThreads ) 
+        #else
+            #pragma omp parallel default(none) private(ii) shared(num, RO, E1, E2, CHA, N, kerN, kspaceConj, kspace, kernel, rowD, colD, coeffX, coeffY, coeffZ, pCx, pCy, pCz) if ( num > 1 ) num_threads( numOfThreads )
+        #endif
+        {
+            // per-thread work buffers : data matrix D, kernel vector K and result vector R
+            hoNDArrayMemoryManaged<T> D_mem(rowD, colD, gtPlus_mem_manager_);
+
+            hoMatrix<T> D(rowD, colD, D_mem.begin());
+            T* pD = D.begin();
+
+            hoMatrix<T> K(colD, 1);
+            hoMatrix<T> R(rowD, 1);
+
+            // one iteration per [RO E1 E2] volume, i.e. per (channel, n) pair
+            #pragma omp for
+            for ( ii=0; ii<num; ii ++ )
+            {
+                ho3DArray<T> src3D(RO, E1, E2, const_cast<T*>(kspaceConj.begin())+ii*RO*E1*E2);
+                ho3DArray<T> dst3D(RO, E1, E2, kspace.begin()+ii*RO*E1*E2);
+
+                long long row;
+
+                // gather the neighbourhood of every sample to be filled
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("fill data matrix ... "));
+                #pragma omp parallel for private(row) shared(colD, rowD, D, src3D, pD)
+                for ( row=0; row<rowD; row++ )
+                {
+                    for ( int col=0; col<colD; col++ )
+                    {
+                        long long offset = col + row*colD;
+                        pD[offset] = src3D(pCx[offset], pCy[offset], pCz[offset]);
+                    }
+                }
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                if ( kerN == 1 )
+                {
+                    // a single kernel set is shared by all N : pick the kernel of the current channel
+                    memcpy(K.begin(), kernel.begin()+(ii%CHA)*colD, sizeof(T)*colD);
+                }
+                else
+                {
+                    memcpy(K.begin(), kernel.begin()+ii*colD, sizeof(T)*colD);
+                }
+
+                // R = D*K
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("matrix multiplication ... "));
+                Gadgetron::GeneralMatrixProduct_gemm(R, D, false, K, false);
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+
+                // the centre tap has zero displacement, so its coordinate is the sample itself
+                size_t colCenter = colD/2;
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.start("fill the result array ... "));
+                #pragma omp parallel for private(row) default(none) shared(rowD, dst3D, colCenter, coeffX, coeffY, coeffZ, R)
+                for ( row=0; row<rowD; row++ )
+                {
+                    dst3D( coeffX(colCenter, row), coeffY(colCenter, row), coeffZ(colCenter, row) ) = R(row, 0);
+                }
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer2_.stop());
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::performReconFangHuang(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/**
+    Estimate how many RO lines one cloud job should contain.
+
+    The RO dimension is divided among the computing nodes (neighbouring jobs overlap by overlapBetweenJobs lines).
+    While the estimated number of bytes of one job exceeds maxNumOfBytesPerJob minus a 64 MB safety margin,
+    the number of jobs is doubled. The search stops as soon as the job size cannot shrink any further,
+    so the function always terminates even if the budget cannot be met.
+
+    @param workOrder3DT         work order; data_ and kernelIm_ supply the array sizes
+    @param maxNumOfBytesPerJob  upper limit on the memory of one job, in bytes
+    @param overlapBetweenJobs   number of RO lines shared by neighbouring jobs
+    @param numOfNodes           number of nodes before the computing-power adjustment
+    @param jobSize              [out] number of RO lines per job
+    @return true on success; false if a check fails or an exception is caught
+*/
+template <typename T> 
+bool gtPlusReconWorker3DT<T>::
+estimateJobSize(gtPlusReconWorkOrder<T>* workOrder3DT, size_t maxNumOfBytesPerJob, size_t overlapBetweenJobs, size_t numOfNodes, size_t& jobSize)
+{
+    try
+    {
+        size_t nodeN = numOfNodes;
+        GADGET_CHECK_RETURN_FALSE(this->computeEffectiveNodeNumberBasedOnComputingPowerIndex(workOrder3DT, nodeN));
+        if ( workOrder3DT->job_perform_on_control_node_ ) nodeN++;
+
+        // nodeN is used as a divisor below
+        if ( nodeN == 0 ) nodeN = 1;
+
+        GADGET_MSG("GtPlus Cloud 3DT - job_perform_on_control_node is " << workOrder3DT->job_perform_on_control_node_  << " - nodeN is " << nodeN << " - overlapBetweenJobs is " << overlapBetweenJobs << " ... ");
+
+        // adjust jobN according to cloud size
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+        size_t totalJobNum = RO;
+        jobSize = (size_t)std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+
+        // one job holds the image domain kernel plus two [E1 E2 srcCHA] buffers per RO line
+        size_t numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobSize + 2*E1*E2*srcCHA*jobSize );
+
+        // here a 64Mb graceful size is given to job
+        const double maxBytesAllowed = (double)maxNumOfBytesPerJob - 64.0*1024*1024;
+
+        while ( numOfBytesPerJob > maxBytesAllowed )
+        {
+            nodeN *= 2;
+            size_t reducedJobSize = (size_t)std::ceil( (double)(totalJobNum+overlapBetweenJobs*(nodeN-1))/(double)nodeN );
+
+            // once doubling the number of jobs no longer reduces the job size, it never will;
+            // stop here instead of looping forever
+            if ( reducedJobSize >= jobSize )
+            {
+                GADGET_MSG("GtPlus Cloud 3DT - jobSize cannot be reduced below " << jobSize << " to meet the memory limit ... ");
+                break;
+            }
+
+            jobSize = reducedJobSize;
+            numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobSize + 2*E1*E2*srcCHA*jobSize );
+        }
+
+        GADGET_MSG("GtPlus Cloud 3DT - jobSize is " << jobSize << "; every job has " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DT<T>::estimateJobSize(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
new file mode 100644
index 0000000..00a38bd
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTGRAPPA.h
@@ -0,0 +1,621 @@
+/** \file   gtPlusISMRMRDReconWorker3DTGRAPPA.h
+    \brief  Implement the 3DT GRAPPA reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker3DT.h"
+#include "gtPlusGRAPPA.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// 3DT reconstruction worker that uses GRAPPA (member grappa_) for calibration and unwrapping.
+/// All processing steps are virtual overrides of the gtPlusReconWorker3DT<T> interface.
+template <typename T> 
+class gtPlusReconWorker3DTGRAPPA : public gtPlusReconWorker3DT<T>
+{
+public:
+
+    typedef gtPlusReconWorker3DT<T> BaseClass;
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+
+    gtPlusReconWorker3DTGRAPPA() : BaseClass() {}
+    virtual ~gtPlusReconWorker3DTGRAPPA() {}
+
+    /// run the reconstruction for one work order
+    virtual bool performRecon(WorkOrderType* workOrder3DT);
+
+    /// allocate the kernel, the image domain kernel and, if needed, the unmixing coefficients / g-factor
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT);
+    /// calibrate the GRAPPA kernel for the usedN-th repetition of the reference data
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN);
+
+    /// apply the calibrated kernel to the undersampled data
+    virtual bool performUnwrapping(WorkOrderType* workOrder3DT, const hoNDArray<T>& data);
+
+    /// true if the full k-space (and not only the unmixed image) has to be reconstructed
+    virtual bool computeKSpace(WorkOrderType* workOrder3DT);
+
+    /// true if the reconstruction is to be split into jobs along RO; jobN receives the job size
+    virtual bool splitJob(WorkOrderType* workOrder3DT, size_t& jobN);
+
+    // timing / debugging facilities inherited from the base class
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+    // working buffers inherited from the base class
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+
+    /// GRAPPA implementation used by the calibration and unwrapping steps
+    gtPlusGRAPPA<T> grappa_;
+};
+
+/// Decide whether the full k-space has to be reconstructed for this work order.
+/// This is the case when
+///   - the calibration is embedded and either the full resolution coil map or the reference fill-back is requested,
+///   - the calibration is separate and the full resolution coil map is requested, or
+///   - the work order explicitly asks for the k-space (recon_kspace_needed_).
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    const bool embeddedCalib = ( workOrder3DT->CalibMode_ == ISMRMRD_embedded );
+    if ( embeddedCalib && (workOrder3DT->embedded_fullres_coilmap_ || workOrder3DT->embedded_ref_fillback_) )
+    {
+        return true;
+    }
+
+    const bool separateCalib = ( workOrder3DT->CalibMode_ == ISMRMRD_separate );
+    if ( separateCalib && workOrder3DT->separate_fullres_coilmap_ )
+    {
+        return true;
+    }
+
+    return workOrder3DT->recon_kspace_needed_ ? true : false;
+}
+
+/**
+    Decide whether the unwrapping has to be split into several jobs along RO.
+
+    A split is requested if
+      - job_num_of_N_ is set and smaller than RO, or
+      - job_max_Megabytes_ is set and the image domain kernel of all RO lines does not fit into it.
+    No split is performed if the full k-space is not reconstructed (see computeKSpace).
+
+    @param workOrder3DT  work order; kernel_ must already be allocated because it supplies srcCHA/dstCHA
+    @param jobN          [out] number of RO lines per job. When the split is triggered by the
+                         memory limit, this is the number of lines fitting into that limit (at least 1);
+                         otherwise it is job_num_of_N_.
+    @return true if the reconstruction should be split into jobs
+*/
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+splitJob(gtPlusReconWorkOrder3DT<T>* workOrder3DT, size_t& jobN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+
+    size_t srcCHA = workOrder3DT->kernel_->get_size(3);
+    size_t dstCHA = workOrder3DT->kernel_->get_size(4);
+
+    jobN = workOrder3DT->job_num_of_N_;
+    size_t jobMegaBytes = workOrder3DT->job_max_Megabytes_;
+
+    bool splitJobs = (jobN>0 && RO>jobN);
+    if ( !splitJobs && jobMegaBytes>0 )
+    {
+        // memory needed by the image domain kernel of one RO line
+        const size_t bytesPerRO = E1*E2*srcCHA*dstCHA*sizeof(T);
+        const size_t megaBytesPerRO = bytesPerRO/1024/1024;
+
+        // largest number of RO lines fitting into the memory limit;
+        // lines smaller than 1 MB are handled in bytes to avoid a division by zero
+        size_t maxJobN = RO;
+        if ( megaBytesPerRO > 0 )
+        {
+            maxJobN = jobMegaBytes/megaBytesPerRO;
+        }
+        else if ( bytesPerRO > 0 )
+        {
+            maxJobN = jobMegaBytes*1024*1024/bytesPerRO;
+        }
+
+        if ( maxJobN < RO )
+        {
+            splitJobs = true;
+            // report the job size to the caller; a job holds at least one line
+            jobN = (maxJobN>0) ? maxJobN : 1;
+        }
+
+        GADGET_MSG("grappa - 3DT - size of largest job : " << maxJobN);
+    }
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+    if ( !reconKSpace )
+    {
+        splitJobs = false;
+    }
+
+    return splitJobs;
+}
+
+/**
+    Allocate all arrays filled by the calibration (performCalibImpl).
+
+      - kernel_          : [kRO kNE1 kNE2 srcCHA dstCHA oNE1 oNE2 refN], cleared
+      - kernelIm_        : [RO E1 E2 srcCHA dstCHA refN] if the recon is not split into jobs,
+                           [convKE1 convKE2 RO srcCHA dstCHA refN] otherwise
+      - unmixingCoeffIm_ : [RO E1 E2 srcCHA refN] and gfactor_ [RO E1 E2 refN], both cleared,
+                           only if the full k-space is not reconstructed
+
+    @param ref_src       reference data of the source channels (not read here)
+    @param ref_dst       reference data of the destination channels; supplies dstCHA and refN
+    @param workOrder3DT  work order holding the arrays to allocate
+    @return true on success; false if the kernel pattern cannot be computed
+*/
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT)
+{
+    grappa_.performTiming_ = performTiming_;
+
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+    size_t refN = ref_dst.get_size(4);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+    std::vector<int> kE1, oE1;
+    bool fitItself = true;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE1, oE1, workOrder3DT->acceFactorE1_, workOrder3DT->grappa_kSize_E1_, fitItself));
+
+    std::vector<int> kE2, oE2;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE2, oE2, workOrder3DT->acceFactorE2_, workOrder3DT->grappa_kSize_E2_, fitItself));
+
+    size_t kRO = workOrder3DT->grappa_kSize_RO_;
+    size_t kNE1 = workOrder3DT->grappa_kSize_E1_;
+    size_t oNE1 = oE1.size();
+
+    size_t kNE2 = workOrder3DT->grappa_kSize_E2_;
+    // number of output points along E2 comes from the E2 pattern, not from the E1 pattern
+    size_t oNE2 = oE2.size();
+
+    workOrder3DT->kernel_->create(kRO, kNE1, kNE2, srcCHA, dstCHA, oNE1, oNE2, refN);
+    Gadgetron::clear(workOrder3DT->kernel_.get());
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    if ( !splitJobs )
+    {
+        // full image domain kernel
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("allocate image domain kernel ... "));
+        if ( gtPlus_mem_manager_ )
+        {
+            if ( workOrder3DT->kernelIm_->get_number_of_elements() != (size_t)RO*E1*E2*srcCHA*dstCHA*refN )
+            {
+                workOrder3DT->kernelIm_->create(RO, E1, E2, srcCHA, dstCHA, refN, (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*(size_t)RO*E1*E2*srcCHA*dstCHA*refN)));
+            }
+        }
+        else
+        {
+            workOrder3DT->kernelIm_->create(RO, E1, E2, srcCHA, dstCHA, refN);
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+    else
+    {
+        // the kernel is converted to image domain only along RO; along E1/E2 it stays a
+        // convolution kernel whose size is derived from the largest kernel offset
+        int maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        int convKE1 = 2*maxKE1+1;
+
+        int maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        int convKE2 = 2*maxKE2+1;
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("allocate image domain kernel only along RO ... "));
+        if ( gtPlus_mem_manager_ )
+        {
+            // NOTE(review): unlike the branch below, the managed buffer is not zeroed here - confirm this is intended
+            if ( workOrder3DT->kernelIm_->get_number_of_elements() != (size_t)RO*convKE1*convKE2*srcCHA*dstCHA*refN )
+            {
+                workOrder3DT->kernelIm_->create(convKE1, convKE2, RO, srcCHA, dstCHA, refN, (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*(size_t)RO*convKE1*convKE2*srcCHA*dstCHA*refN)));
+            }
+        }
+        else
+        {
+            workOrder3DT->kernelIm_->create(convKE1, convKE2, RO, srcCHA, dstCHA, refN);
+            // pre-set to zero is needed here
+            memset(workOrder3DT->kernelIm_->begin(), 0, workOrder3DT->kernelIm_->get_number_of_bytes());
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+    }
+
+    if ( !reconKSpace )
+    {
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("allocate unmixing coefficient ... "));
+        if ( gtPlus_mem_manager_ )
+        {
+            if ( workOrder3DT->unmixingCoeffIm_->get_number_of_elements() != (size_t)RO*E1*E2*srcCHA*refN )
+            {
+                workOrder3DT->unmixingCoeffIm_->create(RO, E1, E2, srcCHA, refN, (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*(size_t)RO*E1*E2*srcCHA*refN)));
+            }
+        }
+        else
+        {
+            workOrder3DT->unmixingCoeffIm_->create(RO, E1, E2, srcCHA, refN);
+        }
+        Gadgetron::clear(workOrder3DT->unmixingCoeffIm_.get());
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        workOrder3DT->gfactor_.create(RO, E1, E2, refN);
+        Gadgetron::clear(&(workOrder3DT->gfactor_));
+    }
+
+    return true;
+}
+
+/**
+    Calibrate the GRAPPA kernel for the usedN-th repetition of the reference data and convert it
+    to image domain. The arrays written here must have been allocated by performCalibPrep.
+
+      - kernel_   : the usedN-th kernel is estimated from ref_src / ref_dst
+      - kernelIm_ : the usedN-th image domain kernel (full, or only along RO if the recon is split into jobs)
+      - unmixingCoeffIm_ / gfactor_ : computed from the image domain kernel and the coil map,
+                    only if the recon is not split and the full k-space is not reconstructed
+
+    @param ref_src       reference data of the source channels [refRO refE1 refE2 srcCHA refN]
+    @param ref_dst       reference data of the destination channels [refRO refE1 refE2 dstCHA refN]
+    @param workOrder3DT  work order holding the calibration parameters and the output arrays
+    @param usedN         index of the repetition to calibrate
+    @return true on success; false if a checked step fails
+*/
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t srcCHA = workOrder3DT->data_.get_size(3);
+
+    size_t refRO = ref_dst.get_size(0);
+    size_t refE1 = ref_dst.get_size(1);
+    size_t refE2 = ref_dst.get_size(2);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+
+    std::vector<int> kE1, oE1;
+    bool fitItself = true;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE1, oE1, workOrder3DT->acceFactorE1_, workOrder3DT->grappa_kSize_E1_, fitItself));
+
+    std::vector<int> kE2, oE2;
+    GADGET_CHECK_RETURN_FALSE(grappa_.kerPattern(kE2, oE2, workOrder3DT->acceFactorE2_, workOrder3DT->grappa_kSize_E2_, fitItself));
+
+    size_t kRO = workOrder3DT->grappa_kSize_RO_;
+    size_t kNE1 = workOrder3DT->grappa_kSize_E1_;
+    size_t oNE1 = oE1.size();
+
+    size_t kNE2 = workOrder3DT->grappa_kSize_E2_;
+    // number of output points along E2 comes from the E2 pattern, not from the E1 pattern
+    size_t oNE2 = oE2.size();
+
+    // views on the usedN-th repetition of the reference data
+    ho4DArray<T> acsSrc(refRO, refE1, refE2, srcCHA, const_cast<T*>(ref_src.begin()+usedN*refRO*refE1*refE2*srcCHA));
+    ho4DArray<T> acsDst(refRO, refE1, refE2, dstCHA, const_cast<T*>(ref_dst.begin()+usedN*refRO*refE1*refE2*dstCHA));
+
+    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsSrc, "acsSrc");
+    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsDst, "acsDst");
+
+    grappa_.calib_use_gpu_  = workOrder3DT->grappa_use_gpu_;
+
+    // view on the usedN-th kernel
+    ho7DArray<T> ker(kRO, kNE1, kNE2, srcCHA, dstCHA, oNE1, oNE2, workOrder3DT->kernel_->begin()+usedN*kRO*kNE1*kNE2*srcCHA*dstCHA*oNE1*oNE2);
+    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D calibration ... "));
+    grappa_.calib3D(acsSrc, acsDst, workOrder3DT->grappa_reg_lamda_, workOrder3DT->grappa_calib_over_determine_ratio_, kRO, kE1, kE2, oE1, oE2, ker);
+    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ker, "ker");
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    if ( !splitJobs )
+    {
+        // full image domain kernel of the usedN-th repetition
+        hoNDArray<T> kIm(RO, E1, E2, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*RO*E1*E2*srcCHA*dstCHA);
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D image domain kernel ... "));
+        grappa_.imageDomainKernel3D(ker, kRO, kE1, kE2, oE1, oE2, RO, E1, E2, kIm);
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        if ( !reconKSpace )
+        {
+            hoNDArray<T> coilMap(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*dstCHA);
+            hoNDArray<T> unmixC(RO, E1, E2, srcCHA);
+            hoNDArray<T> gFactor(RO, E1, E2, workOrder3DT->gfactor_.begin()+usedN*RO*E1*E2);
+
+            this->unmixCoeff(kIm, coilMap, unmixC, gFactor);
+            // scale the g-factor by the total acceleration factor
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::scal(1.0/workOrder3DT->acceFactorE1_/workOrder3DT->acceFactorE2_, gFactor));
+
+            memcpy(workOrder3DT->unmixingCoeffIm_->begin()+usedN*RO*E1*E2*srcCHA, unmixC.begin(), unmixC.get_number_of_bytes());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unmixC, "unmixC");
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, gFactor, "gFactor");
+        }
+    }
+    else
+    {
+        // same convolution kernel size as used for the allocation in performCalibPrep
+        int maxKE1 = std::abs(kE1[0]);
+        if ( std::abs(kE1[kNE1-1]) > maxKE1 )
+        {
+            maxKE1 = std::abs(kE1[kNE1-1]);
+        }
+        int convKE1 = 2*maxKE1+1;
+
+        int maxKE2 = std::abs(kE2[0]);
+        if ( std::abs(kE2[kNE2-1]) > maxKE2 )
+        {
+            maxKE2 = std::abs(kE2[kNE2-1]);
+        }
+        int convKE2 = 2*maxKE2+1;
+
+        hoNDArray<T> kIm(convKE1, convKE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*convKE1*convKE2*RO*srcCHA*dstCHA);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D image domain kernel only along RO ... "));
+        GADGET_CHECK_RETURN_FALSE(grappa_.imageDomainKernelRO3D(ker, kRO, kE1, kE2, oE1, oE2, RO, kIm));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        if ( !debugFolder_.empty() )
+        {
+            // export only the kernel of the first destination channel
+            hoNDArray<T> kImROACha(convKE1, convKE2, RO, srcCHA, kIm.begin());
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImROACha, "kImROACha");
+        }
+    }
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTGRAPPA<T>::
+performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        int n;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+        size_t refN = workOrder3DT->kernelIm_->get_size(5);
+
+        workOrder3DT->complexIm_.create(RO, E1, E2, 1, N);
+
+        hoNDArrayMemoryManaged<T> aliasedIm(gtPlus_mem_manager_);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D compute aliased image ... "));
+        if ( workOrder3DT->downstream_coil_compression_ )
+        {
+            aliasedIm.create(workOrder3DT->data_.get_dimensions());
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->data_, aliasedIm);
+        }
+        else
+        {
+            aliasedIm.create(data_dst.get_dimensions());
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(data_dst, aliasedIm);
+        }
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aliasedIm, "aliasedIm");
+
+        bool recon_kspace = this->computeKSpace(workOrder3DT);
+
+        // if kspace is actually needed
+        if ( recon_kspace )
+        {
+            workOrder3DT->fullkspace_ = data_dst;
+
+            size_t jobN;
+            bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+            if ( splitJobs )
+            {
+                size_t kE1 = workOrder3DT->kernelIm_->get_size(0);
+                size_t kE2 = workOrder3DT->kernelIm_->get_size(1);
+                size_t kRO = workOrder3DT->kernelIm_->get_size(2);
+
+                size_t usedN;
+                if ( (refN<N) || (refN==1) )
+                {
+                    hoNDArray<T> kImPermuted(kE1, kE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin());
+
+                    hoNDArray<T> kImPermutedJob(kE1, kE2, jobN, srcCHA, dstCHA);
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D allocate buffer for kImPermutedZeroFilledJob ... "));
+                    hoNDArrayMemoryManaged<T> kImPermutedZeroFilledJob(E1, E2, jobN, srcCHA, dstCHA, gtPlus_mem_manager_);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    // aliased images
+                    hoNDArray<T> aliasedImPermutedJob(E1, E2, jobN, srcCHA);
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D allocate buffer for aliasedIm permuted ... "));
+                    hoNDArrayMemoryManaged<T> aliasedImPermuted(E1, E2, RO, srcCHA, N, gtPlus_mem_manager_);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permuteROTo3rdDimensionFor3DRecon for aliased images ... "));
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo3rdDimensionFor3DRecon(aliasedIm, aliasedImPermuted));
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    // unwrapped images
+                    hoNDArray<T> unwrappedImPermutedJob(E1, E2, jobN, srcCHA, N);
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D allocate buffer for unwrapped images permuted ... "));
+                    hoNDArrayMemoryManaged<T> unwrappedImPermuted(E1, E2, RO, dstCHA, N, gtPlus_mem_manager_);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    // buffer
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D allocate buffer for unwrapping ... "));
+                    hoNDArrayMemoryManaged<T> buffer3DT_unwrapping(E1, E2, jobN, srcCHA, gtPlus_mem_manager_);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    size_t ro=0;
+                    while ( ro<RO )
+                    {
+                        size_t start = ro;
+                        size_t end = ro+jobN-1;
+                        if ( end >= RO )
+                        {
+                            end = RO-1;
+                            start = end-jobN+1;
+                        }
+
+                        GADGET_MSG("grappa 3D - processing " << start << " to " << end << " ... ");
+
+                        if ( (refN<N) || (refN==1) )
+                        {
+                            hoNDArray<T> kImPermuted(kE1, kE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin());
+
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("cropOver3rdDimension hybrid domain kernel ... "));
+                            GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(kImPermuted, kImPermutedJob, start, end));
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("imageDomainKernelE1E2RO ... "));
+                            GADGET_CHECK_RETURN_FALSE(grappa_.imageDomainKernelE1E2RO(kImPermutedJob, E1, E2, kImPermutedZeroFilledJob));
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("cropOver3rdDimension aliased images ... "));
+                            GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(aliasedImPermuted, aliasedImPermutedJob, start, end));
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D apply image domain kernel for every channel and every job ... "));
+                            this->applyImageDomainKernelImage(aliasedImPermutedJob, kImPermutedZeroFilledJob, buffer3DT_unwrapping, unwrappedImPermutedJob);
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("setSubArrayOver3rdDimension unwrapped images ... "));
+                            GADGET_CHECK_RETURN_FALSE(setSubArrayOver3rdDimension(unwrappedImPermutedJob, unwrappedImPermuted, start, end));
+                            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+                        }
+                        else
+                        {
+                            for ( n=0; n<(int)N; n++ )
+                            {
+                                hoNDArray<T> kImPermuted(kE1, kE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+n*kE1*kE2*RO*srcCHA*dstCHA);
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("cropOver3rdDimension hybrid domain kernel ... "));
+                                GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(kImPermuted, kImPermutedJob, start, end));
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("imageDomainKernelE1E2RO ... "));
+                                GADGET_CHECK_RETURN_FALSE(grappa_.imageDomainKernelE1E2RO(kImPermutedJob, E1, E2, kImPermutedZeroFilledJob));
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                hoNDArray<T> aliasedImPermutedN(E1, E2, RO, srcCHA, aliasedImPermuted.begin()+n*E1*E2*RO*srcCHA);
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("cropOver3rdDimension aliased images ... "));
+                                GADGET_CHECK_RETURN_FALSE(cropOver3rdDimension(aliasedImPermutedN, aliasedImPermutedJob, start, end));
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D apply image domain kernel for every channel and every job ... "));
+                                this->applyImageDomainKernelImage(aliasedImPermutedJob, kImPermutedZeroFilledJob, buffer3DT_unwrapping, unwrappedImPermutedJob);
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("setSubArrayOver3rdDimension unwrapped images ... "));
+                                GADGET_CHECK_RETURN_FALSE(setSubArrayOver3rdDimension(unwrappedImPermutedJob, unwrappedImPermuted, start, end));
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+                            }
+                        }
+
+                        ro += jobN;
+                    }
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permuteROTo3rdDimensionFor3DRecon for unwrapped images ... "));
+                    GADGET_CHECK_RETURN_FALSE(Gadgetron::permute3rdDimensionTo1stDimension(unwrappedImPermuted, workOrder3DT->fullkspace_));
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+                }
+                else
+                {
+                    for ( n=0; n<(int)N; n++ )
+                    {
+                        
+                    }
+                }
+            }
+            else
+            {
+                size_t usedN;
+                if ( (refN<N) || (refN==1) )
+                {
+                    hoNDArray<T> kIm(RO, E1, E2, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin());
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D allocate buffer for unwarpping ... "));
+                    hoNDArrayMemoryManaged<T> buffer3DT_unwrapping(RO, E1, E2, srcCHA, gtPlus_mem_manager_);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D apply image domain kernel for every channel ... "));
+                    this->applyImageDomainKernelImage(aliasedIm, kIm, buffer3DT_unwrapping, workOrder3DT->fullkspace_);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->fullkspace_, "unwarppedIm");
+                }
+                else
+                {
+                    hoNDArrayMemoryManaged<T> buffer3DT_unwrapping(RO, E1, E2, srcCHA, dstCHA, gtPlus_mem_manager_);
+
+                    hoNDArray<T> complexIm(RO, E1, E2, dstCHA);
+                    for ( n=0; n<(int)N; n++ )
+                    {
+                        hoNDArray<T> kIm(RO, E1, E2, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+n*RO*E1*E2*srcCHA*dstCHA);
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kIm, "kIm_n");
+
+                        hoNDArray<T> aliasedImN(RO, E1, E2, srcCHA, aliasedIm.begin()+n*RO*E1*E2*srcCHA);
+
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, aliasedImN, "aliasedIm_n");
+
+                        this->applyImageDomainKernelImage(aliasedImN, kIm, buffer3DT_unwrapping, complexIm);
+                        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexIm, "complexIm_n");
+
+                        memcpy(workOrder3DT->fullkspace_.begin()+n*RO*E1*E2*dstCHA, complexIm.begin(), sizeof(T)*RO*E1*E2*dstCHA);
+                    }
+                }
+            }
+
+            if ( (workOrder3DT->coilMap_->get_size(0)==RO) 
+                && (workOrder3DT->coilMap_->get_size(1)==E1) 
+                && (workOrder3DT->coilMap_->get_size(2)==E2) 
+                && (workOrder3DT->coilMap_->get_size(3)==dstCHA) )
+            {
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D coil combination ... "));
+                gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(workOrder3DT->fullkspace_, *workOrder3DT->coilMap_, workOrder3DT->complexIm_);
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "combined");
+            }
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D go back to kspace ... "));
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft3c(workOrder3DT->fullkspace_);
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        }
+        else
+        {
+            if ( (refN<N) || (refN==1) )
+            {
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D test ... "));
+                hoNDArray<T> unmixCoeff(RO, E1, E2, srcCHA, workOrder3DT->unmixingCoeffIm_->begin());
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("grappa 3D apply unmixing coeff ... "));
+                this->applyUnmixCoeffImage(aliasedIm, unmixCoeff, workOrder3DT->complexIm_);
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "unwarppedIm");
+            }
+            else
+            {
+                for ( n=0; n<(int)N; n++ )
+                {
+                    hoNDArray<T> unmixCoeff(RO, E1, E2, srcCHA, workOrder3DT->unmixingCoeffIm_->begin()+n*RO*E1*E2*srcCHA);
+                    hoNDArray<T> aliasedImN(RO, E1, E2, srcCHA, aliasedIm.begin()+n*RO*E1*E2*srcCHA);
+                    hoNDArray<T> unwarppedIm(RO, E1, E2, 1, workOrder3DT->complexIm_.begin()+n*RO*E1*E2);
+
+                    this->applyUnmixCoeffImage(aliasedImN, unmixCoeff, unwarppedIm);
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedIm, "unwarppedIm");
+                }
+            }
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTGRAPPA<T>::performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Runs the GRAPPA 3DT reconstruction: passes this worker's memory manager to grappa_, then defers to BaseClass::performRecon().
+template <typename T>
+bool gtPlusReconWorker3DTGRAPPA<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    try
+    {
+        // a NULL work order is rejected before anything else happens
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        // hand the memory manager over to grappa_ before the base class is called
+        grappa_.gtPlus_mem_manager_ = this->gtPlus_mem_manager_;
+        GADGET_CHECK_RETURN_FALSE(BaseClass::performRecon(workOrder3DT));
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTGRAPPA<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT) ... ");
+    }
+    return false;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
new file mode 100644
index 0000000..49b3320
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
@@ -0,0 +1,778 @@
+/** \file   gtPlusISMRMRDReconWorker3DTL1SPIRITNCG.h
+    \brief  Implement the 3DT non-linear SPIRIT reconstruction using the non-linear CG solver
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "gtPlusISMRMRDReconWorker3DTSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DOperator.h"
+#include "gtPlusSPIRITNoNullSpace2DTOperator.h"
+#include "gtPlusNCGSolver.h"
+#include "gtPlusWavelet2DOperator.h"
+#include "gtPlusWavelet3DOperator.h"
+#include "gtPlusWaveletNoNullSpace2DOperator.h"
+#include "gtPlusWaveletNoNullSpace3DOperator.h"
+#include "gtPlusDataFidelityOperator.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// 3DT reconstruction worker derived from gtPlusReconWorker3DTSPIRIT<T>; its unwrapping can add a non-linear (NCG) SPIRIT step.
+template <typename T>
+class gtPlusReconWorker3DTL1SPIRITNCG : public gtPlusReconWorker3DTSPIRIT<T>
+{
+public:
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+    typedef gtPlusReconWorker3DTSPIRIT<T> BaseClass;
+
+    gtPlusReconWorker3DTL1SPIRITNCG() : BaseClass() {}
+    virtual ~gtPlusReconWorker3DTL1SPIRITNCG() {}
+
+    // work-order query and default parameter selection
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder);
+
+    // unwrapping; the ROPermuted overloads read RO from the 4th dimension of kspace
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n);
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+    virtual bool performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& res);
+    virtual bool performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& kspaceLinear, hoNDArray<T>& res);
+
+    // names of the dependent base class are re-declared so that member functions can use them unqualified
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::debugFolder_;
+    using BaseClass::performTiming_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+
+//protected::
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+    gtPlusSPIRIT<T> spirit_; // provides imageDomainKernelE1E2RO(), used when the kernel size differs from E1/E2
+};
+
+/// Returns false only when both spirit_perform_nonlinear_ and spirit_use_coil_sen_map_ are set on the work order.
+template <typename T>
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::computeKSpace(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    const bool nonlinearWithCoilMap = workOrder3DT->spirit_perform_nonlinear_ && workOrder3DT->spirit_use_coil_sen_map_;
+    return !nonlinearWithCoilMap;
+}
+
+/** Choose default parameters of the non-linear SPIRIT solver for a 3DT work order.
+ *
+ *  BaseClass::autoReconParameter() is invoked first. The work order is then cast to
+ *  gtPlusReconWorkOrder3DT<T>; if the cast fails, false is returned and this function
+ *  itself writes nothing.
+ *
+ *  When spirit_perform_linear_ is set, three fields of the work order are overwritten:
+ *      spirit_3D_scale_per_chunk_   always true
+ *      spirit_ncg_iter_thres_       always 0.001
+ *      spirit_image_reg_lamda_      picked from acceFactorE1_*acceFactorE2_, see below
+ *
+ *      acceleration      spirit_solve_symmetric_ set      not set
+ *      >= 16             0.0025                           0.0025
+ *      >= 12             0.0025                           0.0025
+ *      >= 9              0.0025                           0.0025
+ *      >= 6              0.002                            0.002
+ *      >= 4              0.0015                           0.002
+ *      otherwise         0.0015                           0.002
+ *
+ *  The rows are tested from top to bottom and the first match is used.
+ *  When spirit_perform_linear_ is not set, this function writes no field.
+ *
+ *  NOTE(review): the return value of BaseClass::autoReconParameter() is not inspected,
+ *  and workOrder is handed to it before the cast is checked - confirm this is intended.
+ *
+ *  \param  workOrder   work order to update
+ *  \return false if workOrder is not a gtPlusReconWorkOrder3DT<T>; true otherwise
+ */
+template <typename T>
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    BaseClass::autoReconParameter(workOrder);
+
+    gtPlusReconWorkOrder3DT<T>* workOrder3DT = dynamic_cast<gtPlusReconWorkOrder3DT<T>*>(workOrder);
+    if ( workOrder3DT == NULL )
+    {
+        return false;
+    }
+
+    // the defaults below are only applied when the linear SPIRIT step is requested
+    if ( !workOrder3DT->spirit_perform_linear_ )
+    {
+        return true;
+    }
+
+    const double acceFactor = workOrder3DT->acceFactorE1_ * workOrder3DT->acceFactorE2_;
+
+    // regularization weight for the symmetric and for the non-symmetric solve, plus the
+    // iteration threshold; every tier keeps its own values so that it can be tuned on its own
+    double lamdaSymmetric;
+    double lamdaNonSymmetric;
+    double iterThres;
+
+    if ( acceFactor>=16 )
+    {
+        lamdaSymmetric = 0.0025;
+        lamdaNonSymmetric = 0.0025;
+        iterThres = 0.001;
+    }
+    else if ( acceFactor>=12 )
+    {
+        lamdaSymmetric = 0.0025;
+        lamdaNonSymmetric = 0.0025;
+        iterThres = 0.001;
+    }
+    else if ( acceFactor>=9 )
+    {
+        lamdaSymmetric = 0.0025;
+        lamdaNonSymmetric = 0.0025;
+        iterThres = 0.001;
+    }
+    else if ( acceFactor>=6 )
+    {
+        lamdaSymmetric = 0.002;
+        lamdaNonSymmetric = 0.002;
+        iterThres = 0.001;
+    }
+    else if ( acceFactor>=4 )
+    {
+        lamdaSymmetric = 0.0015;
+        lamdaNonSymmetric = 0.002;
+        iterThres = 0.001;
+    }
+    else
+    {
+        lamdaSymmetric = 0.0015;
+        lamdaNonSymmetric = 0.002;
+        iterThres = 0.001;
+    }
+
+    // written in this order: scale flag, regularization weight, iteration threshold
+    workOrder3DT->spirit_3D_scale_per_chunk_ = true;
+
+    if ( workOrder3DT->spirit_solve_symmetric_ )
+    {
+        workOrder3DT->spirit_image_reg_lamda_ = lamdaSymmetric;
+    }
+    else
+    {
+        workOrder3DT->spirit_image_reg_lamda_ = lamdaNonSymmetric;
+    }
+
+    workOrder3DT->spirit_ncg_iter_thres_ = iterThres;
+
+    return true;
+}
+
+/// Unwraps one 3D volume by decoupling read-out (RO): ifft along RO, permute RO to the last dimension,
+/// run performUnwarppingImplROPermuted(), permute back and fft along RO into res; n picks the coil map slab used.
+/// Returns false if the work order or its coil map is missing, if a checked step fails, or if an exception is caught.
+template <typename T>
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n)
+{
+    try
+    {
+        // coilMap_ is dereferenced below, so a missing work order or coil map is rejected here
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL && workOrder3DT->coilMap_);
+        // kspace : [RO E1 E2 ...], adj_forward_G_I : [RO E1 E2 srcCHA dstCHA]
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+        size_t srcCHA = adj_forward_G_I.get_size(3);
+        size_t dstCHA = adj_forward_G_I.get_size(4);
+        res.create(kspace.get_dimensions());
+
+        // perform the 3D recon by read-out decoupling
+        hoNDArrayMemoryManaged<T> kspaceIfftRO(RO, E1, E2, srcCHA, gtPlus_mem_manager_);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kspace, kspaceIfftRO));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftRO, "kspaceIfftRO");
+
+        hoNDArrayMemoryManaged<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO, gtPlus_mem_manager_);
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permtue RO to 4th dimension ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo4thDimensionFor3DRecon(kspaceIfftRO, kspaceIfftROPermuted));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftROPermuted, "kspaceIfftROPermuted");
+
+        // permute kernel
+        hoNDArray<T> kerPermuted(E1, E2, srcCHA, dstCHA, RO);
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permute kernel RO to 5th dimension ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteE2To5thDimension( adj_forward_G_I, kerPermuted));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        // permute coil map; slab n of the work order's coil map is viewed as [RO E1 E2 dstCHA]
+        hoNDArray<T> coilMapN(RO, E1, E2, dstCHA, workOrder3DT->coilMap_->begin()+n*RO*E1*E2*dstCHA);
+        hoNDArray<T> coilMapPermuted(E1, E2, dstCHA, RO);
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permtue coil map RO to 4th dimension ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo4thDimensionFor3DRecon(coilMapN, coilMapPermuted));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, coilMapPermuted, "coilMapPermuted");
+
+        hoNDArray<T> resPermuted(E1, E2, dstCHA, RO);
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImplROPermuted(workOrder3DT, kspaceIfftROPermuted, kerPermuted, coilMapPermuted, resPermuted));
+
+        // permute the unwrapped kspace
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permtue RO to 1st dimension ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo1stDimensionFor3DRecon(resPermuted, kspaceIfftRO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        // perform fft along the first dimension
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(kspaceIfftRO, res));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "res_3DSpirit");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImpl(gtPlusReconWorkOrder3DT<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Unwraps k-space whose read-out dimension was permuted to the end (kspace : [E1 E2 srcCHA RO]).
+/// If the first two kernel sizes differ from E1/E2, the kernel is first converted with spirit_.imageDomainKernelE1E2RO().
+/// The base-class linear solve runs when spirit_perform_linear_ is set or the non-linear step is disabled; its result
+/// (or a plain copy of kspace when it is skipped) is the starting point x0_ of the gtPlusNCGSolver used by the non-linear step.
+/// The solver combines a SPIRIT term and a wavelet term; when spirit_data_fidelity_lamda_ > 0 the "NoNullSpace" operators
+/// and a data fidelity term are used instead. RO > 1 selects the 2DT/3D operators, otherwise the 2D ones.
+/// Side effect: kspace is scaled in place by 1/spirit_ncg_scale_factor_ in the non-linear path and is not scaled back here.
+/// Returns false when workOrder3DT is NULL, when a checked step fails, or when an exception is caught.
+template <typename T>
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::
+performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& res)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+        size_t E1 = kspace.get_size(0);
+        size_t E2 = kspace.get_size(1);
+        size_t RO = kspace.get_size(3);
+
+        size_t kerE1 = kernel.get_size(0);
+        size_t kerE2 = kernel.get_size(1);
+        size_t srcCHA = kernel.get_size(2);
+        size_t dstCHA = kernel.get_size(3);
+        size_t kerN = kernel.get_size(5);
+
+        hoNDArray<T>* kerIm = &kernel;
+        hoNDArray<T> kerImE1E2RO;
+        if ( kerE1!=E1 || kerE2!=E2 )
+        {
+            GADGET_MSG("gtPlusReconWorker3DTL1SPIRITNCG, kerE1!=E1 || kerE2!=E2, kernel needs to be converted along E1 and E2 ... ");
+            if ( gtPlus_mem_manager_ )
+            {
+                // the buffer has to cover all six dimensions of the array, kerN included
+                // NOTE(review): no clear() is issued on this path and the block is not handed back to the memory manager in this function - confirm
+                kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN, (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*(size_t)RO*E1*E2*srcCHA*dstCHA*kerN)));
+            }
+            else
+            {
+                kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN);
+                Gadgetron::clear(kerImE1E2RO);
+            }
+            GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelE1E2RO(kernel, E1, E2, kerImE1E2RO));
+            kerIm = &kerImE1E2RO;
+        }
+
+        hoNDArray<T> kspaceLinear(kspace);
+        res = kspace;
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace");
+
+        bool performLinear = workOrder3DT->spirit_perform_linear_;
+        if ( !workOrder3DT->spirit_perform_nonlinear_ ) performLinear = true;
+        if ( performLinear )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit linear solver for 3DT ... "));
+            GADGET_CHECK_RETURN_FALSE(BaseClass::performUnwarppingImplROPermuted(workOrder3DT, kspace, *kerIm, coilMap, kspaceLinear));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        }
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceLinear, "kspaceLinear");
+
+        if ( workOrder3DT->spirit_perform_nonlinear_ )
+        {
+            if ( workOrder3DT->spirit_3D_scale_per_chunk_ )
+            {
+                typename realType<T>::Type scaleFactor = 1.0;
+                Gadgetron::norm2(kspace, scaleFactor);
+                scaleFactor /= (RO*std::sqrt(double(srcCHA)));
+                workOrder3DT->spirit_ncg_scale_factor_ = scaleFactor;
+            }
+
+            // apply the scale
+            Gadgetron::scal(T(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspaceLinear);
+            Gadgetron::scal(T(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspace);
+
+            boost::shared_ptr< hoNDArray<T> > coilMapN;
+            if ( workOrder3DT->coilMap_ 
+                && workOrder3DT->coilMap_->get_size(0)==E1 
+                && workOrder3DT->coilMap_->get_size(1)==E2 
+                && workOrder3DT->coilMap_->get_size(2)==dstCHA 
+                && workOrder3DT->coilMap_->get_size(3)==RO )
+            {
+                coilMapN = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>(E1, E2, dstCHA, RO, coilMap.begin()) );
+            }
+
+            if ( RO > 1 )
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, RO, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, RO, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DTOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setMemoryManager(gtPlus_mem_manager_);
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet3DOperator<T> wavNullSpace3DOperator;
+                    wavNullSpace3DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNullSpace3DOperator.scale_factor_first_dimension_ = workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_second_dimension_ = workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_third_dimension_ = workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace3DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3DT ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3DT_res");
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3DT_res_restored");
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DTOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setMemoryManager(gtPlus_mem_manager_);
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace3DOperator<T> wavNoNullSpace3DOperator;
+                    wavNoNullSpace3DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNoNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNoNullSpace3DOperator.scale_factor_first_dimension_ = workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_second_dimension_ = workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_third_dimension_ = workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace3DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, T(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3DT without null space ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3DT_res_noNullSpace");
+                }
+            }
+            else
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setMemoryManager(gtPlus_mem_manager_);
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet2DOperator<T> wavNullSpace2DOperator;
+                    wavNullSpace2DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace2DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3D ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3D_res");
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3D_res_restored");
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setMemoryManager(gtPlus_mem_manager_);
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace2DOperator<T> wavNoNullSpace2DOperator;
+                    wavNoNullSpace2DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNoNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace2DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, T(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3D without null space ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3D_res_noNullSpace");
+                }
+            }
+
+            Gadgetron::scal(T(workOrder3DT->spirit_ncg_scale_factor_), res);
+        }
+        else
+        {
+            res = kspaceLinear;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImplROPermuted(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Nonlinear (NCG) SPIRIT reconstruction of one chunk: kspace is [E1 E2 srcCHA RO], kernel is [E1 E2 srcCHA dstCHA RO N]
+/// (converted via spirit_.imageDomainKernelE1E2RO when its E1/E2 differ from kspace), kspaceLinear is handed to the solver as x0_.
+/// If spirit_perform_nonlinear_ is off, res is just a copy of kspaceLinear; otherwise kspace and kspaceLinear are scaled in
+/// place by 1/spirit_ncg_scale_factor_ and res receives the rescaled solver output. Returns false on any failure/exception.
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& kernel, hoNDArray<T>& coilMap, hoNDArray<T>& kspaceLinear, hoNDArray<T>& res)
+{
+    try
+    {
+        size_t E1 = kspace.get_size(0);
+        size_t E2 = kspace.get_size(1);
+        size_t RO = kspace.get_size(3);
+
+        size_t kerE1 = kernel.get_size(0);
+        size_t kerE2 = kernel.get_size(1);
+        size_t srcCHA = kernel.get_size(2);
+        size_t dstCHA = kernel.get_size(3);
+        size_t kerN = kernel.get_size(5);
+        // the supplied kernel is used as is unless its E1/E2 size differs from the k-space
+        hoNDArray<T>* kerIm = &kernel;
+        hoNDArray<T> kerImE1E2RO;
+        if ( kerE1!=E1 || kerE2!=E2 )
+        {
+            GADGET_MSG("gtPlusReconWorker3DTL1SPIRITNCG, kerE1!=E1 || kerE2!=E2, kernel needs to be converted along E1 and E2 ... ");
+
+            if ( gtPlus_mem_manager_ )
+            {
+                // the managed buffer must cover all six dimensions, kerN included; NOTE(review): it is not cleared here, unlike the branch below - confirm the manager hands out zeroed memory
+                kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN, (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*(size_t)RO*E1*E2*srcCHA*dstCHA*kerN)));
+            }
+            else
+            {
+                kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN);
+                Gadgetron::clear(kerImE1E2RO);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelE1E2RO(kernel, E1, E2, kerImE1E2RO));
+            kerIm = &kerImE1E2RO;
+        }
+
+        res = kspace;
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace, "kspace");
+        // the linear result is supplied by the caller through kspaceLinear; spirit_perform_linear_ is not evaluated here
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceLinear, "kspaceLinear");
+
+        if ( workOrder3DT->spirit_perform_nonlinear_ )
+        {
+            if ( workOrder3DT->spirit_3D_scale_per_chunk_ )
+            {
+                typename realType<T>::Type scaleFactor = 1.0;
+                Gadgetron::norm2(kspace, scaleFactor);
+                scaleFactor /= (RO*std::sqrt(double(srcCHA))); // per-chunk scale: norm2(kspace) / (RO*sqrt(srcCHA))
+
+                workOrder3DT->spirit_ncg_scale_factor_ = scaleFactor;
+            }
+
+            // apply the scale
+            Gadgetron::scal(T(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspaceLinear);
+            Gadgetron::scal(T(1.0/workOrder3DT->spirit_ncg_scale_factor_), kspace);
+            // the caller's coil map is wrapped as [E1 E2 dstCHA RO] only when the work order's map has exactly that shape
+            boost::shared_ptr< hoNDArray<T> > coilMapN;
+            if ( workOrder3DT->coilMap_ 
+                && workOrder3DT->coilMap_->get_size(0)==E1 
+                && workOrder3DT->coilMap_->get_size(1)==E2 
+                && workOrder3DT->coilMap_->get_size(2)==dstCHA 
+                && workOrder3DT->coilMap_->get_size(3)==RO )
+            {
+                coilMapN = boost::shared_ptr< hoNDArray<T> >( new hoNDArray<T>(E1, E2, dstCHA, RO, coilMap.begin()) );
+            }
+            // RO > 1: the chunk is solved with the 2DT SPIRIT / 3D wavelet operators; a single RO uses the 2D operators
+            if ( RO > 1 )
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, RO, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, RO, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+                // no data fidelity weight: solve with the SPIRIT + wavelet terms only, then call restoreAcquiredKSpace on the result
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DTOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setMemoryManager(gtPlus_mem_manager_);
+
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet3DOperator<T> wavNullSpace3DOperator;
+                    wavNullSpace3DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNullSpace3DOperator.scale_factor_first_dimension_ = workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_second_dimension_ = workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNullSpace3DOperator.scale_factor_third_dimension_ = workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace3DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3DT ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3DT_res");
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3DT_res_restored");
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DTOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setMemoryManager(gtPlus_mem_manager_);
+
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace3DOperator<T> wavNoNullSpace3DOperator;
+                    wavNoNullSpace3DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNoNullSpace3DOperator.setAcquiredPoints(acq);
+
+                    wavNoNullSpace3DOperator.scale_factor_first_dimension_ = workOrder3DT->spirit_E1_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_second_dimension_ = workOrder3DT->spirit_E2_enhancement_ratio_;
+                    wavNoNullSpace3DOperator.scale_factor_third_dimension_ = workOrder3DT->spirit_RO_enhancement_ratio_;
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace3DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace3DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, T(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3DT without null space ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3DT_res_noNullSpace");
+                }
+            }
+            else
+            {
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, kerIm->begin()));
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspace.begin()));
+
+                gtPlusNCGSolver<hoNDArray<T>, hoNDArray<T>, gtPlusOperator<T> > ncgsolver;
+                ncgsolver.iterMax_ = workOrder3DT->spirit_ncg_iter_max_;
+                ncgsolver.printIter_ = workOrder3DT->spirit_ncg_print_iter_;
+                ncgsolver.secantRatio_ = 2;
+                ncgsolver.x0_ = &kspaceLinear;
+
+                hoNDArray<T> b;
+
+                if ( workOrder3DT->spirit_data_fidelity_lamda_ <= 0 )
+                {
+                    // parallel imaging term
+                    gtPlusSPIRIT2DOperator<T> spirit;
+                    spirit.use_symmetric_spirit_ = false;
+                    spirit.setMemoryManager(gtPlus_mem_manager_);
+                    spirit.setForwardKernel(ker, true);
+                    spirit.setAcquiredPoints(acq);
+
+                    // L1 term
+                    gtPlusWavelet2DOperator<T> wavNullSpace2DOperator;
+                    wavNullSpace2DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    // set operators
+                    ncgsolver.add(spirit, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNullSpace2DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3D ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3D_res");
+
+                    spirit.restoreAcquiredKSpace(kspace, res);
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3D_res_restored");
+                }
+                else
+                {
+                    gtPlusSPIRITNoNullSpace2DOperator<T> spirit_noNullSpace;
+                    spirit_noNullSpace.use_symmetric_spirit_ = false;
+                    spirit_noNullSpace.setMemoryManager(gtPlus_mem_manager_);
+                    spirit_noNullSpace.setForwardKernel(ker, true);
+                    spirit_noNullSpace.setAcquiredPoints(acq);
+
+                    gtPlusDataFidelityOperator<T> dataOper;
+                    dataOper.setAcquiredPoints(acq);
+
+                    gtPlusWaveletNoNullSpace2DOperator<T> wavNoNullSpace2DOperator;
+                    wavNoNullSpace2DOperator.setMemoryManager(gtPlus_mem_manager_);
+                    wavNoNullSpace2DOperator.setAcquiredPoints(acq);
+
+                    if ( workOrder3DT->spirit_use_coil_sen_map_ && coilMapN )
+                    {
+                        wavNoNullSpace2DOperator.setCoilSenMap(coilMapN);
+                    }
+
+                    ncgsolver.add(spirit_noNullSpace, T(workOrder3DT->spirit_parallel_imaging_lamda_) );
+                    ncgsolver.add(wavNoNullSpace2DOperator, T(workOrder3DT->spirit_image_reg_lamda_) );
+                    ncgsolver.add(dataOper, T(workOrder3DT->spirit_data_fidelity_lamda_) );
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("NCG spirit solver for 3D without null space ... "));
+                    ncgsolver.solve(b, res);
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "ncg_spirit_3D_res_noNullSpace");
+                }
+            }
+            // undo the 1/scale applied to the solver inputs above (kspace and kspaceLinear themselves stay scaled)
+            Gadgetron::scal(T(workOrder3DT->spirit_ncg_scale_factor_), res);
+        }
+        else
+        {
+            res = kspaceLinear;
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImplROPermuted(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Job entry point: seeds job.res with job.kspace and runs performUnwarppingImplROPermuted on the job's arrays; false on failure.
+template <typename T> 
+bool gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        hoNDArray<T>& kspace = job.kspace; // [E1 E2 srcCHA RO 1]
+        hoNDArray<T>& ker = job.ker; // [E1 E2 srcCHA dstCHA RO 1]
+        hoNDArray<T>& res = job.res; // [E1 E2 dstCHA RO 1]
+        gtPlusReconWorkOrder<T>* workOrder3DT = &(job.workOrder2DT);
+        hoNDArray<T> noCoilMap; // empty stand-in handed over when the job carries no coil map
+        hoNDArray<T>& coilMap = job.workOrder2DT.coilMap_ ? *job.workOrder2DT.coilMap_ : noCoilMap; // coilMap_ may be unset; never dereference it blindly
+        job.res = job.kspace;
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImplROPermuted(workOrder3DT, kspace, ker, coilMap, res));
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTL1SPIRITNCG<T>::performUnwarppingImpl(job) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
new file mode 100644
index 0000000..64dc4dc
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTNoAcceleration.h
@@ -0,0 +1,157 @@
+/** \file   gtPlusISMRMRDReconWorker3DTNoAcceleration.h
+    \brief  Implement the 3DT reconstruction without the k-space undersampling
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+
+#include "GadgetronTimer.h"
+
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker3DT.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// 3DT reconstruction worker for data acquired without k-space undersampling; the work is done in performRecon().
+template <typename T> 
+class gtPlusReconWorker3DTNoAcceleration : public gtPlusReconWorker3DT<T>
+{
+public:
+    typedef gtPlusReconWorker3DT<T> BaseClass;
+
+    gtPlusReconWorker3DTNoAcceleration() : BaseClass() {}
+    virtual ~gtPlusReconWorker3DTNoAcceleration() {}
+    // reconstruction entry point, defined after the class
+    virtual bool performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT);
+    // this worker answers false for every work order
+    virtual bool computeKSpace(gtPlusReconWorkOrder3DT<T>* /*workOrder3DT*/) { return false; }
+    // timing, debugging and helper members taken over from the base class
+    using BaseClass::performTiming_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+    // reference/data buffers and E1/E2 range members, also from the base class
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+    using BaseClass::startE2_;
+    using BaseClass::endE2_;
+};
+
+/// Reconstruct fully sampled 3DT data: (re)estimates the coil maps from the reference unless a buffered map of matching
+/// RO/E1/E2 size is reused, runs partial Fourier handling, then transforms data_ with ifft3c and coil-combines it into
+/// complexIm_ [RO E1 E2 N]. Returns false when workOrder3DT is NULL, a checked step fails, or an exception is thrown.
+template <typename T> 
+bool gtPlusReconWorker3DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        if ( !workOrder3DT->workFlow_use_BufferedKernel_ )
+        {
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("prepRef"));
+            GADGET_CHECK_RETURN_FALSE(prepRef(workOrder3DT, workOrder3DT->ref_, 
+                                            workOrder3DT->ref_recon_, 
+                                            workOrder3DT->ref_coil_map_, 
+                                            workOrder3DT->start_RO_, workOrder3DT->end_RO_, 
+                                            workOrder3DT->start_E1_, workOrder3DT->end_E1_, 
+                                            workOrder3DT->start_E2_, workOrder3DT->end_E2_, 
+                                            workOrder3DT->data_.get_size(1), workOrder3DT->data_.get_size(2)));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+        }
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t CHA = workOrder3DT->data_.get_size(3);
+        size_t N = workOrder3DT->data_.get_size(4);
+        size_t refN = workOrder3DT->ref_recon_.get_size(4);
+        size_t usedN;
+
+        // estimate the coil sensitivity, unless a buffered map with matching RO/E1/E2 size is already available
+        if ( !workOrder3DT->workFlow_use_BufferedKernel_ 
+                    || (workOrder3DT->coilMap_->get_size(0)!=RO) 
+                    || (workOrder3DT->coilMap_->get_size(1)!=E1)
+                    || (workOrder3DT->coilMap_->get_size(2)!=E2) )
+        {
+            workOrder3DT->coilMap_->create(RO, E1, E2, CHA, refN);
+            // same combination coefficients for all N: estimate the map for one repetition only (index clamped to refN-1)
+            if ( workOrder3DT->no_acceleration_same_combinationcoeff_allN_ )
+            {
+                usedN = workOrder3DT->no_acceleration_whichN_combinationcoeff_;
+                if ( usedN >= refN ) usedN = refN-1;
+
+                hoNDArray<T> refCoilMapN(RO, E1, E2, CHA, workOrder3DT->ref_coil_map_.begin()+usedN*RO*E1*E2*CHA);
+
+                hoNDArrayMemoryManaged<T> buffer3DT(refCoilMapN.get_dimensions(), gtPlus_mem_manager_);
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(refCoilMapN, buffer3DT));
+
+                hoNDArray<T> coilMapN(RO, E1, E2, CHA, workOrder3DT->coilMap_->begin()+usedN*RO*E1*E2*CHA);
+
+                if ( workOrder3DT->csm_use_gpu_ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(buffer3DT, 
+                            coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                            coilMapN, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                }
+                GADGET_CHECK_RETURN_FALSE(repmatLastDimension(*workOrder3DT->coilMap_, usedN));
+            }
+            else
+            {
+                hoNDArrayMemoryManaged<T> buffer3DT(workOrder3DT->ref_coil_map_.get_dimensions(), gtPlus_mem_manager_);
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->ref_coil_map_, buffer3DT));
+
+                if ( workOrder3DT->csm_use_gpu_ )
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIHGPU_FullResMap(buffer3DT, 
+                            *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                }
+                else
+                {
+                    GADGET_CHECK_RETURN_FALSE(gtPlusISMRMRDReconUtilComplex<T>().coilMap3DNIH(buffer3DT, 
+                            *workOrder3DT->coilMap_, workOrder3DT->coil_map_algorithm_, workOrder3DT->csm_kSize_, workOrder3DT->csm_powermethod_num_, workOrder3DT->csm_iter_num_, workOrder3DT->csm_iter_thres_, workOrder3DT->csm_true_3D_));
+                }
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->coilMap_, "coilMap_");
+        }
+
+        // partial fourier handling
+        GADGET_CHECK_RETURN_FALSE(this->performPartialFourierHandling(workOrder3DT));
+
+        workOrder3DT->complexIm_.create(RO, E1, E2, N);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.start("perform coil combination"));
+
+        hoNDArrayMemoryManaged<T> buffer3DT(workOrder3DT->data_.get_dimensions(), gtPlus_mem_manager_);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->data_, buffer3DT)); // a failed transform must not be combined
+        gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(buffer3DT, *workOrder3DT->coilMap_, workOrder3DT->complexIm_); // NOTE(review): result unchecked - confirm coilCombine3D's return type before wrapping it
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer1_.stop());
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "combined");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTNoAcceleration<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+}}
diff --git a/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h
new file mode 100644
index 0000000..3f77bfa
--- /dev/null
+++ b/toolboxes/gtplus/workflow/gtPlusISMRMRDReconWorker3DTSPIRIT.h
@@ -0,0 +1,1028 @@
+/** \file   gtPlusISMRMRDReconWorker3DTSPIRIT.h
+    \brief  Implement the 3DT linear SPIRIT reconstruction
+    \author Hui Xue
+*/
+
+#pragma once
+
+#include "ismrmrd.h"
+#include "GadgetronTimer.h"
+#include "gtPlusISMRMRDReconUtil.h"
+#include "gtPlusISMRMRDReconWorker3DT.h"
+#include "gtPlusSPIRIT.h"
+#include "gtPlusSPIRIT2DTOperator.h"
+#include "gtPlusLSQRSolver.h"
+
+#include "GadgetCloudController.h"
+#include "GadgetCloudJobMessageReadWrite.h"
+
+namespace Gadgetron { namespace gtPlus {
+
+/// 3DT reconstruction worker implementing the linear SPIRIT reconstruction.
+template <typename T> 
+class gtPlusReconWorker3DTSPIRIT : public gtPlusReconWorker3DT<T>
+{
+public:
+    typedef gtPlusReconWorker3DT<T> BaseClass;
+    typedef gtPlusReconWorkOrder3DT<T> WorkOrderType;
+
+    gtPlusReconWorker3DTSPIRIT() : BaseClass(), spirit_kernelIm_permuted_(false) {}
+    virtual ~gtPlusReconWorker3DTSPIRIT() {}
+
+    virtual bool performRecon(WorkOrderType* workOrder3DT);
+    // calibration: buffer preparation, then the per-repetition (usedN) kernel estimation
+    virtual bool performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT);
+    virtual bool performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN);
+    // unwrapping of the whole work order, of one permuted chunk, of one repetition n and of a job
+    virtual bool performUnwrapping(WorkOrderType* workOrder3DT, const hoNDArray<T>& data);
+    virtual bool performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, hoNDArray<T>& coilMap, hoNDArray<T>& res);
+    virtual bool performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n);
+    virtual bool performUnwarppingImpl(gtPlusReconJob2DT<T>& job);
+
+    virtual bool computeKSpace(WorkOrderType* workOrder3DT);
+    // fills the SPIRIT iteration parameters of the work order from its acceleration factors
+    virtual bool autoReconParameter(gtPlusReconWorkOrder<T>* workOrder);
+    // reports whether the reconstruction is split into jobs; jobN is an output
+    virtual bool splitJob(WorkOrderType* workOrder3DT, size_t& jobN);
+    // timing, debugging and helper members taken over from the base class
+    using BaseClass::performTiming_;
+    using BaseClass::gt_timer1_;
+    using BaseClass::gt_timer2_;
+    using BaseClass::gt_timer3_;
+    using BaseClass::debugFolder_;
+    using BaseClass::gt_exporter_;
+    using BaseClass::gtPlus_util_;
+    using BaseClass::gtPlus_mem_manager_;
+
+    // NOTE: the 'protected' marker that stood here is disabled, so everything below is public as well
+
+    using BaseClass::ref_src_;
+    using BaseClass::ref_dst_;
+    using BaseClass::ref_coil_map_dst_;
+    using BaseClass::data_dst_;
+    using BaseClass::startE1_;
+    using BaseClass::endE1_;
+    // SPIRIT helper used for calibration and kernel conversion
+    gtPlusSPIRIT<T> spirit_;
+    // initialised to false by the constructor
+    bool spirit_kernelIm_permuted_;
+};
+
+/// Always reports true; the work order itself is not inspected.
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::computeKSpace(gtPlusReconWorkOrder3DT<T>* /*workOrder3DT*/)
+{
+    return true;
+}
+
+/// Choose the SPIRIT solver settings from the total acceleration acceFactorE1_*acceFactorE2_:
+///   spirit_iter_max_   : 150 (>=16), 100 (>=6), 70 (>=4), 50 otherwise
+///   spirit_iter_thres_ : 0.0025 (>=6), 0.005 otherwise
+///   spirit_reg_lamda_  : 0.01 in every case
+/// Returns false if workOrder is not a gtPlusReconWorkOrder3DT<T>, true once the settings were written.
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::autoReconParameter(gtPlusReconWorkOrder<T>* workOrder)
+{
+    // the fields written below are accessed through the 3DT work order type
+    gtPlusReconWorkOrder3DT<T>* workOrder3DT = dynamic_cast<gtPlusReconWorkOrder3DT<T>*>(workOrder);
+    if ( workOrder3DT == NULL ) return false;
+
+    const double acceFactor = workOrder3DT->acceFactorE1_ * workOrder3DT->acceFactorE2_;
+
+    // iteration budget, growing with the acceleration
+    if ( acceFactor>=16 )
+    {
+        workOrder3DT->spirit_iter_max_ = 150;
+    }
+    else if ( acceFactor>=6 )
+    {
+        workOrder3DT->spirit_iter_max_ = 100;
+    }
+    else if ( acceFactor>=4 )
+    {
+        workOrder3DT->spirit_iter_max_ = 70;
+    }
+    else
+    {
+        workOrder3DT->spirit_iter_max_ = 50;
+    }
+
+    // stopping threshold, smaller from an acceleration of 6 upwards
+    if ( acceFactor>=6 )
+    {
+        workOrder3DT->spirit_iter_thres_ = 0.0025;
+    }
+    else
+    {
+        workOrder3DT->spirit_iter_thres_ = 0.005;
+    }
+
+    workOrder3DT->spirit_reg_lamda_ = 0.01;
+
+    // lowest tier with ISMRMRD_embedded: separate assignment kept, currently the same value as above
+    if ( !(acceFactor>=4) && workOrder3DT->recon_algorithm_ == ISMRMRD_embedded )
+    {
+        workOrder3DT->spirit_iter_thres_ = 0.005;
+    }
+
+    return true;
+}
+
+/// Prepare the SPIRIT calibration for a work order.
+///  - forwards the worker's timing/debug settings to spirit_
+///  - sizes workOrder3DT->kernel_ as [kRO kE1 kE2 srcCHA dstCHA 1 1 1 refN]
+///  - sizes workOrder3DT->kernelIm_ as [E1 E2 RO srcCHA dstCHA refN] when jobs are not split, or as
+///    [2*kE1-1 2*kE2-1 RO srcCHA dstCHA refN] when they are
+/// Always returns true.
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performCalibPrep(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT)
+{
+    // the SPIRIT helper follows the worker's timing and debug settings
+    spirit_.performTiming_ = performTiming_;
+    spirit_.debugFolder_ = debugFolder_;
+
+    // size of the data to be reconstructed
+    const size_t RO = workOrder3DT->data_.get_size(0);
+    const size_t E1 = workOrder3DT->data_.get_size(1);
+    const size_t E2 = workOrder3DT->data_.get_size(2);
+
+    // channel and repetition counts of the reference data
+    const size_t srcCHA = ref_src.get_size(3);
+    const size_t dstCHA = ref_dst.get_size(3);
+    const size_t refN = ref_dst.get_size(4);
+
+    // the returned flag is not needed here; the call itself is kept
+    this->computeKSpace(workOrder3DT);
+
+    // kernel size requested by the work order
+    const size_t kRO = workOrder3DT->spirit_kSize_RO_;
+    const size_t kE1 = workOrder3DT->spirit_kSize_E1_;
+    const size_t kE2 = workOrder3DT->spirit_kSize_E2_;
+
+    // k-space kernel storage, one kernel per reference repetition
+    workOrder3DT->kernel_->create(kRO, kE1, kE2, srcCHA, dstCHA, 1, 1, 1, refN);
+
+    // jobN is filled by splitJob but not used further in this function
+    size_t jobN;
+    const bool splitJobs = this->splitJob(workOrder3DT, jobN);
+
+    // with job splitting the first two kernel dimensions are 2*k-1 instead of the full E1/E2
+    size_t kerImE1 = E1;
+    size_t kerImE2 = E2;
+    const char* timerMsg = "allocate image domain kernel ... ";
+    if ( splitJobs )
+    {
+        kerImE1 = 2*kE1-1;
+        kerImE2 = 2*kE2-1;
+        timerMsg = "allocate image domain kernel only along RO ... ";
+    }
+
+    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start(timerMsg));
+    if ( gtPlus_mem_manager_ )
+    {
+        // managed memory is requested only when the current kernelIm_ holds a different number of elements
+        const size_t numOfElements = (size_t)RO*kerImE1*kerImE2*srcCHA*dstCHA*refN;
+        if ( workOrder3DT->kernelIm_->get_number_of_elements() != numOfElements )
+        {
+            T* managedBuf = (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*numOfElements));
+            workOrder3DT->kernelIm_->create(kerImE1, kerImE2, RO, srcCHA, dstCHA, refN, managedBuf);
+        }
+    }
+    else
+    {
+        workOrder3DT->kernelIm_->create(kerImE1, kerImE2, RO, srcCHA, dstCHA, refN);
+        // pre-set to zero is needed here
+        memset(workOrder3DT->kernelIm_->begin(), 0, workOrder3DT->kernelIm_->get_number_of_bytes());
+    }
+    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+    return true;
+}
+
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performCalibImpl(const hoNDArray<T>& ref_src, const hoNDArray<T>& ref_dst, WorkOrderType* workOrder3DT, size_t usedN)
+{ // Calibrates the SPIRIT kernel for repetition usedN from ref_src/ref_dst and converts it to the image domain; false if a checked call fails.
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t N = workOrder3DT->data_.get_size(4);
+    size_t srcCHA = ref_src.get_size(3);
+
+    size_t refRO = ref_dst.get_size(0);
+    size_t refE1 = ref_dst.get_size(1);
+    size_t refE2 = ref_dst.get_size(2);
+    size_t refN = ref_dst.get_size(4);
+    size_t dstCHA = ref_dst.get_size(3);
+
+    bool reconKSpace = this->computeKSpace(workOrder3DT);
+    // NOTE(review): N, refN and reconKSpace are not read again anywhere below in this function
+    size_t kRO = workOrder3DT->spirit_kSize_RO_;
+    size_t kE1 = workOrder3DT->spirit_kSize_E1_;
+    size_t kE2 = workOrder3DT->spirit_kSize_E2_;
+    // arrays built on the existing reference buffers, starting at the usedN-th [refRO refE1 refE2 CHA] block
+    ho4DArray<T> acsSrc(refRO, refE1, refE2, srcCHA, const_cast<T*>(ref_src.begin()+usedN*refRO*refE1*refE2*srcCHA));
+    ho4DArray<T> acsDst(refRO, refE1, refE2, dstCHA, const_cast<T*>(ref_dst.begin()+usedN*refRO*refE1*refE2*dstCHA));
+
+    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsSrc, "acsSrc");
+    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, acsDst, "acsDst");
+    // kernel of this repetition, placed inside workOrder3DT->kernel_ at the usedN-th offset
+    hoNDArray<T> ker(kRO, kE1, kE2, srcCHA, dstCHA, 1, 1, 1, workOrder3DT->kernel_->begin()+usedN*kRO*kE1*kE2*srcCHA*dstCHA);
+
+    spirit_.calib_use_gpu_ = workOrder3DT->spirit_use_gpu_;
+
+    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 3D calibration ... "));
+    GADGET_CHECK_RETURN_FALSE(spirit_.calib3D(acsSrc, acsDst, workOrder3DT->spirit_reg_lamda_, workOrder3DT->spirit_calib_over_determine_ratio_, kRO, kE1, kE2, 1, 1, 1, ker));
+    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ker, "ker");
+    // minusI is handed to both image-domain kernel routines below; presumably it requests G-I instead of G - TODO confirm
+    bool minusI = true;
+
+    size_t jobN;
+    bool splitJobs = this->splitJob(workOrder3DT, jobN);
+    // no job splitting: kernel image on the full [E1 E2 RO] grid; with splitting: [2*kE1-1 2*kE2-1 RO], converted along RO only
+    if ( !splitJobs )
+    {
+        hoNDArray<T> kIm(E1, E2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*E1*E2*RO*srcCHA*dstCHA);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 3D image domain kernel ... "));
+        GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernel3D(ker, kRO, kE1, kE2, 1, 1, 1, RO, E1, E2, kIm, minusI));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        if ( !debugFolder_.empty() )
+        {
+            hoNDArray<T> kImACha(E1, E2, RO, srcCHA, kIm.begin()); // first [E1 E2 RO srcCHA] block of kIm, exported for debugging
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImACha, "kImACha");
+        }
+    }
+    else
+    {
+        size_t convKE1 = 2*kE1-1;
+        size_t convKE2 = 2*kE2-1;
+
+        hoNDArray<T> kIm(convKE1, convKE2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+usedN*convKE1*convKE2*RO*srcCHA*dstCHA);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 3D image domain kernel only along RO ... "));
+        GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelRO3D(ker, kRO, kE1, kE2, 1, 1, 1, RO, kIm, minusI));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        if ( !debugFolder_.empty() )
+        {
+            hoNDArray<T> kImROACha(convKE1, convKE2, RO, srcCHA, kIm.begin()); // first [convKE1 convKE2 RO srcCHA] block of kIm, exported for debugging
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kImROACha, "kImROACha");
+        }
+    }
+
+    return true;
+}
+
+/// Decide whether the unwrapping is split into jobs. jobN (output) is set to workOrder3DT->job_num_of_N_; the memory-based
+/// estimate below is only logged and never written to jobN. Splitting is on when job_split_by_S_ is set or jobN>0; otherwise,
+/// with job_max_Megabytes_>0, it is on if fewer than RO kernel slices fit into that budget. jobN>=RO always disables it.
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::splitJob(gtPlusReconWorkOrder3DT<T>* workOrder3DT, size_t& jobN)
+{
+    size_t RO = workOrder3DT->data_.get_size(0);
+    size_t E1 = workOrder3DT->data_.get_size(1);
+    size_t E2 = workOrder3DT->data_.get_size(2);
+    size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+    size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+    jobN = workOrder3DT->job_num_of_N_;
+    size_t jobMegaBytes = workOrder3DT->job_max_Megabytes_;
+    bool splitJobs = (workOrder3DT->job_split_by_S_==true || jobN>0);
+    if ( !splitJobs && jobMegaBytes>0 )
+    {
+        // slices of 1 MiB and more are sized in whole MiB as before; smaller ones used to divide by zero and are sized in bytes
+        size_t bytesPerRO = E1*E2*srcCHA*dstCHA*sizeof(T);
+        size_t megaBytesPerRO = bytesPerRO/1024/1024;
+        size_t maxJobN = RO; // empty slice: nothing to split
+        if ( megaBytesPerRO > 0 ) maxJobN = jobMegaBytes/megaBytesPerRO;
+        else if ( bytesPerRO > 0 ) maxJobN = jobMegaBytes*1024*1024/bytesPerRO;
+        if ( maxJobN < RO ) splitJobs = true;
+        GADGET_MSG("SPIRIT - 3DT - size of largest job : " << maxJobN);
+    }
+    if ( jobN >= RO ) splitJobs = false;
+    return splitJobs;
+}
+
+/// Unwrap the undersampled 3D k-space of a work order with SPIRIT.
+///
+/// The reconstructed k-space is written to workOrder3DT->fullkspace_. When the coil map
+/// has the size [RO E1 E2 dstCHA], the coil combined images are additionally written to
+/// workOrder3DT->complexIm_.
+///
+/// Two strategies exist, selected by splitJob():
+///  - split: the data is transformed along RO, RO is permuted to the 4th dimension and
+///    the recon is cut into jobs which run on the cloud and/or on the local computer;
+///  - no split: every n in [0 N) is reconstructed with the 3D implementation.
+///
+/// @param workOrder3DT work order holding the image domain kernel, the coil map and the job settings
+/// @param data_dst     undersampled k-space, indexed here as [RO E1 E2 srcCHA N]
+/// @return true on success; false if a step fails or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data_dst)
+{
+    try
+    {
+        int n;
+
+        size_t RO = workOrder3DT->data_.get_size(0);
+        size_t E1 = workOrder3DT->data_.get_size(1);
+        size_t E2 = workOrder3DT->data_.get_size(2);
+        size_t N = workOrder3DT->data_.get_size(4);
+
+        size_t kImE1 = workOrder3DT->kernelIm_->get_size(0);
+        size_t kImE2 = workOrder3DT->kernelIm_->get_size(1);
+        size_t kImRO = workOrder3DT->kernelIm_->get_size(2);
+        size_t srcCHA = workOrder3DT->kernelIm_->get_size(3);
+        size_t dstCHA = workOrder3DT->kernelIm_->get_size(4);
+
+        size_t refN = workOrder3DT->kernelIm_->get_size(5);
+
+        workOrder3DT->complexIm_.create(RO, E1, E2, N);
+
+        // downstream coil compression is not supported here
+        // kspace is always reconed
+        workOrder3DT->fullkspace_ = data_dst;
+
+        // compute the scaling factor from the first [RO E1 E2 srcCHA] block of the data
+        typename realType<T>::Type scaleFactor = 1.0;
+        hoNDArray<T> kspaceForScaleFactor(RO, E1, E2, srcCHA, const_cast<T*>(data_dst.begin()));
+        Gadgetron::norm2(kspaceForScaleFactor, scaleFactor);
+        scaleFactor /= (RO*std::sqrt(double(srcCHA)));
+
+        workOrder3DT->spirit_ncg_scale_factor_ = scaleFactor;
+
+        // split the jobs
+        size_t jobMegaBytes = workOrder3DT->job_max_Megabytes_;
+        size_t jobN = workOrder3DT->job_num_of_N_;
+        bool splitJobs = this->splitJob(workOrder3DT, jobN);
+        size_t maxNumOfBytesPerJob = jobMegaBytes*1024*1024;
+
+        // the linear SPIRIT jobs do not overlap
+        size_t overlapN = workOrder3DT->job_overlap_;
+        if ( workOrder3DT->recon_algorithm_==ISMRMRD_SPIRIT )
+        {
+            overlapN = 0;
+        }
+
+        if ( splitJobs )
+        {
+            // hoNDArrayMemoryManaged<T> kspaceIfftRO(RO, E1, E2, srcCHA, N, gtPlus_mem_manager_);
+            hoNDArray<T> kspaceIfftRO(RO, E1, E2, srcCHA, N);
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(data_dst, kspaceIfftRO));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftRO, "kspaceIfftRO");
+
+            // hoNDArrayMemoryManaged<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO, N, gtPlus_mem_manager_);
+            hoNDArray<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO, N);
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permute kspace RO to 4th dimension ... "));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo4thDimensionFor3DRecon(kspaceIfftRO, kspaceIfftROPermuted));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftROPermuted, "kspaceIfftROPermuted");
+
+            // the kernel stored in the work order is permuted in place only once;
+            // spirit_kernelIm_permuted_ remembers that this has been done
+            // NOTE(review): once permuted, the kernelIm_ sizes read at the top of this
+            // function refer to the permuted layout - confirm how repeated calls are handled
+            hoNDArrayMemoryManaged<T> kerPermuted;
+            if ( !spirit_kernelIm_permuted_ )
+            {
+                spirit_kernelIm_permuted_ = true;
+
+                size_t kerN = kImE1*kImE2*srcCHA*dstCHA*kImRO*N;
+                size_t kerImSize = sizeof(T)*kerN;
+                GADGET_MSG("SPIRIT - 3DT - image domain kernel size : " << kerImSize/1024.0/1024 << " MBytes ... ");
+                size_t maxFreeChunk = gtPlus_mem_manager_->maxFreeMemoryChunkSize();
+                GADGET_MSG("SPIRIT - 3DT - maximal free chunk of managed memory : " << maxFreeChunk/1024.0/1024 << " MBytes ... ");
+
+                // use the managed memory if the kernel fits into its largest free chunk
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("allocate permuted kernel ... "));
+                if ( maxFreeChunk >= kerImSize )
+                {
+                    kerPermuted.setMemoryManager(gtPlus_mem_manager_);
+                    kerPermuted.create(kImE1, kImE2, srcCHA, dstCHA, kImRO, N);
+                }
+                else
+                {
+                    GADGET_MSG("use unmanaged memory ... ");
+                    T* pData = new T[kerN];
+                    kerPermuted.create(kImE1, kImE2, srcCHA, dstCHA, kImRO, N, pData, true);
+                }
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *workOrder3DT->kernelIm_, "kernelImBeforePermuted");
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permute kernel RO to 5th dimension ... "));
+                // GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo5thDimensionFor3DRecon( *workOrder3DT->kernelIm_, kerPermuted));
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteE2To5thDimension( *workOrder3DT->kernelIm_, kerPermuted));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kerPermuted, "kerPermuted");
+
+                // store the permuted kernel back into the work order and release the temporary copy
+                workOrder3DT->kernelIm_->reshape(kerPermuted.get_dimensions());
+                *workOrder3DT->kernelIm_ = kerPermuted;
+
+                kerPermuted.clear();
+
+                // kerPermuted now only wraps the memory of the work order kernel
+                kerPermuted.create(kImE1, kImE2, srcCHA, dstCHA, kImRO, N, workOrder3DT->kernelIm_->begin());
+            }
+            else
+            {
+                kerPermuted.create(E1, E2, srcCHA, dstCHA, RO, N, workOrder3DT->kernelIm_->begin());
+            }
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kerPermuted, "kerPermuted_Used");
+
+            gtPlusReconWorkOrder3DT<T> workOrder3DTJobSplit;
+            workOrder3DT->duplicate(workOrder3DTJobSplit);
+
+            boost::shared_ptr< hoNDArray<T> > coilMapPermuted = boost::shared_ptr< hoNDArray<T> >(new hoNDArray<T>()) ;
+            if ( workOrder3DT->coilMap_->get_number_of_elements() > 0 )
+            {
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permute coil map RO to 4th dimension ... "));
+                GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo4thDimensionFor3DRecon(*workOrder3DT->coilMap_, *coilMapPermuted));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+                // export the permuted coil map itself (the k-space was exported here by mistake)
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *coilMapPermuted, "coilMapPermuted");
+
+                workOrder3DTJobSplit.coilMap_ = coilMapPermuted;
+            }
+
+            bool runJobsOnCloud = workOrder3DT->CloudComputing_;
+            unsigned int cloudSize = workOrder3DT->CloudSize_;
+            bool runJobsOnLocalNode = workOrder3DT->job_perform_on_control_node_;
+
+            std::vector<gtPlusReconJob2DT<T> > jobList;
+
+            if ( runJobsOnCloud )
+            {
+                unsigned int j;
+
+                GADGET_CHECK_RETURN_FALSE(this->estimateJobSize(workOrder3DT, maxNumOfBytesPerJob, overlapN, cloudSize, jobN));
+
+                //GADGET_MSG("SPIRIT - 3DT - cloudSize is " << cloudSize << " - RO is " << RO << " ... ");
+                //unsigned int nodeN = cloudSize;
+                //if ( runJobsOnLocalNode ) nodeN++;
+                //GADGET_MSG("SPIRIT - 3DT - runJobsOnLocalNode is " << runJobsOnLocalNode << " - nodeN is " << nodeN << " - overlapN is " << overlapN << " ... ");
+
+                //// adjust jobN according to cloud size
+                //jobN = std::ceil( (double)(RO+overlapN*(nodeN-1))/(double)nodeN );
+
+                //size_t numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobN + 2*E1*E2*srcCHA*jobN );
+
+                //while ( numOfBytesPerJob > 2.2*1024*1024*1024-64.0*1024*1024 )
+                //{
+                //    nodeN *= 2;
+                //    jobN = std::ceil( (double)(RO+overlapN*(nodeN-1))/(double)nodeN );
+                //    numOfBytesPerJob = sizeof(T)*( E1*E2*srcCHA*dstCHA*jobN + 2*E1*E2*srcCHA*jobN );
+                //}
+
+                //GADGET_MSG("SPIRIT - 3DT - every job will have " << numOfBytesPerJob/1024.0/1024 << " MBytes ... ");
+
+                // split the job
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(&workOrder3DTJobSplit, kspaceIfftROPermuted, kerPermuted, workOrder3DT->job_split_by_S_, jobN, jobMegaBytes, overlapN, jobList));
+
+                // the completed jobs carry the same work order and index range as the submitted ones
+                std::vector<gtPlusReconJob2DT<T> > completedJobList(jobList.size());
+
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    jobList[j].workOrder2DT.duplicate(completedJobList[j].workOrder2DT);
+                    completedJobList[j].job_index_startN_ = jobList[j].job_index_startN_;
+                    completedJobList[j].job_index_endN_ = jobList[j].job_index_endN_;
+                    completedJobList[j].job_index_S_ = jobList[j].job_index_S_;
+                }
+
+                GADGET_MSG("SPIRIT - 3DT - total job : " << jobList.size() << " - job N : " << jobN << " - cloud size : " << cloudSize);
+
+                // when the local node takes part, it keeps a 1/(cloudSize+1) share of the jobs
+                unsigned int numOfJobRunOnCloud = jobList.size() - jobList.size()/(cloudSize+1);
+                if ( !runJobsOnLocalNode ) numOfJobRunOnCloud = jobList.size();
+
+                typedef Gadgetron::GadgetCloudController< gtPlusReconJob2DT<T> > GTCloudControllerType;
+                GTCloudControllerType controller;
+
+                // every cloud failure below clears runJobsOnCloud, so the local path further down takes over
+                if (controller.open () == -1)
+                {
+                    GADGET_ERROR_MSG("Cloud controller cannot open the cloud ...");
+                    controller.handle_close (ACE_INVALID_HANDLE, 0);
+                    runJobsOnCloud = false;
+                }
+                else
+                {
+                    std::vector<gtPlusReconJob2DT<T>* > jobListCloud(numOfJobRunOnCloud);
+                    std::vector<gtPlusReconJob2DT<T>* > completedJobListCloud(numOfJobRunOnCloud);
+                    std::vector<int> node_ids(numOfJobRunOnCloud);
+
+                    GADGET_CHECK_RETURN_FALSE(this->scheduleJobForNodes(workOrder3DT, numOfJobRunOnCloud, node_ids));
+
+                    for ( j=0; j<numOfJobRunOnCloud; j++ )
+                    {
+                        // node_ids[j] = j%cloudSize;
+                        jobListCloud[j] = &jobList[j];
+                        completedJobListCloud[j] = &completedJobList[j];
+                        GADGET_MSG("--> job " << j << " runs on node " << node_ids[j] << " ... ");
+                    }
+
+                    // NOTE(review): the readers/writers are not deleted here; presumably the controller takes ownership - confirm
+                    std::vector<GadgetMessageReader*> readers(cloudSize, NULL);
+                    std::vector<GadgetMessageWriter*> writers(cloudSize, NULL);
+
+                    for ( j=0; j<cloudSize; j++ )
+                    {
+                        readers[j] = new GtPlusCloudJobMessageReaderCPFL();
+                        writers[j] = new GtPlusCloudJobMessageWriterCPFL();
+                    }
+
+                    if ( controller.createConnector(workOrder3DT->gt_cloud_, GADGET_MESSAGE_CLOUD_JOB, readers, GADGET_MESSAGE_CLOUD_JOB, writers) != 0 )
+                    {
+                        GADGET_ERROR_MSG("Cloud controller creates connectors failed ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else if ( controller.connectToCloud(workOrder3DT->gt_cloud_) != 0 )
+                    {
+                        GADGET_ERROR_MSG("Cloud controller cannot connect to the cloud ...");
+                        controller.handle_close (ACE_INVALID_HANDLE, 0);
+                        runJobsOnCloud = false;
+                    }
+                    else
+                    {
+                        if ( controller.runJobsOnCloud(jobListCloud, completedJobListCloud, node_ids) != 0 )
+                        {
+                            GADGET_ERROR_MSG("Cloud controller runs jobs on the cloud failed ...");
+                            controller.closeCloudNode();
+                            controller.handle_close (ACE_INVALID_HANDLE, 0);
+                            runJobsOnCloud = false;
+                        }
+                        else
+                        {
+                            controller.closeCloudNode();
+
+                            // run the left over jobs on the local computer
+                            for ( j=numOfJobRunOnCloud; j<jobList.size(); j++ )
+                            {
+                                GADGET_MSG("SPIRIT - 3DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 3DT ... "));
+                                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                std::ostringstream ostr;
+                                ostr << "job_fullkspace" << "_" << j;
+                                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, jobList[j].res, ostr.str());
+                            }
+
+                            // wait the cloud job to complete
+                            controller.waitForJobToComplete();
+
+                            // combine results from cloud and local run
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                jobList[j].res = controller.completed_job_list_[j]->res;
+                                jobList[j].complexIm = controller.completed_job_list_[j]->complexIm;
+                            }
+
+                            // if some jobs are not actually completed, process them
+                            // (a job counts as uncompleted when neither res nor complexIm has the expected size)
+                            for ( j=0; j<numOfJobRunOnCloud; j++ )
+                            {
+                                if ( 
+                                    !jobList[j].res.dimensions_equal(&jobList[j].kspace) 
+                                        && 
+                                    ( jobList[j].complexIm.get_size(0)!= jobList[j].kspace.get_size(0) 
+                                    || jobList[j].complexIm.get_size(1)!= jobList[j].kspace.get_size(1) 
+                                    || jobList[j].complexIm.get_size(2)!= jobList[j].kspace.get_size(2) ) 
+                                   )
+                                {
+                                    GADGET_MSG("SPIRIT - 3DT - uncompleted cloud job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 3DT ... "));
+                                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                                    std::ostringstream ostr;
+                                    ostr << "job_fullkspace" << "_" << j;
+                                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, jobList[j].res, ostr.str());
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            // local processing: either requested or the fall back after a cloud failure
+            if ( !runJobsOnCloud )
+            {
+                // split the job
+                GADGET_CHECK_RETURN_FALSE(this->splitReconJob(&workOrder3DTJobSplit, kspaceIfftROPermuted, kerPermuted, workOrder3DT->job_split_by_S_, jobN, jobMegaBytes, overlapN, jobList));
+
+                GADGET_MSG("SPIRIT - 3DT - total job : " << jobList.size());
+
+                size_t j;
+                for ( j=0; j<jobList.size(); j++ )
+                {
+                    GADGET_MSG("SPIRIT - 3DT - job : " << j << " - size :" << jobList[j].job_index_endN_-jobList[j].job_index_startN_+1);
+
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("SPIRIT 3DT ... "));
+                    GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(jobList[j]));
+                    GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                    std::ostringstream ostr;
+                    ostr << "job_fullkspace" << "_" << j;
+                    GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, jobList[j].res, ostr.str());
+                }
+            }
+
+            // combine the job
+            workOrder3DTJobSplit.fullkspace_.create(E1, E2, dstCHA, RO, N);
+            GADGET_CHECK_RETURN_FALSE(this->combineReconJob(&workOrder3DTJobSplit, jobList, RO, N));
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DTJobSplit.fullkspace_, "job_combined_fullkspace");
+
+            // clear the memory
+            jobList.clear();
+
+            // permute the unwrapped kspace
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permtue RO to 1st dimension ... "));
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo1stDimensionFor3DRecon(workOrder3DTJobSplit.fullkspace_, kspaceIfftRO));
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftRO, "res_fullkspace_ROinIm");
+
+            // perform fft along the first dimension
+            GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(kspaceIfftRO, workOrder3DT->fullkspace_));
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->fullkspace_, "res_3DSpirit");
+        }
+        else
+        {
+            for ( n=0; n<(int)N; n++ )
+            {
+                // reuse the last kernel when there are fewer kernels than n
+                size_t kernelN = n;
+                if ( kernelN >= refN ) kernelN = refN-1;
+
+                // the arrays below only wrap the memory of the n-th kernel, input and output
+                hoNDArray<T> kIm(E1, E2, RO, srcCHA, dstCHA, workOrder3DT->kernelIm_->begin()+kernelN*RO*E1*E2*srcCHA*dstCHA);
+
+                hoNDArray<T> aliasedKSpace(RO, E1, E2, srcCHA, const_cast<T*>(data_dst.begin())+n*RO*E1*E2*srcCHA);
+
+                hoNDArray<T> unwarppedKSpace(RO, E1, E2, dstCHA, workOrder3DT->fullkspace_.begin()+n*RO*E1*E2*dstCHA);
+
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D unwrapping ... "));
+                GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImpl(workOrder3DT, aliasedKSpace, kIm, unwarppedKSpace, n));
+                GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, unwarppedKSpace, "unwarppedKSpace");
+            }
+        }
+
+        // coil combination is only performed when the coil map matches the reconstructed data;
+        // the multi-channel image buffer is allocated only in that case
+        if ( (workOrder3DT->coilMap_->get_size(0)==RO) 
+            && (workOrder3DT->coilMap_->get_size(1)==E1) 
+            && (workOrder3DT->coilMap_->get_size(2)==E2) 
+            && (workOrder3DT->coilMap_->get_size(3)==dstCHA) )
+        {
+            hoNDArrayMemoryManaged<T> complexImMultiChannel(RO, E1, E2, dstCHA, N, gtPlus_mem_manager_);
+            Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft3c(workOrder3DT->fullkspace_, complexImMultiChannel);
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, complexImMultiChannel, "unwarppedComplexIm");
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("spirit 3D coil combination ... "));
+            gtPlusISMRMRDReconUtilComplex<T>().coilCombine3D(complexImMultiChannel, *workOrder3DT->coilMap_, workOrder3DT->complexIm_);
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+            GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, workOrder3DT->complexIm_, "combined");
+        }
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwrapping(gtPlusReconWorkOrder3DT<T>* workOrder3DT, const hoNDArray<T>& data) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Solve the SPIRIT problem for data in which RO has been permuted to the 4th dimension.
+///
+/// After the transform along RO the 3D problem decouples, so one 2D (E1, E2) SPIRIT
+/// problem is solved with LSQR for every RO position; the RO positions are distributed
+/// over OpenMP threads.
+///
+/// @param workOrder3DT work order providing the solver settings (spirit_iter_max_,
+///                     spirit_iter_thres_, spirit_print_iter_)
+/// @param kspace       undersampled data, indexed here as [E1 E2 srcCHA RO]
+/// @param ker          image domain kernel, indexed here as [E1 E2 srcCHA dstCHA RO N];
+///                     it is first converted if its E1/E2 size differs from the data
+/// @param res          [out] result; created with the dimensions of kspace and filled
+///                     in blocks of E1*E2*dstCHA per RO position
+///                     (NOTE(review): this assumes srcCHA == dstCHA - confirm)
+/// @return true on success; false if a step fails or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwarppingImplROPermuted(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& ker, hoNDArray<T>& /*coilMap*/, hoNDArray<T>& res)
+{
+    try
+    {
+        size_t E1 = kspace.get_size(0);
+        size_t E2 = kspace.get_size(1);
+        size_t RO = kspace.get_size(3);
+
+        size_t kerE1 = ker.get_size(0);
+        size_t kerE2 = ker.get_size(1);
+        size_t srcCHA = ker.get_size(2);
+        size_t dstCHA = ker.get_size(3);
+        size_t kerN = ker.get_size(5);
+
+        // kerIm points to the kernel actually used: the input or its converted version
+        hoNDArray<T>* kerIm = &ker;
+        hoNDArray<T> kerImE1E2RO;
+        if ( kerE1!=E1 || kerE2!=E2 )
+        {
+            GADGET_MSG("gtPlusReconWorker3DTSPIRIT, kerE1!=E1 || kerE2!=E2, kernel needs to be converted along E1 and E2 ... ");
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("kernel conversion along E1 and E2 ... "));
+
+            if ( gtPlus_mem_manager_ )
+            {
+                // original note: kerImE1E2RO will be cleared as all '0' (not verifiable from this function)
+                // the buffer has to hold all kerN kernels, matching the dimensions of the array
+                // NOTE(review): the buffer is not released in this function - confirm who frees it
+                kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN, (T*)(gtPlus_mem_manager_->allocate(sizeof(T)*(size_t)RO*E1*E2*srcCHA*dstCHA*kerN)));
+            }
+            else
+            {
+                kerImE1E2RO.create(E1, E2, srcCHA, dstCHA, RO, kerN);
+                Gadgetron::clear(kerImE1E2RO);
+            }
+
+            GADGET_CHECK_RETURN_FALSE(spirit_.imageDomainKernelE1E2RO(ker, E1, E2, kerImE1E2RO));
+            kerIm = &kerImE1E2RO;
+
+            GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        }
+
+        res.create(kspace.get_dimensions());
+
+        // one 2D problem per RO position
+        long long NUM = (long long)RO;
+
+        #ifdef USE_OMP
+            // at most 16 threads are used
+            int numThreads = (NUM<16) ? NUM : 16;
+
+            int numOpenMPProcs = omp_get_num_procs();
+            GADGET_MSG("gtPlusReconWorker3DTSPIRIT, numOpenMPProcs : " << numOpenMPProcs);
+
+            int maxOpenMPThreads = omp_get_max_threads();
+            GADGET_MSG("gtPlusReconWorker3DTSPIRIT, maxOpenMPThreads : " << maxOpenMPThreads);
+
+            int allowOpenMPNested = omp_get_nested();
+
+            // nested parallelism is only enabled when there are fewer problems than processors
+            if ( NUM < numOpenMPProcs-2 )
+            {
+                omp_set_nested(1);
+                allowOpenMPNested = 1;
+            }
+            else
+            {
+                omp_set_nested(0);
+                allowOpenMPNested = 0;
+            }
+
+            GADGET_MSG("gtPlusReconWorker3DTSPIRIT, allowOpenMPNested : " << allowOpenMPNested);
+            GADGET_MSG("gtPlusReconWorker3DTSPIRIT, numThreads : " << numThreads);
+        #endif
+
+        long long t;
+
+        // the 2D operator runs with non-centered ffts (use_non_centered_fft_ below),
+        // so kernel and data are ifftshift-ed once here and the result is shifted back at the end
+        hoNDArray<T> ker_Shifted(kerIm);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(*kerIm, ker_Shifted);
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, ker_Shifted, "ker_Shifted");
+
+        hoNDArray<T> kspace_Shifted(kspace);
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifftshift2D(kspace, kspace_Shifted);
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspace_Shifted, "kspace_Shifted");
+
+        #ifdef GCC_OLD_FLAG
+            #pragma omp parallel default(none) private(t) shared(RO, E1, E2, srcCHA, dstCHA, workOrder3DT, kspace_Shifted, ker_Shifted, NUM) if ( NUM > 1 ) num_threads( numThreads )
+        #else
+            #pragma omp parallel default(none) private(t) shared(RO, E1, E2, srcCHA, dstCHA, workOrder3DT, NUM, kspace_Shifted, ker_Shifted, res) if ( NUM > 1 ) num_threads( numThreads )
+        #endif
+        {
+            // everything declared in this region exists once per thread
+            gtPlusSPIRIT2DOperator<T> spirit;
+            spirit.setMemoryManager(gtPlus_mem_manager_);
+            spirit.use_symmetric_spirit_ = false;
+            spirit.use_non_centered_fft_ = true;
+
+            hoNDArray<T> x0(E1, E2, srcCHA);
+            Gadgetron::clear(x0);
+
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >* pCGSolver;
+
+            pCGSolver = new gtPlusLSQRSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >();
+
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >& cgSolver = *pCGSolver;
+
+            cgSolver.iterMax_ = workOrder3DT->spirit_iter_max_;
+            cgSolver.thres_ = workOrder3DT->spirit_iter_thres_;
+            cgSolver.printIter_ = workOrder3DT->spirit_print_iter_;
+
+            cgSolver.set(spirit);
+
+            hoNDArray<T> b(E1, E2, srcCHA);
+
+            #pragma omp for
+            for ( t=0; t<NUM; t++ )
+            {
+                size_t ro = t;
+
+                // wrap the data and the result of the current RO position
+                hoNDArray<T> kspaceCurr(E1, E2, srcCHA, kspace_Shifted.begin()+ro*E1*E2*srcCHA);
+                hoNDArray<T> resCurr(E1, E2, dstCHA, res.begin()+ro*E1*E2*dstCHA);
+
+                // solve the 2D spirit problem
+                Gadgetron::clear(x0);
+
+                boost::shared_ptr<hoNDArray<T> > kerCurr(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, ker_Shifted.begin()+ro*E1*E2*srcCHA*dstCHA));
+
+                spirit.setForwardKernel(kerCurr, false);
+
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspaceCurr.begin()));
+                spirit.setAcquiredPoints(acq);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *kerCurr, "spirit3D_ker");
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *acq, "spirit3D_kspace");
+
+                // the acquired data is the initial guess of the solver
+                cgSolver.x0_ = acq.get();
+
+                // compute rhs
+                spirit.computeRighHandSide(*acq, b);
+
+                // solve
+                cgSolver.solve(b, resCurr);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, resCurr, "unwarppedKSpace_t");
+
+                // restore the acquired points
+                spirit.restoreAcquiredKSpace(*acq, resCurr);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, resCurr, "unwarppedKSpace_t_setAcq");
+            }
+
+            delete pCGSolver;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "res_Shifted");
+
+        // undo the shift; kspace_Shifted is reused as the temporary buffer
+        Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fftshift2D(res, kspace_Shifted);
+        res = kspace_Shifted;
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "resPermuted");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwarppingImplROPermuted(...) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Unwrap one 3D k-space volume with SPIRIT by decoupling the problem along RO.
+///
+/// The data is inverse transformed along RO (ifft1c), RO is permuted to the 4th
+/// dimension and one 2D (E1, E2) SPIRIT problem is solved with LSQR per RO position.
+/// The result is permuted back and transformed along RO again (fft1c).
+///
+/// @param workOrder3DT    work order providing the solver settings (spirit_iter_max_,
+///                        spirit_iter_thres_, spirit_print_iter_)
+/// @param kspace          undersampled data, indexed here as [RO E1 E2 srcCHA]
+/// @param adj_forward_G_I image domain kernel, read here as E1*E2 blocks ordered by
+///                        RO, then srcCHA, then dstCHA
+/// @param res             [out] unwrapped k-space, written by fft1c
+/// @param n               not referenced in the body of this function
+/// @return true on success; false if a step fails or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconWorkOrder<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res, size_t n)
+{
+    try
+    {
+        // RO, E1, E2, srcCHA, dstCHA, N
+        size_t RO = kspace.get_size(0);
+        size_t E1 = kspace.get_size(1);
+        size_t E2 = kspace.get_size(2);
+
+        size_t srcCHA = adj_forward_G_I.get_size(3);
+        size_t dstCHA = adj_forward_G_I.get_size(4);
+
+        // perform the 3D recon by read-out decoupling
+        // resDecoupled receives the per-RO results in the layout [E1 E2 dstCHA RO]
+        hoNDArrayMemoryManaged<T> resDecoupled(E1, E2, dstCHA, RO, gtPlus_mem_manager_);
+
+        hoNDArrayMemoryManaged<T> kspaceIfftRO(RO, E1, E2, srcCHA, gtPlus_mem_manager_);
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->ifft1c(kspace, kspaceIfftRO));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftRO, "kspaceIfftRO");
+
+        hoNDArrayMemoryManaged<T> kspaceIfftROPermuted(E1, E2, srcCHA, RO, gtPlus_mem_manager_);
+
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permtue RO to 4th dimension ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo4thDimensionFor3DRecon(kspaceIfftRO, kspaceIfftROPermuted));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, kspaceIfftROPermuted, "kspaceIfftROPermuted");
+
+        // raw pointers are taken here because they are what the shared() clause below lists
+        T* pKspaceIfftROPermuted = kspaceIfftROPermuted.begin();
+
+        T* pG_I = adj_forward_G_I.begin();
+
+        // one 2D problem per RO position
+        long long NUM = (long long)RO;
+
+        long long t;
+
+        // parallel only for more than 6 RO positions, with at most 16 threads
+        #pragma omp parallel default(none) private(t) shared(RO, E1, E2, srcCHA, dstCHA, workOrder3DT, NUM, resDecoupled, pKspaceIfftROPermuted, pG_I) if ( NUM > 6 ) num_threads( (NUM<16) ? NUM : 16 )
+        {
+            // everything declared in this region exists once per thread
+            // buffer for the kernel of a single RO position, [E1 E2 srcCHA dstCHA]
+            hoNDArrayMemoryManaged<T> adjForG_I_Decoupled(E1, E2, srcCHA, dstCHA, gtPlus_mem_manager_);
+            T* pDecoupledG_I = adjForG_I_Decoupled.begin();
+
+            gtPlusSPIRIT2DOperator<T> spirit;
+            spirit.setMemoryManager(gtPlus_mem_manager_);
+            spirit.use_symmetric_spirit_ = false;
+
+            hoNDArray<T> x0(E1, E2, srcCHA);
+            Gadgetron::clear(x0);
+
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >* pCGSolver;
+            pCGSolver = new gtPlusLSQRSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >();
+            gtPlusLinearSolver<hoNDArray<T>, hoNDArray<T>, gtPlusSPIRIT2DOperator<T> >& cgSolver = *pCGSolver;
+
+            cgSolver.iterMax_ = workOrder3DT->spirit_iter_max_;
+            cgSolver.thres_ = workOrder3DT->spirit_iter_thres_;
+            cgSolver.printIter_ = workOrder3DT->spirit_print_iter_;
+
+            cgSolver.set(spirit);
+
+            hoNDArray<T> b(E1, E2, srcCHA);
+
+            #pragma omp for
+            for ( t=0; t<NUM; t++ )
+            {
+                size_t ro = t;
+
+                // wrap the data and the result of the current RO position
+                hoNDArray<T> kspace_DeDecoupled(E1, E2, srcCHA, pKspaceIfftROPermuted+ro*E1*E2*srcCHA);
+                hoNDArray<T> resCurr(E1, E2, dstCHA, resDecoupled.begin()+ro*E1*E2*dstCHA);
+
+                // fill in kernel and kspace
+                // (e1 and e2 are declared but not used)
+                size_t e1, e2, scha, dcha;
+
+                // gather the E1*E2 kernel planes of this RO position into the per-thread buffer
+                for ( dcha=0; dcha<dstCHA; dcha++)
+                {
+                    for ( scha=0; scha<srcCHA; scha++)
+                    {
+
+                        T* pDst = pDecoupledG_I + scha*E1*E2+dcha*E1*E2*srcCHA;
+                        T* pSrc = pG_I + ro*E1*E2+scha*RO*E1*E2+dcha*RO*E1*E2*srcCHA;
+                        memcpy(pDst, pSrc, sizeof(T)*E1*E2);
+                    }
+                }
+
+                // solve the 2D spirit problem
+                Gadgetron::clear(x0);
+
+                // ker and acq only wrap existing memory (the per-thread kernel buffer and the current data)
+                boost::shared_ptr<hoNDArray<T> > ker(new hoNDArray<T>(E1, E2, srcCHA, dstCHA, pDecoupledG_I));
+
+                spirit.setForwardKernel(ker, false);
+
+                boost::shared_ptr<hoNDArray<T> > acq(new hoNDArray<T>(E1, E2, srcCHA, kspace_DeDecoupled.begin()));
+                spirit.setAcquiredPoints(acq);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *ker, "spirit3D_ker");
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, *acq, "spirit3D_kspace");
+
+                // the acquired data is the initial guess of the solver
+                cgSolver.x0_ = acq.get();
+
+                // compute rhs
+                spirit.computeRighHandSide(*acq, b);
+
+                // solve
+                cgSolver.solve(b, resCurr);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, resCurr, "unwarppedKSpace_t");
+
+                // restore the acquired points
+                spirit.restoreAcquiredKSpace(*acq, resCurr);
+
+                GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, resCurr, "unwarppedKSpace_t_setAcq");
+            }
+
+            delete pCGSolver;
+        }
+
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, resDecoupled, "resDecoupled");
+
+        // permute the unwrapped kspace
+        // kspaceIfftRO is reused as the output buffer of the permutation
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.start("permtue RO to 1st dimension ... "));
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::permuteROTo1stDimensionFor3DRecon(resDecoupled, kspaceIfftRO));
+        GADGET_CHECK_PERFORM(performTiming_, gt_timer3_.stop());
+
+        // perform fft along the first dimension
+        GADGET_CHECK_RETURN_FALSE(Gadgetron::hoNDFFT<typename realType<T>::Type>::instance()->fft1c(kspaceIfftRO, res));
+        GADGET_EXPORT_ARRAY_COMPLEX(debugFolder_, gt_exporter_, res, "res_3DSpirit");
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwarppingImpl(gtPlusReconWorkOrder3DT<T>* workOrder3DT, hoNDArray<T>& kspace, hoNDArray<T>& adj_forward_G_I, hoNDArray<T>& res) ... ");
+        return false;
+    }
+
+    return true;
+}
+
+/// Process a single split job: unwrap job.kspace with job.ker and store the result in job.res.
+///
+/// @param job job whose kspace, ker, workOrder2DT and coil map are read and whose res is written
+/// @return true on success; false if the unwrapping fails or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::
+performUnwarppingImpl(gtPlusReconJob2DT<T>& job)
+{
+    try
+    {
+        // start from a copy of the input before the references below are handed on
+        job.res = job.kspace;
+
+        gtPlusReconWorkOrder<T>* workOrder3DT = &job.workOrder2DT;
+
+        // [E1 E2 srcCHA RO 1]
+        hoNDArray<T>& kspace = job.kspace;
+        // [E1 E2 srcCHA dstCHA RO 1]
+        hoNDArray<T>& ker = job.ker;
+        // [E1 E2 dstCHA RO 1]
+        hoNDArray<T>& res = job.res;
+
+        GADGET_CHECK_RETURN_FALSE(this->performUnwarppingImplROPermuted(workOrder3DT, kspace, ker, *job.workOrder2DT.coilMap_, res));
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTSPIRIT<T>::performUnwarppingImpl(job) ... ");
+    }
+
+    return false;
+}
+
+/// Entry point of the 3DT SPIRIT worker: hands the memory manager of this worker to the
+/// SPIRIT helper and then runs the reconstruction of the base class.
+///
+/// @param workOrder3DT work order to reconstruct; must not be NULL
+/// @return true on success; false if the work order is NULL, the base class fails
+///         or an exception is caught
+template <typename T> 
+bool gtPlusReconWorker3DTSPIRIT<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT)
+{
+    try
+    {
+        GADGET_CHECK_RETURN_FALSE(workOrder3DT!=NULL);
+
+        // the SPIRIT helper shares the memory manager of this worker
+        this->spirit_.gtPlus_mem_manager_ = this->gtPlus_mem_manager_;
+
+        // the recon itself is driven by the BaseClass
+        GADGET_CHECK_RETURN_FALSE(BaseClass::performRecon(workOrder3DT));
+
+        return true;
+    }
+    catch(...)
+    {
+        GADGET_ERROR_MSG("Errors in gtPlusReconWorker3DTSPIRIT<T>::performRecon(gtPlusReconWorkOrder3DT<T>* workOrder3DT) ... ");
+    }
+
+    return false;
+}
+
+}}
diff --git a/toolboxes/mri/CMakeLists.txt b/toolboxes/mri/CMakeLists.txt
new file mode 100644
index 0000000..88128fd
--- /dev/null
+++ b/toolboxes/mri/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(pmri)
diff --git a/toolboxes/mri/pmri/CMakeLists.txt b/toolboxes/mri/pmri/CMakeLists.txt
new file mode 100644
index 0000000..2244056
--- /dev/null
+++ b/toolboxes/mri/pmri/CMakeLists.txt
@@ -0,0 +1,3 @@
+IF (CUDA_FOUND)
+  add_subdirectory(gpu)
+ENDIF (CUDA_FOUND)
diff --git a/toolboxes/mri/pmri/gpu/CMakeLists.txt b/toolboxes/mri/pmri/gpu/CMakeLists.txt
new file mode 100644
index 0000000..7d932bc
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/CMakeLists.txt
@@ -0,0 +1,73 @@
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUPMRI__)
+ADD_DEFINITIONS(-DWIN32_LEAN_AND_MEAN)
+endif (WIN32)
+
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+find_package(CULA REQUIRED)
+
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${CULA_INCLUDE_DIR} 
+  ${Boost_INCLUDE_DIR} 
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers/gpu
+  )
+
+cuda_add_library(gpuparallelmri SHARED 
+    b1_map.h
+    cuCartesianSenseOperator.h
+    cuNonCartesianKtSenseOperator.h
+    cuNonCartesianSenseOperator.h
+    cuSenseBuffer.h
+    cuSenseBufferCg.h
+    cuSenseOperator.h
+    gpupmri_export.h
+    htgrappa.h
+    senseOperator.h
+    sense_utilities.h
+    b1_map.cu
+    b1_map_NIH_Souheil.cu
+    sense_utilities.cu
+    cuSenseOperator.cu
+    cuCartesianSenseOperator.cu
+    cuNonCartesianSenseOperator.cu
+    cuNonCartesianKtSenseOperator.cu
+    cuSenseBuffer.cpp
+    cuSenseBufferCg.cpp
+    htgrappa.cu
+  )
+
+target_link_libraries(gpuparallelmri 
+  gpucore 
+  gpunfft 
+  ${Boost_LIBRARIES}
+  ${FFTW3_LIBRARIES} 
+  ${CUDA_LIBRARIES} 
+  ${CUDA_CUFFT_LIBRARIES} 
+  ${CUDA_CUBLAS_LIBRARIES} 
+  ${CULA_LIBRARIES}
+  )
+
+install(TARGETS gpuparallelmri DESTINATION lib)
+
+install(FILES 
+	b1_map.h
+	sense_utilities.h
+	htgrappa.h
+	senseOperator.h
+	cuSenseOperator.h
+	cuCartesianSenseOperator.h
+	cuNonCartesianSenseOperator.h
+	cuNonCartesianKtSenseOperator.h
+	cuSenseBuffer.h cuSenseBufferCg.h # cuSenseBufferCg.h is built into gpuparallelmri but was missing from the installed headers
+	gpupmri_export.h
+DESTINATION include)
diff --git a/toolboxes/mri/pmri/gpu/b1_map.cu b/toolboxes/mri/pmri/gpu/b1_map.cu
new file mode 100644
index 0000000..b060a55
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1_map.cu
@@ -0,0 +1,733 @@
+#include "b1_map.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "real_utilities.h"
+#include "real_utilities_device.h"
+#include "complext.h"
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+
+#include <iostream>
+#include <cmath>
+
+using namespace std;
+
+namespace Gadgetron{
+
+  const int kernel_width = 7;
+
+  template<class REAL, unsigned int D> void smooth_correlation_matrices( cuNDArray<complext<REAL> >*, cuNDArray<complext<REAL> >*);
+  template<class REAL> boost::shared_ptr< cuNDArray<complext<REAL> > > extract_csm( cuNDArray<complext<REAL> >*, unsigned int, unsigned int);
+  template<class REAL> void set_phase_reference( cuNDArray<complext<REAL> >*, unsigned int, unsigned int);
+  template<class T> void find_stride( cuNDArray<T> *in, unsigned int dim, unsigned int *stride, std::vector<size_t> *dims );
+  template<class T> boost::shared_ptr< cuNDArray<T> > correlation( cuNDArray<T> *in );
+  template<class T> void rss_normalize( cuNDArray<T> *in_out, unsigned int dim );
+  
+  //
+  // Main method
+  //
+
+  template<class REAL, unsigned int D> boost::shared_ptr< cuNDArray<complext<REAL> > >
+  estimate_b1_map( cuNDArray<complext<REAL> > *data_in, int target_coils)
+  {
+    // Input must have exactly D image dimensions followed by one coil dimension; otherwise an empty pointer is returned.
+    if( data_in->get_number_of_dimensions() < 2 ){
+      cout << endl << "estimate_b1_map:: dimensionality mismatch." << endl; 
+      return boost::shared_ptr< cuNDArray<complext<REAL> > >();
+    }
+
+    if( data_in->get_number_of_dimensions()-1 != D ){
+      cout << endl << "estimate_b1_map:: dimensionality mismatch." << endl; 
+      return boost::shared_ptr< cuNDArray<complext<REAL> > >();
+    }
+    // A target_coils value that is non-positive or larger than the coil dimension selects all coils.
+    int target_coils_int = 0;
+    if ((target_coils <= 0) || (target_coils > data_in->get_size(D))) {
+      target_coils_int = data_in->get_size(D);
+    } else {
+      target_coils_int = target_coils;
+    }
+
+    vector<unsigned int> image_dims, dims_to_xform; // NOTE(review): both are filled below but never read in this function
+    unsigned int pixels_per_coil = 1;
+  
+    for( unsigned int i=0; i<D; i++ ){
+      image_dims.push_back(data_in->get_size(i));
+      dims_to_xform.push_back(i);
+      pixels_per_coil *= data_in->get_size(i);
+    }
+  
+    unsigned int ncoils = data_in->get_size(D);
+
+    // Make a copy of input data, but only the target coils
+    boost::shared_ptr< cuNDArray<complext<REAL> > > data_out;
+    if (0 && target_coils_int == ncoils) { // the leading "0 &&" disables this branch; the else branch always runs
+      cuNDArray<complext<REAL> > *_data_out = new cuNDArray<complext<REAL> >(*data_in);
+      data_out = boost::shared_ptr< cuNDArray<complext<REAL> > >(_data_out);
+    } else {
+      std::vector<size_t> odims = *(data_in->get_dimensions().get());
+      odims[D] = target_coils_int;
+      cuNDArray<complext<REAL> > *_data_out = new cuNDArray<complext<REAL> >(&odims);
+      data_out = boost::shared_ptr< cuNDArray<complext<REAL> > >(_data_out);
+
+      //Now copy one coil at a time
+      unsigned int elements_per_coil = data_in->get_number_of_elements()/ncoils;
+      for (unsigned int i = 0; i < target_coils_int; i++) {
+	cudaMemcpy(data_out.get()->get_data_ptr()+i*elements_per_coil,
+		   data_in->get_data_ptr()+i*elements_per_coil,
+		   elements_per_coil*sizeof(complext<REAL>),
+		   cudaMemcpyDeviceToDevice);
+      }
+      ncoils = target_coils_int; // from here on ncoils is the number of coils actually processed
+    }
+  
+    // Normalize by the RSS of the coils
+    rss_normalize( data_out.get(), D );
+  
+    // Now calculate the correlation matrices
+    boost::shared_ptr<cuNDArray<complext<REAL> > > corrm = correlation( data_out.get() );
+    data_out.reset(); // drop this function's reference to the coil copy; it is not used below
+  
+    // Smooth (onto copy of corrm)
+    cuNDArray<complext<REAL> > *_corrm_smooth = new cuNDArray<complext<REAL> >();
+    _corrm_smooth->create(corrm->get_dimensions().get());
+    boost::shared_ptr<cuNDArray<complext<REAL> > > corrm_smooth(_corrm_smooth);
+
+    smooth_correlation_matrices<REAL,D>( corrm.get(), corrm_smooth.get() );
+    corrm.reset();
+
+    // Get the dominant eigenvector for each correlation matrix.
+    boost::shared_ptr<cuNDArray<complext<REAL> > > csm = extract_csm<REAL>( corrm_smooth.get(), ncoils, pixels_per_coil );
+    corrm_smooth.reset();
+  
+    // Set phase according to reference (coil 0)
+    set_phase_reference<REAL>( csm.get(), ncoils, pixels_per_coil );
+  
+    return csm;
+  }
+
+  template<class T> static void find_stride( cuNDArray<T> *in, unsigned int dim,
+					     unsigned int *stride, std::vector<size_t> *dims )
+  { // *stride <- product of the sizes of all dimensions before 'dim'; *dims <- sizes of all dimensions except 'dim' (appended)
+    *stride = 1; // NOTE(review): this definition is 'static' while the forward declaration near the top of the file is not
+    for( unsigned int i=0; i<in->get_number_of_dimensions(); i++ ){
+      if( i != dim )
+	dims->push_back(in->get_size(i));
+      if( i < dim )
+	*stride *= in->get_size(i);
+    }
+  }
+  
+  template<class REAL, class T> __inline__  __device__ REAL
+  _rss( unsigned int idx, T *in, unsigned int stride, unsigned int number_of_batches )
+  { // Returns the square root of the sum of norm() over the number_of_batches entries, spaced 'stride' apart, that belong to element idx.
+    unsigned int in_idx = (idx/stride)*stride*number_of_batches+(idx%stride); // offset of the first of those entries
+    REAL rss = REAL(0);
+    
+    for( unsigned int i=0; i<number_of_batches; i++ ) 
+      rss += norm(in[i*stride+in_idx]); // norm() is presumably the squared magnitude -- TODO confirm against complext.h
+    
+    rss = std::sqrt(rss); 
+    
+    return rss;
+  }
+  
+  template<class T> __global__ void
+  rss_normalize_kernel( T *in_out, unsigned int stride, unsigned int number_of_batches, unsigned int number_of_elements )
+  { // One thread per element: scales the element's number_of_batches entries in place by 1/_rss of those entries.
+    typedef typename realType<T>::Type REAL;
+
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    if( idx < number_of_elements ){
+      // NOTE(review): there is no guard for an rss of zero before taking the reciprocal
+      REAL reciprocal_rss = 1/(_rss<REAL,T>(idx, in_out, stride, number_of_batches));
+      
+      unsigned int in_idx = (idx/stride)*stride*number_of_batches+(idx%stride); // same offset computation as in _rss
+      
+      for( unsigned int i=0; i<number_of_batches; i++ ) {
+	T out = in_out[i*stride+in_idx];
+	out *= reciprocal_rss; // complex-scalar multiplication (element-wise operator)
+	in_out[i*stride+in_idx] = out; 
+      } 
+    }
+  }
+  
+  // Normalized RSS: divides in_out in place by its root-sum-of-squares taken along dimension 'dim'
+  template<class T>
+  void rss_normalize( cuNDArray<T> *in_out, unsigned int dim )
+  {
+    unsigned int number_of_batches = in_out->get_size(dim);
+    unsigned int number_of_elements = in_out->get_number_of_elements()/number_of_batches;
+    
+    // Setup block/grid dimensions
+    dim3 blockDim; dim3 gridDim;
+    setup_grid( number_of_elements, &blockDim, &gridDim );
+
+    // Find element stride ('dims' is filled by find_stride but not used afterwards)
+    unsigned int stride; std::vector<size_t> dims;
+    find_stride<T>( in_out, dim, &stride, &dims );
+
+    // Invoke kernel
+    rss_normalize_kernel<T><<< gridDim, blockDim >>>( in_out->get_data_ptr(), stride, number_of_batches, number_of_elements );
+ 
+    CHECK_FOR_CUDA_ERROR();    
+  }
+
+  template<class REAL, class T> __global__ void
+  correlation_kernel( T *in, T *corrm, unsigned int num_batches, unsigned int num_elements )
+  { // For element p, thread row i fills entries (i,j) and (j,i) for all j<i plus the diagonal entry (i,i) of the num_batches x num_batches matrix.
+    const unsigned int p = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int i = threadIdx.y;
+    
+    if( p < num_elements ){
+      for( unsigned int j=0; j<i; j++){
+	T tmp = in[i*num_elements+p]*conj(in[j*num_elements+p]);
+	corrm[(j*num_batches+i)*num_elements+p] = tmp;
+	corrm[(i*num_batches+j)*num_elements+p] = conj(tmp); // the mirrored entry is the complex conjugate
+      }
+      T tmp = in[i*num_elements+p];
+      corrm[(i*num_batches+i)*num_elements+p] = tmp*conj(tmp);
+    }
+  }
+  
+  // Build correlation matrix: for every element of 'in', correlates its entries along the last dimension with each other
+  template<class T> boost::shared_ptr< cuNDArray<T> > correlation( cuNDArray<T> *in )
+  {
+    typedef typename realType<T>::Type REAL;
+    // Query the current device; its limits are used to size the thread block below
+    int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+
+    unsigned int number_of_batches = in->get_size(in->get_number_of_dimensions()-1);
+    unsigned int number_of_elements = in->get_number_of_elements()/number_of_batches;
+
+    int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+    int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+    dim3 blockDim(((max_blockdim/number_of_batches)/warp_size)*warp_size, number_of_batches); // x: multiple of warp_size, y: one thread per batch
+
+    if( blockDim.x == 0 ){
+      throw std::runtime_error("correlation: correlation dimension exceeds device capacity.");
+    }
+  
+    dim3 gridDim((number_of_elements+blockDim.x-1)/blockDim.x);
+
+    // Allocate the output (input dimensions plus one trailing dimension of size number_of_batches) and invoke kernel
+    std::vector<size_t> dims = *in->get_dimensions(); dims.push_back(number_of_batches);
+    boost::shared_ptr< cuNDArray<T> > out( new cuNDArray<T> );
+    out->create(&dims);
+
+    correlation_kernel<REAL,T><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), number_of_batches, number_of_elements );
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    return out;
+  }
+
+  // Smooth correlation matrices by box filter (1D): one thread per image element, blockIdx.y selects the batch
+  template<class REAL> __global__ void
+  smooth_correlation_matrices_kernel( complext<REAL> *corrm, complext<REAL> *corrm_smooth, intd<1>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const int co = idx;    
+      const int x = co;
+    
+      const int size_x = image_dims.vec[0];
+    
+      const REAL scale = REAL(1)/((REAL)kernel_width); // fixed scale, also where the window is clipped at the image edge
+    
+      complext<REAL> result = complext<REAL>(0);
+    
+      for (int kx = 0; kx < kernel_width; kx++) {
+      
+	if ((x-(kernel_width>>1)+kx) >= 0 &&
+	    (x-(kernel_width>>1)+kx) < size_x)
+	  {	    
+	    int source_offset = 
+	      batch*num_image_elements +
+	      (x-(kernel_width>>1)+kx);
+	  
+	    result += corrm[source_offset]; // samples outside the image are skipped
+	  }
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  // Smooth correlation matrices by box filter (2D): sums only where the whole window lies inside the image; every other pixel is written as zero here
+  template<class REAL> __global__ void
+  smooth_correlation_matrices_kernel( complext<REAL> *corrm, complext<REAL> *corrm_smooth, intd<2>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const intd2 co = idx_to_co<2>(idx, image_dims);
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+    
+      const int half_width = kernel_width>>1;
+
+      const int yminus = y-half_width;
+      const int xminus = x-half_width;
+      const int yplus = y+half_width;
+      const int xplus = x+half_width;
+
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width));
+    
+      complext<REAL> result = complext<REAL>(0);
+   
+      if( (yminus >=0) ){
+	if( yplus < size_y ){
+	  if( xminus >= 0 ){
+	    if( xplus < size_x ){
+
+#pragma unroll
+	      for (int ky = 0; ky < kernel_width; ky++){
+#pragma unroll
+		for (int kx = 0; kx < kernel_width; kx++) {
+		
+		  int cy = yminus+ky;
+		  int cx = xminus+kx;
+		
+		  int source_offset = batch*num_image_elements + cy*size_x + cx;
+		  result += corrm[source_offset];
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result; // result is still zero if any window bound check above failed
+    }
+  }
+
+  // Smooth correlation matrices by box filter (3D): one thread per image element, blockIdx.y selects the batch
+  template<class REAL> __global__ void
+  smooth_correlation_matrices_kernel( complext<REAL> *corrm, complext<REAL> *corrm_smooth, intd<3>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const intd3 co = idx_to_co<3>(idx, image_dims);
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+      const int z = co.vec[2];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+      const int size_z = image_dims.vec[2];
+    
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width*kernel_width)); // fixed scale, also where the window is clipped
+    
+      complext<REAL> result = complext<REAL>(0);
+    
+      for (int kz = 0; kz < kernel_width; kz++) {
+	for (int ky = 0; ky < kernel_width; ky++) {
+	  for (int kx = 0; kx < kernel_width; kx++) {
+	
+	    if ((z-(kernel_width>>1)+kz) >= 0 &&
+		(z-(kernel_width>>1)+kz) < size_z &&
+		(y-(kernel_width>>1)+ky) >= 0 &&
+		(y-(kernel_width>>1)+ky) < size_y &&
+		(x-(kernel_width>>1)+kx) >= 0 &&
+		(x-(kernel_width>>1)+kx) < size_x) 
+	      {	    
+		int source_offset = 
+		  batch*num_image_elements +
+		  (z-(kernel_width>>1)+kz)*size_x*size_y +
+		  (y-(kernel_width>>1)+ky)*size_x +
+		  (x-(kernel_width>>1)+kx);
+	    
+		result += corrm[source_offset]; // samples outside the image are skipped
+	      }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  // Smooth correlation matrices by box filter (4D): one thread per image element, blockIdx.y selects the batch
+  template<class REAL> __global__ void
+  smooth_correlation_matrices_kernel( complext<REAL> *corrm, complext<REAL> *corrm_smooth, intd<4>::Type image_dims )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < num_image_elements ){
+    
+      const intd4 co = idx_to_co<4>(idx, image_dims);
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+      const int z = co.vec[2];
+      const int w = co.vec[3];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+      const int size_z = image_dims.vec[2];    
+      const int size_w = image_dims.vec[3];
+    
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width*kernel_width*kernel_width)); // fixed scale, also where the window is clipped
+    
+      complext<REAL> result = complext<REAL>(0);
+    
+      for (int kw = 0; kw < kernel_width; kw++) {
+	for (int kz = 0; kz < kernel_width; kz++) {
+	  for (int ky = 0; ky < kernel_width; ky++) {
+	    for (int kx = 0; kx < kernel_width; kx++) {
+	
+	      if ((w-(kernel_width>>1)+kw) >= 0 &&
+		  (w-(kernel_width>>1)+kw) < size_w &&
+		  (z-(kernel_width>>1)+kz) >= 0 &&
+		  (z-(kernel_width>>1)+kz) < size_z &&
+		  (y-(kernel_width>>1)+ky) >= 0 &&
+		  (y-(kernel_width>>1)+ky) < size_y &&
+		  (x-(kernel_width>>1)+kx) >= 0 &&
+		  (x-(kernel_width>>1)+kx) < size_x) 
+		{	    
+		  int source_offset = 
+		    batch*num_image_elements +
+		    (w-(kernel_width>>1)+kw)*size_x*size_y*size_z +
+		    (z-(kernel_width>>1)+kz)*size_x*size_y +
+		    (y-(kernel_width>>1)+ky)*size_x +
+		    (x-(kernel_width>>1)+kx);
+	    
+		  result += corrm[source_offset]; // samples outside the image are skipped
+		}
+	    }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+idx] = scale*result;
+    }
+  }
+
+  __device__ int _min( int A, int B ){ // device-side helper: returns the smaller of A and B
+    return (A<B) ? A : B;
+  }
+
+  // Smooth correlation matrices border by box filter (2D): one thread per pixel of the half_width-wide frame around the image
+  template<class REAL> __global__ void
+  smooth_correlation_matrices_border_kernel( complext<REAL> *corrm, complext<REAL> *corrm_smooth, intd<2>::Type image_dims, unsigned int number_of_border_threads )
+  {
+    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const int batch = blockIdx.y;
+
+    const int num_image_elements = prod(image_dims);
+
+    if( idx < number_of_border_threads ){
+      // Map the linear border index to a pixel: top rows first, then left/right strips of the middle rows, then the bottom rows.
+      intd2 co;
+      const int half_width = kernel_width>>1;
+
+      co.vec[1] = idx/image_dims.vec[0];
+      co.vec[1] = _min(co.vec[1], half_width );
+    
+      if( co.vec[1] == half_width ){ // past the top half_width rows
+	int new_idx = idx-half_width*image_dims.vec[0];
+	int num_skips = new_idx/half_width; // number of half_width-wide strips passed; each middle row has two (left and right)
+	int rows_offset = _min(num_skips>>1, image_dims.vec[1]-(half_width<<1) );
+	co.vec[1] += rows_offset;
+
+	if( co.vec[1] == (half_width + image_dims.vec[1]-(half_width<<1)) ){ // reached the bottom half_width rows
+	  new_idx -= ((image_dims.vec[1]-(half_width<<1))*(half_width<<1));
+	  co.vec[1] += (new_idx / image_dims.vec[0]);
+	  co.vec[0] = (new_idx % image_dims.vec[0]);
+	}
+	else{
+	  co.vec[0] = (num_skips%2)*(image_dims.vec[0]-half_width) + (new_idx%half_width); // even strip: left edge, odd strip: right edge
+	}
+      }
+      else{
+	co.vec[0] = idx%image_dims.vec[0];
+      }
+    
+      const int x = co.vec[0];
+      const int y = co.vec[1];
+    
+      const int size_x = image_dims.vec[0];
+      const int size_y = image_dims.vec[1];
+    
+      const int yminus = y-half_width;
+      const int xminus = x-half_width;
+
+      const REAL scale = REAL(1)/((REAL)(kernel_width*kernel_width)); // full-window scale even though part of the window is outside the image
+    
+      complext<REAL> result = complext<REAL>(0);
+ 
+#pragma unroll
+      for (int ky = 0; ky < kernel_width; ky++) {
+#pragma unroll
+	for (int kx = 0; kx < kernel_width; kx++) {
+	
+	  if( (yminus+ky >=0) ){
+	    if( yminus+ky < size_y ){
+	      if( xminus+kx >= 0 ){
+		if( xminus+kx < size_x ){
+		
+		  int source_offset = 
+		    batch*num_image_elements +
+		    (yminus+ky)*size_x +
+		    (xminus+kx);
+		
+		  result += corrm[source_offset];
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      corrm_smooth[batch*num_image_elements+co_to_idx<2>(co,image_dims)] = scale*result;  
+    }
+  }
+
+  template<class REAL, unsigned int D> void
+  smooth_correlation_matrices( cuNDArray<complext<REAL> > *corrm, cuNDArray<complext<REAL> > *corrm_smooth )
+  { // Box-filters corrm into corrm_smooth: first the D-dimensional kernel over all pixels, then the border kernel over the frame pixels.
+    typename intd<D>::Type image_dims;
+
+    for( unsigned int i=0; i<D; i++ ){
+      image_dims.vec[i] = corrm->get_size(i);
+    }
+  
+    unsigned int number_of_batches = 1;
+    // every dimension beyond the first D is treated as a batch dimension
+    for( unsigned int i=D; i<corrm->get_number_of_dimensions(); i++ ){
+      number_of_batches *= corrm->get_size(i);
+    }
+  
+    int device; cudaGetDevice( &device );
+    cudaDeviceProp deviceProp; cudaGetDeviceProperties( &deviceProp, device );
+
+    dim3 blockDim(deviceProp.maxThreadsPerBlock);
+    dim3 gridDim((unsigned int) std::ceil((double)prod(image_dims)/blockDim.x), number_of_batches);
+
+    smooth_correlation_matrices_kernel<REAL><<<gridDim, blockDim>>>
+      ( corrm->get_data_ptr(), corrm_smooth->get_data_ptr(), image_dims );
+  
+    CHECK_FOR_CUDA_ERROR();
+    // NOTE(review): the only border kernel in this file takes intd<2>::Type, so this second pass appears to require D == 2
+    unsigned int number_of_border_threads = ((kernel_width>>1)<<1)*(sum(image_dims)-((kernel_width>>1)<<1));
+    blockDim = dim3(128);
+    gridDim = dim3((unsigned int) std::ceil((double)number_of_border_threads/blockDim.x), number_of_batches);
+  
+    smooth_correlation_matrices_border_kernel<REAL><<<gridDim, blockDim>>>
+      ( corrm->get_data_ptr(), corrm_smooth->get_data_ptr(), image_dims, number_of_border_threads );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  extern __shared__ char shared_mem[];
+
+  // Extract CSM (variant keeping the iterate and the scratch vector in shared_mem; its only launch in this file is commented out)
+  template<class REAL> __global__ void
+  extract_csm_kernel( complext<REAL> *corrm, complext<REAL> *csm, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int i = threadIdx.x;
+  
+    if( idx < num_elements ){    
+    
+      // Get the dominant eigenvector for each correlation matrix.
+      // Copying Peter Kellman's approach we use the power method:
+      //  b_k+1 = A*b_k / ||A*b_k||
+    
+      complext<REAL> *data_out = (complext<REAL>*) shared_mem; // first num_batches*blockDim.x entries: current iterate b_k
+      complext<REAL> *tmp_v = &(((complext<REAL>*) shared_mem)[num_batches*blockDim.x]); // next num_batches*blockDim.x entries: A*b_k
+    
+      const unsigned int iterations = 2;
+    
+      for( unsigned int c=0; c<num_batches; c++){
+	data_out[c*blockDim.x+i] = complext<REAL>(1);
+      }
+    
+      for( unsigned int it=0; it<iterations; it++ ){
+      
+	for( unsigned int c=0; c<num_batches; c++){
+	  tmp_v[c*blockDim.x+i] = complext<REAL>(0);
+	}
+      
+	for( unsigned j=0; j<num_batches; j++){
+	  for( unsigned int k=0; k<num_batches; k++){
+	    tmp_v[j*blockDim.x+i] += corrm[(k*num_batches+j)*num_elements+idx]*data_out[k*blockDim.x+i];
+	  }
+	}
+      
+	REAL tmp = REAL(0);
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  tmp += norm(tmp_v[c*blockDim.x+i]);
+	}
+      
+	tmp = 1/std::sqrt(tmp);
+
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  complext<REAL> res = tmp*tmp_v[c*blockDim.x+i];
+	  data_out[c*blockDim.x+i] = res;
+	}
+      }
+    
+      for (unsigned int c=0; c<num_batches; c++){
+	csm[c*num_elements+idx] = data_out[c*blockDim.x+i];
+      }
+    }
+  }
+
+  // Extract CSM (variant iterating directly in csm and using the caller-provided buffer tmp_v as scratch)
+  template<class REAL> __global__ void
+  extract_csm_kernel( complext<REAL> *corrm, complext<REAL> *csm, unsigned int num_batches, unsigned int num_elements, complext<REAL> *tmp_v )
+  {
+    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if( idx < num_elements ){    
+    
+      // Get the dominant eigenvector for each correlation matrix.
+      // Copying Peter Kellman's approach we use the power method:
+      //  b_k+1 = A*b_k / ||A*b_k||
+    
+      const unsigned int iterations = 2;
+
+      for( unsigned int c=0; c<num_batches; c++){
+	csm[c*num_elements+idx] = complext<REAL>(1);
+      }
+    
+      for( unsigned int it=0; it<iterations; it++ ){
+
+	for( unsigned int c=0; c<num_batches; c++){
+	  tmp_v[c*num_elements+idx] = complext<REAL>(0);
+	}
+      
+	for( unsigned j=0; j<num_batches; j++){
+	  for( unsigned int k=0; k<num_batches; k++){
+	    typedef complext<REAL> T; // NOTE(review): this typedef is not used
+	    tmp_v[j*num_elements+idx] += corrm[(k*num_batches+j)*num_elements+idx]*csm[k*num_elements+idx];
+	  }
+	}
+
+	REAL tmp = REAL(0);
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  tmp += norm(tmp_v[c*num_elements+idx]);
+	}
+      
+	tmp = 1/std::sqrt(tmp);
+
+      
+	for (unsigned int c=0; c<num_batches; c++){
+	  complext<REAL> res = tmp*tmp_v[c*num_elements+idx];
+	  csm[c*num_elements+idx] = res;
+	}
+      }
+    }
+  }
+
+  // Extract CSM (host side): allocates the output and a scratch buffer, launches extract_csm_kernel and returns the output
+  template<class REAL> __host__ 
+  boost::shared_ptr<cuNDArray<complext<REAL> > > extract_csm(cuNDArray<complext<REAL> > *corrm_in, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    vector<size_t> image_dims;
+
+    for( unsigned int i=0; i<corrm_in->get_number_of_dimensions()-1; i++ ){
+      image_dims.push_back(corrm_in->get_size(i));
+    }
+  
+    // Allocate output; owned by a shared_ptr from the start so it is released if anything below throws
+    boost::shared_ptr<cuNDArray<complext<REAL> > > out( new cuNDArray<complext<REAL> > ); out->create(&image_dims);
+
+    dim3 blockDim(256);
+    dim3 gridDim((unsigned int) std::ceil((double)number_of_elements/blockDim.x));
+
+    /*  
+	if( out != 0x0 )
+	extract_csm_kernel<REAL><<< gridDim, blockDim, number_of_batches*blockDim.x*2*sizeof(complext<REAL>) >>>
+	( corrm_in->get_data_ptr(), out->get_data_ptr(), number_of_batches, number_of_elements );
+    */
+
+    // Temporary buffer (automatic storage, so it is released on every exit path). TODO: use shared memory
+    cuNDArray<complext<REAL> > tmp_v; tmp_v.create(&image_dims);
+
+    // No null checks needed: operator new reports failure by throwing, never by returning 0
+    extract_csm_kernel<REAL><<< gridDim, blockDim >>>
+	( corrm_in->get_data_ptr(), out->get_data_ptr(), number_of_batches, number_of_elements, tmp_v.get_data_ptr() );
+
+    CHECK_FOR_CUDA_ERROR();
+  
+    // tmp_v goes out of scope here, after the error check above
+    return out;
+  }
+
+  // Set reference phase: multiplies every coil value of an element by the conjugate unit phasor of the first coil's phase
+  template<class REAL> __global__ void
+  set_phase_reference_kernel( complext<REAL> *csm, unsigned int num_batches, unsigned int num_elements )
+  {
+    const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if( idx < num_elements ){
+      REAL angle = arg<REAL>(csm[idx]); //Phase of the first coil
+      REAL sin_a, cos_a; gad_sincos( angle, &sin_a, &cos_a );
+
+      complext<REAL> tmp;
+      tmp.vec[0] = cos_a; tmp.vec[1] = sin_a;
+      tmp = conj(tmp); // tmp = cos(angle) - i*sin(angle)
+
+      for( unsigned int c=0; c<num_batches; c++ ){
+	complext<REAL> val = csm[c*num_elements+idx];
+	typedef complext<REAL> T; // NOTE(review): this typedef is not used
+	val = val*tmp;
+	csm[c*num_elements+idx] = val;
+      }
+    }
+  }
+  
+  // Set reference phase (host side): launches set_phase_reference_kernel with one thread per element, modifying csm in place
+  template<class REAL> __host__ 
+  void set_phase_reference(cuNDArray<complext<REAL> > *csm, unsigned int number_of_batches, unsigned int number_of_elements )
+  {
+    dim3 blockDim(128);
+    dim3 gridDim((unsigned int) std::ceil((double)number_of_elements/blockDim.x));
+  
+    set_phase_reference_kernel<REAL><<< gridDim, blockDim >>>( csm->get_data_ptr(), number_of_batches, number_of_elements );
+  
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+
+
+  //
+  // Template instantiation
+  //
+
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,1>(cuNDArray<complext<float> >*, int);
+  template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,2>(cuNDArray<complext<float> >*, int);
+  //template boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,3>(cuNDArray<complext<float> >*, int);
+  //template boost::shared_ptr< cuNDArray<complext<float> > > estimate_b1_map<float,4>(cuNDArray<complext<float> >*, int);
+
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,1>(cuNDArray<complext<double> >*, int);
+  template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,2>(cuNDArray<complext<double> >*, int);
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,3>(cuNDArray<complext<double> >*, int);
+  //template EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<double> > > estimate_b1_map<double,4>(cuNDArray<complext<double> >*, int);
+}
diff --git a/toolboxes/mri/pmri/gpu/b1_map.h b/toolboxes/mri/pmri/gpu/b1_map.h
new file mode 100644
index 0000000..5fb9994
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1_map.h
@@ -0,0 +1,32 @@
+/** \file b1_map.h
+    \brief Utility to estimate b1 maps (MRI coil sensitivities), GPU based. 
+*/
+
+#pragma once
+
+#include "gpupmri_export.h"
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "complext.h"
+
+#include <boost/smart_ptr.hpp>
+
+namespace Gadgetron{
+
+  /** 
+   * \brief Estimate b1 map (coil sensitivities) of single or double precision according to REAL and of dimensionality D.
+   * \param data Reconstructed reference images from the individual coils. Dimensionality is D+1, where the last dimension indexes the coil images.
+   * \param target_coils Denotes the number of target coils. A non-positive value, or a value exceeding the size of dimension D of the data, indicates that sensitivity maps are computed for the full coil image dimension.
+   */
+  template<class REAL, unsigned int D> EXPORTGPUPMRI boost::shared_ptr< cuNDArray<complext<REAL> > >
+  estimate_b1_map( cuNDArray<complext<REAL> > *data, int target_coils = -1 );
+
+  /** 
+   * \brief Estimate b1 map (coil sensitivities) of single or double precision using the NIH Souheil method. Returns false if data has fewer than two dimensions; csm is re-created with the dimensions of data when they differ.
+   * \param data [RO E1 CHA] for single 2D or [RO E1 N CHA] for multiple 2D reconstructed reference images from the individual coils. 
+   */
+  template<class REAL> EXPORTGPUPMRI bool
+  estimate_b1_map_2D_NIH_Souheil( cuNDArray<complext<REAL> >* data, cuNDArray<complext<REAL> >* csm, size_t ks, size_t power,
+                                  cuNDArray<complext<REAL> >& D, cuNDArray<complext<REAL> >& DH_D, 
+                                  cuNDArray<complext<REAL> >& V1, cuNDArray<complext<REAL> >& U1 );
+}
diff --git a/toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu b/toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu
new file mode 100644
index 0000000..d649b4b
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1_map_NIH_Souheil.cu
@@ -0,0 +1,647 @@
+#include "b1_map.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "real_utilities.h"
+#include "real_utilities_device.h"
+#include "complext.h"
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+#include "hoNDArray_fileio.h"
+#include "GPUTimer.h"
+
+#include "CUBLASContextProvider.h"
+#include <cublas_v2.h>
+
+#include <iostream>
+#include <cmath>
+
+using namespace std;
+
+namespace Gadgetron{
+
+    template <class T> int write_cuNDArray_to_disk(cuNDArray<T>* a, const char* filename)
+    {   // Debug helper: copies the device array to the host and writes it to 'filename'; always returns 0.
+        boost::shared_ptr< hoNDArray<T> > host = a->to_host();
+        write_nd_array<T>(host.get(), filename); // element type follows the array instead of a hard-coded complext<float>
+        return 0;
+    }
+
+    extern __shared__ char _shared_mem[];
+
+    //
+    // Main method
+    //
+
+    /**
+     * Estimate coil sensitivity maps on the GPU with the NIH Souheil method.
+     *
+     * Runs five stages in order: assemble_D, computeDH_D, computeV1 (which also
+     * runs the power iterations), computeU1 and extract_csm. Each stage launches
+     * its own kernel(s) and checks for CUDA errors afterwards.
+     *
+     * \param data  Coil images, [RO E1 CHA] or [RO E1 N CHA].
+     * \param csm   Output maps; re-created with the dimensions of data when they
+     *              differ. Also handed to computeV1 as scratch storage.
+     * \param ks    Side length of the square neighbourhood. An even value is
+     *              bumped to the next odd value, exactly as assemble_D does.
+     * \param power Number of power-method iterations.
+     * \param D     Device work buffer filled by assemble_D.
+     * \param DH_D  Device work buffer filled by computeDH_D.
+     * \param V1    Device work buffer filled by computeV1.
+     * \param U1    Device work buffer filled by computeU1.
+     * \return false when data is neither 3- nor 4-dimensional, true otherwise.
+     *
+     * The work buffers are neither resized nor checked here.
+     * NOTE(review): confirm that callers size them for the odd-adjusted ks.
+     */
+    template<class REAL> EXPORTGPUPMRI bool
+    estimate_b1_map_2D_NIH_Souheil( cuNDArray<complext<REAL> >* data, cuNDArray<complext<REAL> >* csm, size_t ks, size_t power, 
+                                    cuNDArray<complext<REAL> >& D, cuNDArray<complext<REAL> >& DH_D, 
+                                    cuNDArray<complext<REAL> >& V1, cuNDArray<complext<REAL> >& U1)
+    {
+        // The stage helpers only derive the channel count for [RO E1 CHA] and
+        // [RO E1 N CHA] inputs; any other rank would leave it uninitialised.
+        const size_t NDim = data->get_number_of_dimensions();
+        if( NDim != 3 && NDim != 4 )
+        {
+            cout << endl << "estimate_b1_map_2D_NIH_Souheil:: dimensionality mismatch." << endl; 
+            return false;
+        }
+
+        // Allocate the output if it does not already match the input shape.
+        if ( !csm->dimensions_equal(data) )
+        {
+            csm->create(data->get_dimensions());
+        }
+
+        // assemble_D rounds an even kernel size up to the next odd value. Apply
+        // the same adjustment here so that every stage works with the same kss.
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        const size_t kss = ks*ks;
+
+        // Stage 1: gather the ks x ks neighbourhood of every pixel into D.
+        assemble_D( data, &D, ks );
+
+        // Stage 2: per pixel, the CHA x CHA products of D with its conjugate.
+        computeDH_D( data, &D, &DH_D, kss );
+
+        // Stage 3: V1 starts as the neighbourhood sum of D and is refined by
+        // 'power' power-method iterations on DH_D; csm serves as scratch.
+        computeV1( data, &D, &DH_D, &V1, csm, power, kss);
+
+        // Stage 4: U1 = D applied to V1, one value per neighbourhood sample.
+        computeU1( data, &D, &V1, &U1, kss);
+
+        // Stage 5: combine the unit-magnitude phase of the summed U1 with
+        // conj(V1) into the final maps.
+        extract_csm( data, &V1, &U1, csm, kss);
+
+        return true;
+    }
+
+    // assemble_D: each thread copies the (2*halfKs+1)^2 neighbourhood of one pixel of one channel from pData into pD.
+    template<class T> __global__ void
+    assemble_D_kernel( T* pData, T* pD, int RO, int E1, int N, int CHA, int kss, int halfKs )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int cha = threadIdx.y;
+
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        // The flattened x index enumerates pixels as ro + e1*RO + n*RO*E1; threadIdx.y selects the channel.
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            // Position of this pixel/channel inside pD; neighbourhood sample 'ind' adds ind*RO*E1*N.
+
+            unsigned int idx2D = cha*RO*E1*kss*N + n*RO*E1 + ro + e1*RO;
+
+            int kro, ke1, de1, dro;
+
+            if ( e1>=halfKs && e1<E1-halfKs && ro>=halfKs && ro<RO-halfKs )
+            {
+                // Interior pixel: the whole neighbourhood lies inside the image, so no wrapping is needed.
+
+                const T* pDataCurr = pData + n*RO*E1 + cha*RO*E1*N;
+
+                int ind=0;
+                for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                {
+                    de1 = e1 + ke1;
+                    for ( kro=-halfKs; kro<=halfKs; kro++ )
+                    {
+                        pD[ind*RO*E1*N + idx2D] = pDataCurr[de1*RO+ro+kro];
+                        // kro is the inner loop, so 'ind' walks the neighbourhood row by row
+                        ind++;
+                    }
+                }
+            }
+            else
+            {
+                // Border pixel: neighbours that fall outside the image wrap around to the opposite edge.
+                const T* pDataCurr = pData + n*RO*E1 + cha*RO*E1*N;
+                int ind=0;
+                for ( ke1=-halfKs; ke1<=halfKs; ke1++ )
+                {
+                    de1 = e1 + ke1;
+                    if ( de1 < 0 ) de1 += E1;
+                    if ( de1 >= E1 ) de1 -= E1;
+
+                    for ( kro=-halfKs; kro<=halfKs; kro++ )
+                    {
+                        dro = ro + kro;
+                        if ( dro < 0 ) dro += RO;
+                        if ( dro >= RO ) dro -= RO;
+
+                        pD[ind*RO*E1*N+ idx2D] = pDataCurr[de1*RO+dro];
+                        ind++;
+                    }
+                }
+            }
+        }
+    }
+
+    template<class T>
+    void assemble_D( cuNDArray<T>* data, cuNDArray<T>* D, size_t ks )
+    {
+        // Host-side launcher for assemble_D_kernel.
+        //   data : coil images, [RO E1 CHA] or [RO E1 N CHA]
+        //   D    : receives ks*ks neighbourhood samples per pixel and channel; it is
+        //          used as given (assumed to be allocated by the caller - TODO confirm)
+        //   ks   : neighbourhood side length; an even value is rounded up to odd
+        // Throws std::runtime_error for any other rank of data.
+        size_t NDim = data->get_number_of_dimensions();
+        if ( NDim != 3 && NDim != 4 )
+        {
+            // N and CHA can only be derived for these two layouts.
+            throw std::runtime_error("assemble_D: data must be [RO E1 CHA] or [RO E1 N CHA].");
+        }
+
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N = ( NDim == 4 ) ? data->get_size(2) : 1;
+        size_t CHA = data->get_size(NDim-1);
+
+        // The kernel walks -halfKs..halfKs around each pixel, i.e. an odd side length.
+        if ( ks%2 != 1 )
+        {
+            ks++;
+        }
+
+        size_t halfKs = ks/2;
+
+        // Setup block/grid dimensions: threadIdx.x enumerates pixels, threadIdx.y channels.
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        dim3 blockDim(((max_blockdim/CHA)/warp_size)*warp_size, CHA);
+
+        if( blockDim.x == 0 )
+        {
+            // Not even one warp of pixels fits next to CHA channels: start from a
+            // warp and halve until the two-dimensional block (x*CHA threads) fits.
+            blockDim.x = warp_size;
+            while ( blockDim.x*CHA > (size_t)max_blockdim && blockDim.x>1 )
+            {
+                blockDim.x /= 2;
+            }
+        }
+
+        // One x-thread per pixel of the RO*E1*N volume, rounded up to whole blocks.
+        dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+        // Invoke kernel
+        assemble_D_kernel<T><<< gridDim, blockDim >>>( data->get_data_ptr(), D->get_data_ptr(), RO, E1, N, CHA, ks*ks, halfKs );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // compute DH_D: thread (x, cha, cha_prime) sums conj(D[cha][k])*D[cha_prime][k] over the kss samples of one pixel, reading pD directly from global memory.
+    template<class T> __global__ void
+    computeDH_D_kernel( T* pD, T* pDH_D, int RO, int E1, int N, int CHA, int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        // DH_D is indexed as [RO E1 N CHA CHA_Prime] (see the store at the end)
+        const unsigned int cha = threadIdx.y;
+        const unsigned int cha_prime = threadIdx.z;
+
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+
+            // every thread computes one element of DH_D for one pixel
+            int k;
+            T v;
+            v = 0;
+            for ( k=0; k<kss; k++ )
+            {
+                v += conj(pD[cha*RO*E1*N*kss + k*RO*E1*N + idx])*pD[cha_prime*RO*E1*N*kss + k*RO*E1*N + idx];
+            }
+
+            pDH_D[cha_prime*RO*E1*N*CHA + cha*RO*E1*N + idx] = v;
+        }
+    }
+
+    // use the shared memory: same result as computeDH_D_kernel, but the kss*CHA samples of each pixel are first staged in shared memory.
+    template<class T> __global__ void
+    computeDH_D_kernel3( T* pD, T* pDH_D, int RO, int E1, int N, int CHA, int kss, int ks, int num )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        // DH_D is indexed as [RO E1 N CHA CHA_Prime]
+        const unsigned int cha = threadIdx.y;
+        const unsigned int cha_prime = threadIdx.z;
+
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+
+        // Threads past the end of the volume do no memory access, but they must still reach the barrier below.
+        const bool inRange = ( ro<RO && e1<E1 && n<N );
+
+        unsigned int idx = ro + e1*RO + n*RO*E1;
+        unsigned int idxShared = threadIdx.x*kss*CHA;
+        T *shared_mem = (T*) _shared_mem;
+
+        if ( inRange && cha_prime == 0 )
+        {
+            unsigned int idxD = idx + cha*RO*E1*N*kss;
+            for ( int k=0; k<kss; k++ )
+            {
+                shared_mem[idxShared + k + cha*kss ] = pD[idxD + k*RO*E1*N ];
+            }
+        }
+
+        // __syncthreads() must be executed by every thread of the block, so it sits outside the bounds check.
+        __syncthreads();
+
+        if ( inRange )
+        {
+            T v = conj(shared_mem[idxShared + cha*kss])*shared_mem[idxShared + cha_prime*kss];
+            for ( int k=1; k<kss; k++ )
+            {
+                v += conj(shared_mem[idxShared + cha*kss + k])*shared_mem[idxShared + cha_prime*kss + k];
+            }
+            pDH_D[cha_prime*RO*E1*N*CHA + cha*RO*E1*N + idx] = v;
+        }
+    }
+
+    template<class T>
+    void computeDH_D( cuNDArray<T>* data, cuNDArray<T>* D, cuNDArray<T>* DH_D, size_t kss )
+    {
+        // Host-side launcher that fills DH_D with, per pixel, the CHA x CHA products
+        // of the neighbourhood samples in D (see computeDH_D_kernel).
+        //   data : coil images, [RO E1 CHA] or [RO E1 N CHA]; only its sizes are read
+        //   D    : output of assemble_D
+        //   kss  : number of neighbourhood samples per pixel
+        // Uses the shared-memory kernel when at least one pixel fits into a block,
+        // otherwise the plain kernel. Throws std::runtime_error for other ranks.
+        size_t NDim = data->get_number_of_dimensions();
+        if ( NDim != 3 && NDim != 4 )
+        {
+            throw std::runtime_error("computeDH_D: data must be [RO E1 CHA] or [RO E1 N CHA].");
+        }
+
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N = ( NDim == 4 ) ? data->get_size(2) : 1;
+        size_t CHA = data->get_size(NDim-1);
+
+        // Setup block/grid dimensions
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        size_t shared_mem_per_block = cudaDeviceManager::Instance()->shared_mem_per_block(cur_device);
+
+        // estimate how many pixels a block can process
+        size_t ks = (size_t)std::sqrt((double)kss);
+
+        // Shared-memory budget: a quarter of the block's shared memory, kss*CHA elements per pixel.
+        size_t numOfPixels = shared_mem_per_block/4/(sizeof(T)*kss*CHA);
+
+        // Thread budget: the block below is numOfPixels x CHA x CHA threads, so that
+        // product (not numOfPixels*ks*CHA) has to stay within max_blockdim.
+        size_t maxPixelsByThreads = (size_t)max_blockdim/(CHA*CHA);
+        if ( numOfPixels > maxPixelsByThreads )
+        {
+            numOfPixels = maxPixelsByThreads;
+        }
+
+        if ( numOfPixels > 0 )
+        {
+            dim3 blockDim(numOfPixels, CHA, CHA);
+
+            dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+            computeDH_D_kernel3<T><<< gridDim, blockDim, numOfPixels*sizeof(T)*kss*CHA >>>( D->get_data_ptr(), DH_D->get_data_ptr(), RO, E1, N, CHA, kss, ks, numOfPixels );
+        }
+        else
+        {
+            // Not a single pixel fits the shared-memory kernel: use the plain kernel.
+            dim3 blockDim(((max_blockdim/(CHA*CHA))/warp_size)*warp_size, CHA, CHA);
+
+            if( blockDim.x == 0 )
+            {
+                blockDim.x = warp_size;
+                while ( blockDim.x*CHA*CHA > max_blockdim && blockDim.x>1 )
+                {
+                    blockDim.x /= 2;
+                }
+            }
+
+            dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+            // Invoke kernel
+            computeDH_D_kernel<T><<< gridDim, blockDim >>>( D->get_data_ptr(), DH_D->get_data_ptr(), RO, E1, N, CHA, kss );
+        }
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // compute V1: thread (x, cha) sums the kss neighbourhood samples of one pixel of channel cha in pD and stores the sum in pV1.
+    template<class T> __global__ void
+    computeV1_kernel( T* pD, T* pV1, int RO, int E1, int N, int CHA, int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int cha = threadIdx.y;
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+        // (n, e1, ro) are recovered from the flattened x index; threadIdx.y selects the channel.
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+            unsigned int idxD = cha*RO*E1*N*kss + idx;
+            // consecutive neighbourhood samples of a pixel are RO*E1*N elements apart in pD
+            T v = 0;
+            for ( int ii=0; ii<kss; ii++ )
+            {
+                v += pD[idxD + ii*RO*E1*N];
+            }
+            pV1[cha*RO*E1*N + idx] = v;
+        }
+    }
+
+    template<class T> __global__ void
+    power_method_kernel( T* pDH_D, T* pV1, T* pV, unsigned int RO, unsigned int E1, unsigned int N, unsigned int CHA, unsigned int kss, unsigned int power )
+    {
+        typedef typename realType<T>::Type REAL;
+        // One thread per pixel: normalises the CHA-vector of that pixel in pV1, then 'power' times replaces it by DH_D*v and renormalises. pV is scratch; kss is unused.
+        const unsigned int ro = blockIdx.x*blockDim.x+threadIdx.x;
+        const unsigned int e1 = blockIdx.y*blockDim.y+threadIdx.y;
+        unsigned int n = blockIdx.z;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int cha;
+
+            unsigned int idx2D = ro + e1*RO + n*RO*E1;
+            // N3D is the distance between two channels of the same pixel
+            unsigned int N3D = RO*E1*N;
+            // Initial normalisation. NOTE(review): nothing guards against v1Norm == 0 - confirm all-zero pixels cannot occur here.
+            REAL v1Norm(0);
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                v1Norm += norm(pV1[cha*N3D + idx2D]);
+            }
+            v1Norm = ::sqrt(v1Norm);
+
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                pV1[cha*N3D + idx2D] /= v1Norm;
+            }
+            // Power iterations
+            unsigned int po;
+            for ( po=0; po<power; po++ )
+            {
+                for( unsigned j=0; j<CHA; j++)
+                {
+                    T v = 0;
+                    for( unsigned int k=0; k<CHA; k++)
+                    {
+                        v += pDH_D[k*CHA*N3D+j*N3D+idx2D]*pV1[k*N3D+idx2D];
+                    }
+                    pV[j*N3D+idx2D] = v;
+                }
+                // copy the product back only after all CHA entries have been computed from the old pV1
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    pV1[cha*N3D + idx2D] = pV[cha*N3D + idx2D];
+                }
+                // renormalise (this time by multiplying with the reciprocal)
+                v1Norm = 0;
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    v1Norm += norm(pV1[cha*N3D + idx2D]);
+                }
+                v1Norm = 1/std::sqrt(v1Norm);
+
+                for ( cha=0; cha<CHA; cha++ )
+                {
+                    pV1[cha*N3D + idx2D] *= v1Norm;
+                }
+            }
+        }
+    }
+
+    template<class T>
+    void computeV1( cuNDArray<T>* data, cuNDArray<T>* D, cuNDArray<T>* DH_D, cuNDArray<T>* V1, cuNDArray<T>* V, int power, int kss)
+    {
+        // Host-side launcher: fills V1 with the neighbourhood sums of D (computeV1_kernel)
+        // and then refines it with 'power' power-method iterations on DH_D
+        // (power_method_kernel). V is overwritten as scratch storage.
+        // data is [RO E1 CHA] or [RO E1 N CHA]; only its sizes are read.
+        // Throws std::runtime_error for other ranks or when CHA exceeds the block limit.
+        size_t NDim = data->get_number_of_dimensions();
+        if ( NDim != 3 && NDim != 4 )
+        {
+            throw std::runtime_error("computeV1: data must be [RO E1 CHA] or [RO E1 N CHA].");
+        }
+
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N = ( NDim == 4 ) ? data->get_size(2) : 1;
+        size_t CHA = data->get_size(NDim-1);
+
+        // Setup block/grid dimensions
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        dim3 blockDim(((max_blockdim/CHA)/warp_size)*warp_size, CHA);
+
+        if( blockDim.x == 0 )
+        {
+            GADGET_ERROR_MSG("blockDim.x == 0");
+            throw std::runtime_error("computeV1: dimension exceeds device capacity.");
+        }
+
+        dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+        // Invoke kernel
+        computeV1_kernel<T><<< gridDim, blockDim >>>( D->get_data_ptr(), V1->get_data_ptr(), RO, E1, N, CHA, kss );
+
+        // power method
+        // (one thread per pixel in 16x16 tiles over RO x E1, one grid layer per n)
+        dim3 blockDim2(16, 16);
+        dim3 gridDim2((RO+blockDim2.x-1)/blockDim2.x, (E1+blockDim2.y-1)/blockDim2.y, N);
+
+        power_method_kernel<T><<< gridDim2, blockDim2 >>>( DH_D->get_data_ptr(), V1->get_data_ptr(), V->get_data_ptr(), RO, E1, N, CHA, kss, power );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // compute U1: thread (x, k) computes, for one pixel and neighbourhood sample k, the sum over channels of D[cha][k]*V1[cha] and stores it in pU1.
+    template<class T> __global__ void
+    computeU1_kernel( T* pD, T* pV1, T* pU1, int RO, int E1, int N, int CHA, int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int k = threadIdx.y;
+        unsigned int n = (blockIdx.x*blockDim.x + threadIdx.x)/(RO*E1);
+        unsigned int e1 = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)/RO;
+        unsigned int ro = (blockIdx.x*blockDim.x + threadIdx.x - n*RO*E1)%RO;
+        // (n, e1, ro) are recovered from the flattened x index; threadIdx.y selects the neighbourhood sample.
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+            unsigned int idxD = k*RO*E1*N + idx;
+            // consecutive channels are kss*RO*E1*N elements apart in pD and RO*E1*N apart in pV1
+            T v = 0;
+            for ( int ii=0; ii<CHA; ii++ )
+            {
+                v += pD[idxD + ii*kss*RO*E1*N] * pV1[ii*RO*E1*N+idx];
+            }
+            pU1[k*RO*E1*N + idx] = v;
+        }
+    }
+
+    template<class T>
+    void computeU1( cuNDArray<T>* data, cuNDArray<T>* D, cuNDArray<T>* V1, cuNDArray<T>* U1, int kss)
+    {
+        // Host-side launcher for computeU1_kernel.
+        //   data : coil images, [RO E1 CHA] or [RO E1 N CHA]; only its sizes are read
+        //   D    : output of assemble_D
+        //   V1   : output of computeV1
+        //   U1   : receives kss values per pixel; used as given (assumed to be
+        //          allocated by the caller - TODO confirm)
+        //   kss  : number of neighbourhood samples per pixel
+        // Throws std::runtime_error for any other rank of data.
+        size_t NDim = data->get_number_of_dimensions();
+        if ( NDim != 3 && NDim != 4 )
+        {
+            throw std::runtime_error("computeU1: data must be [RO E1 CHA] or [RO E1 N CHA].");
+        }
+
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N = ( NDim == 4 ) ? data->get_size(2) : 1;
+        size_t CHA = data->get_size(NDim-1);
+
+        // Setup block/grid dimensions
+        // (threadIdx.x enumerates pixels, threadIdx.y the kss neighbourhood samples)
+        int cur_device = cudaDeviceManager::Instance()->getCurrentDevice();
+        int warp_size = cudaDeviceManager::Instance()->warp_size(cur_device);
+        int max_blockdim = cudaDeviceManager::Instance()->max_blockdim(cur_device);
+        dim3 blockDim(((max_blockdim/kss)/warp_size)*warp_size, kss);
+
+        if( blockDim.x == 0 )
+        {
+            // Not even one warp of pixels fits next to kss samples: start from a
+            // warp and halve until the block (x*kss threads) fits, down to x == 1.
+            blockDim.x = warp_size;
+            while ( blockDim.x*kss > max_blockdim && blockDim.x>1 )
+            {
+                blockDim.x /= 2;
+            }
+        }
+
+        // One x-thread per pixel of the RO*E1*N volume, rounded up to whole blocks.
+        dim3 gridDim((RO*E1*N+blockDim.x-1)/blockDim.x);
+
+        // Invoke kernel
+        computeU1_kernel<T><<< gridDim, blockDim >>>( D->get_data_ptr(), V1->get_data_ptr(), U1->get_data_ptr(), RO, E1, N, CHA, kss );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    // extract the csm: one thread per pixel writes pCSM[cha] = phase * conj(pV1[cha]), where phase is the sum of the kss values in pU1 divided by its magnitude.
+    template<class T> __global__ void
+    extract_csm_kernel( T* pV1, T* pU1, T* pCSM, unsigned int RO, unsigned int E1, unsigned int N, unsigned int CHA, unsigned int kss )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        const unsigned int ro = blockIdx.x*blockDim.x+threadIdx.x;
+        const unsigned int e1 = blockIdx.y*blockDim.y+threadIdx.y;
+        unsigned int n = blockIdx.z;
+
+        if( ro<RO && e1<E1 && n<N )
+        {
+            unsigned int cha;
+            unsigned int idx = ro + e1*RO + n*RO*E1;
+            // Sum U1 over the neighbourhood. NOTE(review): the division below has no guard against a zero sum - confirm that cannot occur.
+            T phaseU1 = pU1[idx];
+            for ( int po=1; po<kss; po++ )
+            {
+                phaseU1 += pU1[idx + po*RO*E1*N];
+            }
+            phaseU1 /= abs(phaseU1);
+
+            // put the mean object phase to coil map
+            for ( cha=0; cha<CHA; cha++ )
+            {
+                pCSM[cha*RO*E1*N+idx] = phaseU1 * conj(pV1[cha*RO*E1*N+idx]);
+            }
+        }
+    }
+
+    template<class T>
+    void extract_csm( cuNDArray<T>* data, cuNDArray<T>* V1, cuNDArray<T>* U1, cuNDArray<T>* csm, int kss)
+    {
+        // Host-side launcher for extract_csm_kernel: writes the final maps to csm
+        // from V1 (output of computeV1) and U1 (output of computeU1).
+        // data is [RO E1 CHA] or [RO E1 N CHA]; only its sizes are read.
+        // kss is the number of neighbourhood samples per pixel held in U1.
+        // Throws std::runtime_error for any other rank of data.
+        size_t NDim = data->get_number_of_dimensions();
+        if ( NDim != 3 && NDim != 4 )
+        {
+            throw std::runtime_error("extract_csm: data must be [RO E1 CHA] or [RO E1 N CHA].");
+        }
+
+        size_t RO = data->get_size(0);
+        size_t E1 = data->get_size(1);
+        size_t N = ( NDim == 4 ) ? data->get_size(2) : 1;
+        size_t CHA = data->get_size(NDim-1);
+
+        // Setup block/grid dimensions
+        // (one thread per pixel in 16x16 tiles over RO x E1, one grid layer per n)
+        dim3 blockDim(16, 16);
+        dim3 gridDim((RO+blockDim.x-1)/blockDim.x, (E1+blockDim.y-1)/blockDim.y, N);
+
+        extract_csm_kernel<T><<< gridDim, blockDim >>>( V1->get_data_ptr(), U1->get_data_ptr(), csm->get_data_ptr(), RO, E1, N, CHA, kss );
+
+        CHECK_FOR_CUDA_ERROR();
+    }
+
+    //
+    // Template instantiation
+    //
+    template EXPORTGPUPMRI bool estimate_b1_map_2D_NIH_Souheil<float>( cuNDArray<complext<float> >* data, cuNDArray<complext<float> >* csm, size_t ks, size_t power,
+                                    cuNDArray<complext<float> >& D, cuNDArray<complext<float> >& DH_D, 
+                                    cuNDArray<complext<float> >& V1, cuNDArray<complext<float> >& U1 );
+}
diff --git a/toolboxes/mri/pmri/gpu/b1map_test.cu b/toolboxes/mri/pmri/gpu/b1map_test.cu
new file mode 100644
index 0000000..0a77192
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/b1map_test.cu
@@ -0,0 +1,48 @@
+#include "b1_map.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDArray.h"
+#include "ndarray_vector_td_utilities.hcu"
+#include "NFFT.h"
+#include "check_CUDA.h"
+
+#include <cutil.h>
+#include <iostream>
+
+using namespace std;
+using namespace Gadgetron;
+int main( int argc, char** argv) 
+{ // Stand-alone timing test: reads coil images from disk, runs estimate_b1_map<float,2> on the GPU and writes the result to csm.cplx.
+  hoNDArray<float_complext::Type> host_data = 
+    read_nd_array<float_complext::Type>("b1_mapping_data/coil_images.cplx");
+  // Alternative input data set:
+  //hoNDArray<float_complext::Type> host_data = 
+  //read_nd_array<float_complext::Type>("b1_mapping_data/5ch.cplx");
+  
+  if( host_data.get_number_of_dimensions() != 3 ){
+    printf("\nInput data is not three-dimensional (a series of images). Quitting!\n");
+    exit(1);
+  }
+  
+  // Copy the image data to the device
+  cuNDArray<float_complext::Type> device_data(host_data);
+  
+  unsigned int timer; cutCreateTimer(&timer); double time;
+  printf("\nComputing CSM..."); fflush(stdout);
+  cutResetTimer( timer ); cutStartTimer( timer );
+  
+  // Compute CSM
+  boost::shared_ptr< cuNDArray<float_complext::Type> > csm = estimate_b1_map<float,2>( &device_data );
+  
+  cudaThreadSynchronize(); cutStopTimer( timer );
+  time = cutGetTimerValue( timer ); printf("done: %.1f ms.", time ); fflush(stdout);
+
+  // Output result
+  // NOTE(review): elsewhere in this patch to_host() is assigned to a boost::shared_ptr< hoNDArray<T> > - confirm this test still builds against the current API.
+  hoNDArray<float_complext::Type> host_csm = csm->to_host();
+  write_nd_array<float_complext::Type>( host_csm, "csm.cplx" );
+  // Terminate the progress line; the format string takes no arguments.
+  printf("\n"); fflush(stdout);
+
+  CHECK_FOR_CUDA_ERROR();
+  return 0;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu
new file mode 100644
index 0000000..af98ead
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.cu
@@ -0,0 +1,133 @@
+#include "cuCartesianSenseOperator.h"
+#include "cuNDFFT.h"
+
+#include <sstream>
+
+using namespace Gadgetron;
+
+// Gather: for every sample s and coil c, out[s + c*samples] += in[idx[s] + c*image_elements]. One thread per sample.
+template<class REAL> __global__ void 
+sample_array_kernel( complext<REAL> *in, complext<REAL> *out, unsigned int *idx, 
+		     unsigned int image_elements, unsigned int samples, unsigned int coils )
+{
+  const unsigned int sample = blockIdx.x*blockDim.x+threadIdx.x;
+  if (sample >= samples) return;   // surplus threads of the last block have nothing to do
+  const unsigned int pixel = idx[sample];   // image position this sample is read from
+  for (unsigned int coil = 0; coil < coils; coil++) {
+    complext<REAL> &dst = out[sample + coil*samples];
+    const complext<REAL> &src = in[pixel + coil*image_elements];
+    dst.vec[0] += src.vec[0];
+    dst.vec[1] += src.vec[1];
+  }
+}
+
+// Scatter: for every sample s and coil c, out[idx[s] + c*image_elements] += in[s + c*samples]. One thread per sample; the += is not atomic.
+template<class REAL> __global__ void 
+insert_samples_kernel( complext<REAL> *in, complext<REAL> *out, unsigned int *idx, 
+		       unsigned int image_elements, unsigned int samples, unsigned int coils )
+{
+  const unsigned int sample = blockIdx.x*blockDim.x+threadIdx.x;
+  if (sample >= samples) return;   // surplus threads of the last block have nothing to do
+  const unsigned int pixel = idx[sample];   // image position this sample is written to
+  for (unsigned int coil = 0; coil < coils; coil++) {
+    complext<REAL> &dst = out[pixel + coil*image_elements];
+    const complext<REAL> &src = in[sample + coil*samples];
+    dst.vec[0] += src.vec[0];
+    dst.vec[1] += src.vec[1];
+  }
+}
+
+template<class REAL, unsigned int D> void
+cuCartesianSenseOperator<REAL,D>::mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate )
+{
+  // Forward operator: image (domain shape) -> samples at the positions listed in idx_ (codomain shape).
+  const bool dims_ok = in->dimensions_equal(this->get_domain_dimensions().get()) &&
+    out->dimensions_equal(this->get_codomain_dimensions().get());
+  if (!dims_ok) {
+    throw std::runtime_error("cuCartesianSenseOperator::mult_M dimensions mismatch");
+  }
+
+  // One image per coil, produced from the input by mult_csm.
+  std::vector<size_t> coil_dims = *this->get_domain_dimensions();
+  coil_dims.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > coil_images(&coil_dims);
+  this->mult_csm(in,&coil_images);
+
+  // Fourier transform along every image dimension, but not along the coil dimension.
+  std::vector<size_t> transform_dims;
+  for (unsigned int d = 0; d < this->get_domain_dimensions()->size(); d++) {
+    transform_dims.push_back(d);
+  }
+  cuNDFFT<REAL>::instance()->fft(&coil_images, &transform_dims);
+
+  if (!accumulate) clear(out);
+
+  // One thread per sample; the kernel adds the sampled values onto out.
+  const size_t num_samples = idx_->get_number_of_elements();
+  dim3 blockDim(512,1,1);
+  dim3 gridDim((unsigned int) std::ceil((double)num_samples/blockDim.x), 1, 1 );
+  sample_array_kernel<REAL><<< gridDim, blockDim >>>( coil_images.get_data_ptr(), out->get_data_ptr(), idx_->get_data_ptr(),
+						      in->get_number_of_elements(), num_samples, this->ncoils_);
+  const cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    throw cuda_error(std::string("cuCartesianSenseOperator::mult_M : Unable to sample data: ") + cudaGetErrorString(err));
+  }
+}
+
+template<class REAL, unsigned int D> void
+cuCartesianSenseOperator<REAL,D>::mult_MH(cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate)
+{
+  // Reverse direction of mult_M: samples (codomain shape) -> image (domain shape).
+  if (!(out->dimensions_equal(this->get_domain_dimensions().get())) || 
+      !(in->dimensions_equal(this->get_codomain_dimensions().get())) ) {
+    throw std::runtime_error( "cuCartesianSenseOperator::mult_MH dimensions mismatch");
+  }
+
+  // Zero-filled array with one image per coil; the samples are inserted into it below.
+  std::vector<size_t> tmp_dimensions = *this->get_domain_dimensions();
+  tmp_dimensions.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > tmp(&tmp_dimensions);
+  clear(&tmp);
+
+  // One thread per sample adds its value to the image position given by idx_.
+  dim3 blockDim(512,1,1);
+  dim3 gridDim((unsigned int) std::ceil((double)idx_->get_number_of_elements()/blockDim.x), 1, 1 );
+  insert_samples_kernel<REAL><<< gridDim, blockDim >>>( in->get_data_ptr(), tmp.get_data_ptr(),
+							idx_->get_data_ptr(),out->get_number_of_elements(),
+							idx_->get_number_of_elements(), this->ncoils_);
+  
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    std::stringstream ss;
+    ss << "cuCartesianSenseOperator::mult_MH : Unable to insert samples into array: " <<
+      cudaGetErrorString(err);
+    throw cuda_error(ss.str());
+  }
+
+  // Inverse Fourier transform along every image dimension, but not along the coil dimension.
+  std::vector<size_t> ft_dims;
+  for (unsigned int i = 0; i < this->get_domain_dimensions()->size(); i++) {
+    ft_dims.push_back(i);
+  }
+  cuNDFFT<REAL>::instance()->ifft(&tmp, &ft_dims);
+
+  if (!accumulate) 
+    clear(out);
+
+  // mult_csm_conj_sum (defined in the base class, not shown here) reduces the per-coil images into out.
+  this->mult_csm_conj_sum(&tmp,out);
+}
+
+//
+// Instantiations
+//
+
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,1>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,2>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,3>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<float,4>;
+
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,1>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,2>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,3>;
+template class EXPORTGPUPMRI cuCartesianSenseOperator<double,4>;
+
diff --git a/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h
new file mode 100644
index 0000000..51741b4
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuCartesianSenseOperator.h
@@ -0,0 +1,39 @@
+/** \file cuCartesianSenseOperator.h
+    \brief Cartesian Sense operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuSenseOperator.h"
+
+namespace Gadgetron{
+  // SENSE operator for Cartesian sampling: the sample positions are given as a list of indices (idx_) into the image.
+  template<class REAL, unsigned int D> class EXPORTGPUPMRI cuCartesianSenseOperator : public cuSenseOperator<REAL,D>
+  {
+  public:
+    
+    cuCartesianSenseOperator() : cuSenseOperator<REAL,D>() {}
+    virtual ~cuCartesianSenseOperator() {}
+    // Defined in cuCartesianSenseOperator.cu. Both clear 'out' first unless accumulate is true.
+    virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false);
+    virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false);
+    // Stores the sampling indices and sets the codomain to [number of indices, ncoils_], using ncoils_ as it is at the time of the call. A null pointer is ignored.
+    virtual void set_sampling_indices( boost::shared_ptr< cuNDArray<unsigned int> > idx) 
+    {
+      if (idx.get()) {
+	idx_ = idx;
+	std::vector<size_t> tmp_dims;
+	tmp_dims.push_back(idx_->get_number_of_elements());
+	tmp_dims.push_back(this->ncoils_);
+	this->set_codomain_dimensions(&tmp_dims);
+      }
+    }
+    // Delegates to linearOperator::clone with this object.
+    virtual boost::shared_ptr< linearOperator<  cuNDArray< complext<REAL>  > > > clone(){
+      return linearOperator< cuNDArray<complext<REAL> > >::clone(this);
+    }
+    
+  protected:
+    boost::shared_ptr< cuNDArray<unsigned int> > idx_; // sampling indices set by set_sampling_indices(); shared with the caller
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu
new file mode 100644
index 0000000..72a98b9
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.cu
@@ -0,0 +1,41 @@
+#include "cuNonCartesianKtSenseOperator.h"
+#include "cuNDFFT.h"
+
+using namespace Gadgetron;
+
+template<class REAL, unsigned int D> void
+cuNonCartesianKtSenseOperator<REAL,D>::mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{
+  // Forward operator: an FFT (called with D as its second argument) followed by the non-Cartesian SENSE mult_M.
+  if( accumulate )
+    throw std::runtime_error( "cuNonCartesianKtSenseOperator::mult_M: accumulation not supported");
+
+  // The FFT works in place, so transform a private copy and leave the caller's input untouched.
+  cuNDArray< complext<REAL> > transformed(*in);
+  cuNDFFT<REAL>::instance()->fft( &transformed, D );
+
+  cuNonCartesianSenseOperator<REAL,D>::mult_M( &transformed, out, accumulate );
+}
+
+template<class REAL, unsigned int D> void
+cuNonCartesianKtSenseOperator<REAL,D>::mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{
+  // Reverse of mult_M: the non-Cartesian SENSE mult_MH followed by an in-place inverse FFT of the result.
+  if( accumulate )
+    throw std::runtime_error( "cuNonCartesianKtSenseOperator::mult_MH: accumulation not supported");
+
+  cuNonCartesianSenseOperator<REAL,D>::mult_MH( in, out, accumulate );
+  cuNDFFT<REAL>::instance()->ifft( out, D );
+}
+
+//
+// Instantiations
+//
+
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<float,2>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<float,3>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<float,4>;
+
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<double,2>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<double,3>;
+template class EXPORTGPUPMRI cuNonCartesianKtSenseOperator<double,4>;
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.h b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.h
new file mode 100644
index 0000000..c6b29c0
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianKtSenseOperator.h
@@ -0,0 +1,30 @@
+/** \file cuNonCartesianKtSenseOperator.h
+    \brief Non-Cartesian kt-Sense operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNonCartesianSenseOperator.h"
+
+namespace Gadgetron{
+  // Non-Cartesian kt-SENSE operator: wraps cuNonCartesianSenseOperator and adds an FFT step (see cuNonCartesianKtSenseOperator.cu).
+  template<class REAL, unsigned int D>
+  class EXPORTGPUPMRI cuNonCartesianKtSenseOperator : public cuNonCartesianSenseOperator<REAL,D>
+  {
+    
+  public:
+    // Convenience names for the D-dimensional index and coordinate vector types
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+    
+    cuNonCartesianKtSenseOperator() : cuNonCartesianSenseOperator<REAL,D>() {}
+    virtual ~cuNonCartesianKtSenseOperator() {}
+    // Both throw std::runtime_error when accumulate is true.
+    virtual void mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+    virtual void mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+    // Delegates to linearOperator::clone with this object.
+    virtual boost::shared_ptr< linearOperator<cuNDArray< complext<REAL>  > > > clone(){
+      return linearOperator< cuNDArray<complext<REAL> > >::clone(this);
+    }  
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.cu
new file mode 100644
index 0000000..ee7b209
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.cu
@@ -0,0 +1,116 @@
+#include "cuNonCartesianSenseOperator.h"
+#include "vector_td_utilities.h"
+
+using namespace Gadgetron;
+
+/*
+static unsigned int prodv( std::vector<unsigned int> &vec )
+{
+  unsigned int result = 1;
+  for( unsigned int i=0; i<vec.size(); i++ ){
+    result *= vec[i];
+  }
+  return result;
+  }*/
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{ // Forward operator: mult_csm() followed by a forward (C2NC) NFFT. Throws std::runtime_error on a null in/out.
+  if( !in || !out ){
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_M : 0x0 input/output not accepted");
+  }
+  /*  
+  if( (in->get_number_of_elements() != prodv(*this->get_domain_dimensions())) ||
+      (out->get_number_of_elements() != prodv(*this->get_codomain_dimensions())) ) {
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_M: dimensions mismatch");
+    }*/
+  // Scratch array shaped like the operator domain with the coil count appended as the last dimension
+  std::vector<size_t> full_dimensions = *this->get_domain_dimensions();
+  full_dimensions.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > tmp(&full_dimensions);  
+  this->mult_csm( in, &tmp );
+  
+  // Forwards NFFT
+  // When accumulating, transform into a temporary and add it to 'out'; otherwise 'out' is written directly
+  if( accumulate ){
+    cuNDArray< complext<REAL> > tmp_out(out->get_dimensions());
+    plan_->compute( &tmp, &tmp_out, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_FORWARDS_C2NC );
+    *out += tmp_out;
+  }
+  else
+    plan_->compute( &tmp, out, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_FORWARDS_C2NC );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate )
+{ // Adjoint operator: backward (NC2C) NFFT followed by mult_csm_conj_sum(). Throws std::runtime_error on a null in/out.
+  if( !in || !out ){
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_MH : 0x0 input/output not accepted");
+  }
+  /*  
+  if( (out->get_number_of_elements() != prodv(*this->get_domain_dimensions())) ||
+      (in->get_number_of_elements() != prodv(*this->get_codomain_dimensions())) ) {
+    throw std::runtime_error("cuNonCartesianSenseOperator::mult_MH: dimensions mismatch");
+    }*/
+  // Scratch array shaped like the operator domain with the coil count appended as the last dimension
+  std::vector<size_t> tmp_dimensions = *this->get_domain_dimensions();
+  tmp_dimensions.push_back(this->ncoils_);
+  cuNDArray< complext<REAL> > tmp(&tmp_dimensions);
+  
+  // Do the NFFT
+  plan_->compute( in, &tmp, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_BACKWARDS_NC2C );
+  // 'out' is zeroed unless accumulating -- presumably mult_csm_conj_sum() adds into 'out' (TODO confirm)
+  if( !accumulate ){
+    clear(out);    
+  }
+  
+  this->mult_csm_conj_sum( &tmp, out );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W )
+{  
+  plan_->setup( matrix_size, matrix_size_os, W );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::preprocess( cuNDArray<_reald> *trajectory ) 
+{
+  if( !trajectory ){
+    throw std::runtime_error( "cuNonCartesianSenseOperator: cannot preprocess 0x0 trajectory.");
+  }
+  // The domain dimensions have to be set on the operator before a trajectory can be preprocessed
+  boost::shared_ptr< std::vector<size_t> > dims = this->get_domain_dimensions();
+  if( !dims || dims->empty() ){
+    throw std::runtime_error("cuNonCartesianSenseOperator::preprocess : operator domain dimensions not set");
+  }
+  plan_->preprocess( trajectory, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_PREP_ALL );
+  is_preprocessed_ = true;
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+cuNonCartesianSenseOperator<REAL,D,ATOMICS>::set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw ) 
+{
+  dcw_ = dcw;  
+}
+
+//
+// Instantiations
+//
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,1,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,1,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,2,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,2,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,3,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,3,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,4,true>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<float,4,false>;
+
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,1,false>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,2,false>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,3,false>;
+template class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,4,false>;
diff --git a/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.h b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.h
new file mode 100644
index 0000000..ba0bd2f
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuNonCartesianSenseOperator.h
@@ -0,0 +1,50 @@
+/** \file cuNonCartesianSenseOperator.h
+    \brief Non-Cartesian Sense operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuSenseOperator.h"
+#include "cuNFFT.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS = false> class EXPORTGPUPMRI cuNonCartesianSenseOperator : public cuSenseOperator<REAL,D>
+  {
+  
+  public:
+  
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+
+    cuNonCartesianSenseOperator() : cuSenseOperator<REAL,D>() { 
+      plan_ = boost::shared_ptr< cuNFFT_plan<REAL, D, ATOMICS> >( new cuNFFT_plan<REAL, D, ATOMICS>() );
+      is_preprocessed_ = false;
+    }
+    
+    virtual ~cuNonCartesianSenseOperator() {}
+    
+    inline boost::shared_ptr< cuNFFT_plan<REAL, D, ATOMICS> > get_plan() { return plan_; }
+    inline boost::shared_ptr< cuNDArray<REAL> > get_dcw() { return dcw_; }
+    inline bool is_preprocessed() { return is_preprocessed_; } 
+
+    virtual void mult_M( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+    virtual void mult_MH( cuNDArray< complext<REAL> >* in, cuNDArray< complext<REAL> >* out, bool accumulate = false );
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W );
+    virtual void preprocess( cuNDArray<_reald> *trajectory );
+    virtual void set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw );
+
+    virtual boost::shared_ptr< linearOperator<cuNDArray< complext<REAL>  > > > clone(){
+      return linearOperator< cuNDArray<complext<REAL> > >::clone(this);
+    }
+  
+  protected:
+    boost::shared_ptr< cuNFFT_plan<REAL, D, ATOMICS> > plan_;
+    boost::shared_ptr< cuNDArray<REAL> > dcw_;
+    bool is_preprocessed_;
+  };
+  
+  //Atomics can't be used with doubles
+  template<unsigned int D> class EXPORTGPUPMRI cuNonCartesianSenseOperator<double,D,true>{};
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp b/toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp
new file mode 100644
index 0000000..b8a49cb
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBuffer.cpp
@@ -0,0 +1,224 @@
+#include "cuSenseBuffer.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray_utils.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  cuSenseBuffer<REAL,D,ATOMICS>::cuSenseBuffer() 
+  {
+    num_coils_ = 0;
+    cur_idx_ = cur_sub_idx_ = 0;
+    cycle_length_ = 0; sub_cycle_length_ = 0;
+    acc_buffer_empty_ = true;
+    Gadgetron::clear(matrix_size_);
+    Gadgetron::clear(matrix_size_os_);
+    W_ = REAL(0);
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBuffer<REAL,D,ATOMICS>::clear()
+  {
+    Gadgetron::clear(&acc_buffer_);
+    Gadgetron::clear(&cyc_buffer_);
+
+    cur_idx_ = cur_sub_idx_ = 0;
+    acc_buffer_empty_ = true;
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBuffer<REAL,D,ATOMICS>::
+  setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+	 unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles )
+  { // (Re)configure the buffer; operator, NFFT plan and buffers are rebuilt only if a parameter they depend on changed
+    bool matrix_size_changed = !(matrix_size_ == matrix_size);
+    bool matrix_size_os_changed = !(matrix_size_os_ == matrix_size_os);
+    bool kernel_changed = !(W_ == W);
+    bool num_coils_changed = !(num_coils_ == num_coils );
+    bool num_cycles_changed = !(cycle_length_ == num_cycles+1);
+    bool is_virgin = (E_.get() == 0x0);
+    //bool num_sub_cycles_changed = !(sub_cycle_length_ == num_sub_cycles);
+    // The comparisons above must precede the assignments below, which overwrite the previous configuration
+    matrix_size_ = matrix_size;
+    matrix_size_os_ = matrix_size_os;
+    W_ = W;
+    num_coils_ = num_coils;
+    cycle_length_ = num_cycles+1; // +1 as we need a "working buffer" in addition to 'num_cycles' full ones
+    sub_cycle_length_ = num_sub_cycles;
+
+    std::vector<size_t> dims = to_std_vector(matrix_size_);
+    
+    if( is_virgin )
+      E_ = boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D,ATOMICS> >(new cuNonCartesianSenseOperator<REAL,D,ATOMICS>);
+    
+    if( is_virgin || matrix_size_changed || matrix_size_os_changed || kernel_changed ){
+      E_->set_domain_dimensions(&dims);
+      E_->setup( matrix_size_, matrix_size_os_, W );
+      nfft_plan_.setup( matrix_size_, matrix_size_os_, W );
+    }
+    // The buffers are sized by the oversampled matrix size with the coil count appended
+    dims = to_std_vector(matrix_size_os_);    
+    dims.push_back(num_coils_);
+
+    if( acc_buffer_.get_number_of_elements() == 0 || matrix_size_os_changed || num_coils_changed || num_cycles_changed ){
+      acc_buffer_.create(&dims);
+      Gadgetron::clear( &acc_buffer_ );
+    }
+
+    dims.push_back(cycle_length_);
+    if( cyc_buffer_.get_number_of_elements() == 0 || matrix_size_os_changed || num_coils_changed || num_cycles_changed ){
+      // The cyclic buffer holds cycle_length_ frames, so a changed cycle count requires a new allocation as well.
+      // Restart the bookkeeping so that cur_idx_ cannot point past the end of the new (empty) buffer.
+      cyc_buffer_.create(&dims);      
+      Gadgetron::clear( &cyc_buffer_);
+      cur_idx_ = cur_sub_idx_ = 0;
+      acc_buffer_empty_ = true;
+    }
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS> 
+  bool cuSenseBuffer<REAL,D,ATOMICS>::add_frame_data( cuNDArray<_complext> *samples, cuNDArray<_reald> *trajectory )
+  {
+    if( !samples || !trajectory ){
+      throw std::runtime_error("cuSenseBuffer::add_frame_data: illegal input pointer");
+    }
+
+    if( num_coils_ != samples->get_size(samples->get_number_of_dimensions()-1) ){
+      throw std::runtime_error("cuSenseBuffer::add_frame_data: unexpected number of coils according to setup");
+    }
+
+    if( dcw_.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBuffer::density compensation weights not set");
+    }
+    
+    // Make array containing the "current" buffer from the cyclic buffer
+    //
+
+    cuNDArray<_complext> cur_buffer(acc_buffer_.get_dimensions().get(),
+				    cyc_buffer_.get_data_ptr()+cur_idx_*acc_buffer_.get_number_of_elements());
+
+    // Preprocess frame
+    //
+
+    nfft_plan_.preprocess( trajectory, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_PREP_NC2C );
+    
+    // Convolve to form k-space frame (accumulation mode)
+    //
+    
+    nfft_plan_.convolve( samples, &cur_buffer, dcw_.get(), cuNFFT_plan<REAL,D,ATOMICS>::NFFT_CONV_NC2C, true );
+
+    // Update the accumulation buffer (if it is time...)
+    //
+
+    bool cycle_completed = false;
+
+    if( cur_sub_idx_ == sub_cycle_length_-1 ){
+
+      cycle_completed = true;
+      
+      // Buffer complete, add to accumulation buffer
+      //
+
+      acc_buffer_ += cur_buffer;
+      acc_buffer_empty_ = false;
+
+      // Start filling the next buffer in the cycle ...
+      //
+
+      cur_idx_++; 
+      if( cur_idx_ == cycle_length_ ) cur_idx_ = 0;
+
+      // ... but first subtract this next buffer from the accumulation buffer
+      //
+
+      cur_buffer.create( acc_buffer_.get_dimensions().get(), cyc_buffer_.get_data_ptr()+cur_idx_*acc_buffer_.get_number_of_elements() );
+      acc_buffer_ -= cur_buffer;
+
+      // Clear new buffer before refilling
+      //
+
+      Gadgetron::clear(&cur_buffer);
+    }
+
+    cur_sub_idx_++;
+    if( cur_sub_idx_ == sub_cycle_length_ ) cur_sub_idx_ = 0;
+
+    return cycle_completed;
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSenseBuffer<REAL,D,ATOMICS>::get_accumulated_coil_images()
+  {
+    std::vector<size_t> dims = to_std_vector(matrix_size_);
+    dims.push_back(num_coils_);
+
+    acc_image_ = boost::shared_ptr< cuNDArray<_complext> >( new cuNDArray<_complext>(&dims) );
+				    
+    // Check if we are ready to reconstruct. If not return an image of ones...
+    if( acc_buffer_empty_ ){
+      fill(acc_image_.get(),_complext(1));
+      return acc_image_;
+    }
+
+    // Finalize gridding of k-space CSM image (convolution has been done already)
+    //
+
+    // Copy accumulation buffer before in-place FFT
+    cuNDArray<_complext> acc_copy = acc_buffer_;
+
+    // FFT
+    nfft_plan_.fft( &acc_copy, cuNFFT_plan<REAL,D,ATOMICS>::NFFT_BACKWARDS );
+    
+    // Deapodize
+    nfft_plan_.deapodize( &acc_copy );
+    
+    // Remove oversampling
+    crop<_complext,D>( (matrix_size_os_-matrix_size_)>>1, &acc_copy, acc_image_.get() );
+    
+    //if( normalize ){
+    //REAL scale = REAL(1)/(((REAL)cycle_length_-REAL(1))*(REAL)sub_cycle_length_);
+    //*acc_image_ *= scale;
+    //}
+    
+    return acc_image_;
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSenseBuffer<REAL,D,ATOMICS>::get_combined_coil_image()
+  { // Combine the accumulated coil images into one image via E_->mult_csm_conj_sum(); throws if no csm is set
+    if( csm_.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBuffer::get_combined_coil_image: csm not set");
+    }
+    // The coil images are computed here only if none are cached; an existing acc_image_ is reused as is
+    if( acc_image_.get() == 0x0 ){
+      if( get_accumulated_coil_images().get() == 0x0 ){ // This updates acc_image_
+	throw std::runtime_error("cuSenseBuffer::get_combined_coil_image: unable to acquire accumulated coil images");
+      }
+    }
+    // The result is sized by matrix_size_ alone (no coil dimension is appended)
+    std::vector<size_t> dims = to_std_vector(matrix_size_);
+    boost::shared_ptr< cuNDArray<_complext> > image( new cuNDArray<_complext>(&dims) );
+    // NOTE(review): 'image' is not cleared before use -- confirm mult_csm_conj_sum() overwrites rather than accumulates
+    E_->set_csm(csm_);
+    E_->mult_csm_conj_sum( acc_image_.get(), image.get() );
+
+    return image;
+  }
+
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUPMRI cuSenseBuffer<float,2,true>;
+  template class EXPORTGPUPMRI cuSenseBuffer<float,2,false>;
+
+  template class EXPORTGPUPMRI cuSenseBuffer<float,3,true>;
+  template class EXPORTGPUPMRI cuSenseBuffer<float,3,false>;
+
+  template class EXPORTGPUPMRI cuSenseBuffer<float,4,true>;
+  template class EXPORTGPUPMRI cuSenseBuffer<float,4,false>;
+
+  template class EXPORTGPUPMRI cuSenseBuffer<double,2,false>;
+  template class EXPORTGPUPMRI cuSenseBuffer<double,3,false>;
+  template class EXPORTGPUPMRI cuSenseBuffer<double,4,false>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBuffer.h b/toolboxes/mri/pmri/gpu/cuSenseBuffer.h
new file mode 100644
index 0000000..aeffb99
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBuffer.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "cuNonCartesianSenseOperator.h"
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include "gpupmri_export.h"
+
+namespace Gadgetron{
+  
+  template<class REAL, unsigned int D, bool ATOMICS = false> class EXPORTGPUPMRI cuSenseBuffer
+  {
+  public:
+    
+    typedef complext<REAL> _complext;
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+
+    cuSenseBuffer();
+    virtual ~cuSenseBuffer() {}
+    
+    virtual void set_csm( boost::shared_ptr< cuNDArray<_complext> > csm ){
+      csm_ = csm;
+    }
+
+    virtual void set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw ){
+      dcw_ = dcw;
+    }
+    
+    inline REAL get_normalization_factor(){
+      return REAL(1)/(((REAL)cycle_length_-REAL(1))*(REAL)sub_cycle_length_);
+    }
+    
+    virtual void clear();
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+			unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles );
+
+    // Boolean return value indicates whether the accumulation buffer has changed (i.e. a cycle has been completed)
+    virtual bool add_frame_data( cuNDArray<_complext> *samples, cuNDArray<_reald> *trajectory ); 
+
+    virtual boost::shared_ptr< cuNDArray<_complext> > get_accumulated_coil_images();
+    virtual boost::shared_ptr< cuNDArray<_complext> > get_combined_coil_image();
+    
+  protected:
+    _uint64d matrix_size_, matrix_size_os_;
+    REAL W_;
+    unsigned int num_coils_;
+    unsigned int cycle_length_, sub_cycle_length_;
+    unsigned int cur_idx_, cur_sub_idx_;
+    bool acc_buffer_empty_;
+    cuNDArray<_complext> acc_buffer_, cyc_buffer_;
+    boost::shared_ptr< cuNDArray<_complext> > acc_image_;
+    boost::shared_ptr< cuNDArray<_complext> > csm_;
+    boost::shared_ptr< cuNDArray<REAL> > dcw_;
+    boost::shared_ptr< cuNonCartesianSenseOperator<REAL,D,ATOMICS> > E_;
+    cuNFFT_plan<REAL,D,ATOMICS> nfft_plan_;
+  };
+  
+  // To prevent the use of atomics with doubles.
+  template<unsigned int D> class EXPORTGPUPMRI cuSenseBuffer<double,D,true>{};
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp
new file mode 100644
index 0000000..46688c8
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.cpp
@@ -0,0 +1,89 @@
+#include "cuSenseBufferCg.h"
+#include "vector_td_utilities.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_reductions.h"
+#include "cuNDArray_elemwise.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBufferCg<REAL,D,ATOMICS>::
+  setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+	 unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles )
+  { // Run the base-class setup, then wire the CG solver to the encoding operator E_ and a preconditioner
+    cuSenseBuffer<REAL,D,ATOMICS>::setup( matrix_size, matrix_size_os, W, num_coils, num_cycles, num_sub_cycles );
+    // A new preconditioner object is created on every call; its weights are set in get_combined_coil_image()
+    D_ = boost::shared_ptr< cuCgPreconditioner<_complext> >( new cuCgPreconditioner<_complext>() );
+    // Solver configuration: at most 2 iterations, tolerance 1e-6, silent output mode
+    cg_.set_encoding_operator( this->E_ );
+    cg_.set_preconditioner( D_ );    
+    cg_.set_max_iterations( 2 );
+    cg_.set_tc_tolerance( 1e-6 );
+    cg_.set_output_mode( cuCgSolver<_complext>::OUTPUT_SILENT);    
+  }
+  
+  template<class REAL, unsigned int D, bool ATOMICS>
+  void cuSenseBufferCg<REAL,D,ATOMICS>::preprocess( cuNDArray<_reald> *traj ) {
+    this->E_->preprocess(traj); // throws on a null trajectory, so 'traj' can be dereferenced below
+    std::vector<size_t> dims = *traj->get_dimensions();
+    dims.push_back(this->num_coils_); // codomain = trajectory dimensions with the coil count appended
+    this->E_->set_codomain_dimensions(&dims);
+  }
+
+  template<class REAL, unsigned int D, bool ATOMICS>
+  boost::shared_ptr< cuNDArray<complext<REAL> > > cuSenseBufferCg<REAL,D,ATOMICS>::get_combined_coil_image()
+  {
+    // Some validity checks
+    //
+
+    if( this->csm_.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBufferCg::get_combined_coil_image: csm not set");
+    }
+
+    if( !this->E_->is_preprocessed() ){
+      throw std::runtime_error("cuSenseBufferCg::get_combined_coil_image: preprocessing not performed");
+    }
+    
+    // Compute (and scale) rhs
+    //
+
+    boost::shared_ptr< cuNDArray<_complext> > rhs = cuSenseBuffer<REAL,D,ATOMICS>::get_combined_coil_image();
+
+    if( rhs.get() == 0x0 ){
+      throw std::runtime_error("cuSenseBufferCg::get_combined_coil_image: failed to compute rhs");
+    }
+    
+    *rhs *= this->get_normalization_factor();
+
+    // Define preconditioning weights
+    //
+
+    boost::shared_ptr< cuNDArray<REAL> > _precon_weights = sum(abs_square(this->csm_.get()).get(), D);
+    reciprocal_sqrt_inplace(_precon_weights.get());	
+    boost::shared_ptr< cuNDArray<_complext> > precon_weights = real_to_complex<_complext>( _precon_weights.get() );
+    _precon_weights.reset();
+    D_->set_weights( precon_weights ); 
+
+    // Solve
+    //
+
+    return cg_.solve_from_rhs(rhs.get());
+  }
+  
+  //
+  // Instantiations
+  //
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,2,true>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,2,false>;
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,3,true>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,3,false>;
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,4,true>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<float,4,false>;
+
+  template class EXPORTGPUPMRI cuSenseBufferCg<double,2,false>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<double,3,false>;
+  template class EXPORTGPUPMRI cuSenseBufferCg<double,4,false>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseBufferCg.h b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.h
new file mode 100644
index 0000000..2733815
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseBufferCg.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "cuSenseBuffer.h"
+#include "cuCgSolver.h"
+#include "cuCgPreconditioner.h"
+
+namespace Gadgetron{
+  
+  template<class REAL, unsigned int D, bool ATOMICS = false> 
+  class EXPORTGPUPMRI cuSenseBufferCg : public cuSenseBuffer<REAL,D,ATOMICS> 
+  {
+  public:
+    
+    typedef complext<REAL> _complext;
+    typedef typename uint64d<D>::Type _uint64d;
+    typedef typename reald<REAL,D>::Type _reald;
+
+    cuSenseBufferCg() : cuSenseBuffer<REAL,D,ATOMICS>() {}
+    virtual ~cuSenseBufferCg() {}
+
+    inline void set_dcw_for_rhs( boost::shared_ptr< cuNDArray<REAL> > dcw ){
+      this->E_->set_dcw(dcw);
+    }
+    
+    virtual void preprocess( cuNDArray<_reald> *traj );
+
+    virtual void setup( _uint64d matrix_size, _uint64d matrix_size_os, REAL W, 
+			unsigned int num_coils, unsigned int num_cycles, unsigned int num_sub_cycles );
+    
+    virtual boost::shared_ptr< cuNDArray<_complext> > get_combined_coil_image();
+    
+  protected:    
+    cuCgSolver<_complext> cg_;
+    boost::shared_ptr< cuCgPreconditioner<_complext> > D_;
+  };
+  
+  // To prevent the use of atomics with doubles.
+  template<unsigned int D> class EXPORTGPUPMRI cuSenseBufferCg<double,D,true>{};
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseOperator.cu b/toolboxes/mri/pmri/gpu/cuSenseOperator.cu
new file mode 100644
index 0000000..27a989d
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseOperator.cu
@@ -0,0 +1,32 @@
+#include "cuSenseOperator.h"
+#include "sense_utilities.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D> void
+  cuSenseOperator<REAL,D>::mult_csm( cuNDArray<complext<REAL> >* in, cuNDArray<complext<REAL> >* out )
+  {  
+    csm_mult_M<REAL,D>( in, out, this->csm_.get() );
+  }
+  
+  template<class REAL, unsigned int D> void
+  cuSenseOperator<REAL,D>::mult_csm_conj_sum( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out )
+  {
+    csm_mult_MH<REAL,D>( in, out, this->csm_.get() );
+  }
+  
+  //
+  // Instantiations
+  //
+  
+  template class EXPORTGPUPMRI cuSenseOperator<float,1>;
+  template class EXPORTGPUPMRI cuSenseOperator<float,2>;
+  template class EXPORTGPUPMRI cuSenseOperator<float,3>;
+  template class EXPORTGPUPMRI cuSenseOperator<float,4>;
+
+  template class EXPORTGPUPMRI cuSenseOperator<double,1>;
+  template class EXPORTGPUPMRI cuSenseOperator<double,2>;
+  template class EXPORTGPUPMRI cuSenseOperator<double,3>;
+  template class EXPORTGPUPMRI cuSenseOperator<double,4>;
+}
diff --git a/toolboxes/mri/pmri/gpu/cuSenseOperator.h b/toolboxes/mri/pmri/gpu/cuSenseOperator.h
new file mode 100644
index 0000000..34b13b5
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/cuSenseOperator.h
@@ -0,0 +1,31 @@
+/** \file cuSenseOperator.h
+    \brief Base class for the GPU based Sense operators
+*/
+
+#pragma once
+
+#include "senseOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "vector_td.h"
+#include "complext.h"
+#include "gpupmri_export.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D> class EXPORTGPUPMRI cuSenseOperator : public senseOperator< cuNDArray< complext<REAL> >, D >
+  {
+    
+  public:
+    
+    cuSenseOperator() : senseOperator<cuNDArray< complext<REAL> >,D >() {}
+    virtual ~cuSenseOperator() {}
+        
+    virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false ) = 0;
+    virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false ) = 0;
+    
+    virtual void mult_csm( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out );
+    virtual void mult_csm_conj_sum( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out );    
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/gpupmri_export.h b/toolboxes/mri/pmri/gpu/gpupmri_export.h
new file mode 100644
index 0000000..a66fa32
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/gpupmri_export.h
@@ -0,0 +1,19 @@
+/** \file gpupmri_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUPMRI_EXPORT_H_
+#define GPUPMRI_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUPMRI__) || defined (gpuparallelmri_EXPORTS)
+#define EXPORTGPUPMRI __declspec(dllexport)
+#else
+#define EXPORTGPUPMRI __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUPMRI
+#endif
+
+
+#endif /* GPUPMRI_EXPORT_H_ */
diff --git a/toolboxes/mri/pmri/gpu/htgrappa.cu b/toolboxes/mri/pmri/gpu/htgrappa.cu
new file mode 100644
index 0000000..a1bd753
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa.cu
@@ -0,0 +1,827 @@
+#include "htgrappa.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDFFT.h"
+#include "GadgetronTimer.h"
+#include "GPUTimer.h"
+
+#include "CUBLASContextProvider.h"
+
+#include <cublas_v2.h>
+#include <cula_lapack_device.h>
+#include <iostream>
+
+namespace Gadgetron {
+
+  // Packs vec[0], vec[1] into an int2 (x, y); returns (0,0) after logging to stderr when vec has fewer than two entries.
+  int2 vec_to_int2(std::vector<unsigned int> vec)
+  {
+    int2 ret; ret.x = 0; ret.y = 0;
+    if (vec.size() < 2) {
+      std::cerr << "vec_to_int2 dimensions of vector too small" << std::endl;
+      return ret;
+    }
+    ret.x = vec[0]; ret.y = vec[1];
+    return ret;
+  }
+
+  // Kernel: writes complext<float>(0) to in[0 .. elements-1], one thread per entry.
+  __global__ void clear_array(complext<float> * in, unsigned long int elements)
+  {
+    const unsigned long tid = blockIdx.x*blockDim.x+threadIdx.x;
+    if (tid >= elements) return;   // threads beyond the end of the array have nothing to do
+    in[tid] = complext<float>(0);
+  }
+
+  // Zero-fills a device array using the clear_array kernel. Returns 0 on success (an empty array included), -1 on a CUDA error.
+  int clear(cuNDArray<complext<float> >* in)
+  {
+    const size_t n = in->get_number_of_elements();
+    if (n == 0) return 0; // nothing to clear; launching with a zero-sized grid would be reported as an error
+    dim3 blockDim(512,1,1);
+    dim3 gridDim((unsigned int) ceil((double)n/blockDim.x), 1, 1 );
+    clear_array<<< gridDim, blockDim >>>( in->get_data_ptr(), n);
+    cudaError_t err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      std::cerr << "clear : Error during kernel call: " << cudaGetErrorString(err) << std::endl;
+      return -1;
+    }
+    return 0;
+  }
+
+  template <class T> int write_cuNDArray_to_disk(cuNDArray<T>* a, const char* filename)
+  { // Debug helper: copies the device array `a` to the host and writes it to `filename`; always returns 0.
+    boost::shared_ptr< hoNDArray<T> > host = a->to_host();
+    write_nd_array<T>(host.get(), filename); // element type follows T instead of being fixed to complext<float>
+    return 0;
+  }
+
+  template <class T> __global__ void form_grappa_system_matrix_kernel_2d(T* ref_data,
+                                                                         int2 dims,
+                                                                         int source_coils,
+                                                                         int target_coils,
+                                                                         int2 ros,
+                                                                         int2 ros_offset,
+                                                                         int2 kernel_size,
+                                                                         int acceleration_factor,
+                                                                         int set_number,
+                                                                         T* out_matrix,
+                                                                         T* b)
+  { // Fills out_matrix and b for one set_number by gathering samples from ref_data (dims.x*dims.y elements per coil).
+    long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+    int klocations = ros.x*ros.y;
+    int image_elements = dims.x*dims.y;
+    // One thread per location of the ros.x-by-ros.y region; threads with idx_in >= klocations do nothing.
+    if (idx_in < klocations) {
+      // Split the linear index into (x, y) with y varying fastest
+      // (the x-fastest variant, y = idx_in/ros.x and x = idx_in - y*ros.x, is not used).
+      unsigned int x = idx_in/ros.y;
+      unsigned int y = idx_in - x*ros.y;
+      unsigned int idx_ref = 0;
+      unsigned int coeff_counter = 0;
+
+      int kernel_size_x = kernel_size.x;
+      int kernel_size_y = kernel_size.y;
+      // Per source coil: kernel rows ky are spaced acceleration_factor apart (start shifted by set_number+1); kx covers every column.
+      for (int c = 0; c < source_coils; c++) {
+        for (int ky = -((kernel_size_y*acceleration_factor)>>1)+set_number+1;
+             ky < ((kernel_size_y*acceleration_factor+1)>>1); ky+=acceleration_factor) {
+          for (int kx = -(kernel_size_x>>1); kx < ((kernel_size_x+1)>>1); kx++) {
+            idx_ref = c*image_elements + x+kx+ros_offset.x + (y+ky+ros_offset.y)*dims.x;
+            // Stored at row idx_in, column coeff_counter, with klocations elements between consecutive columns.
+            out_matrix[idx_in+(coeff_counter++)*klocations] = ref_data[idx_ref];
+
+          }
+        }
+      }
+
+      //Loop over target coils here
+      for (unsigned int c = 0; c < target_coils; c++) {
+        // Right-hand side: the sample of coil c at (x, y) shifted by ros_offset, stored with the same row/column layout.
+        b[idx_in + c*klocations] = ref_data[c*image_elements + (y+ros_offset.y)*dims.x+(x+ros_offset.x)];
+      }
+    }
+  }
+
+  //TODO: This should take source and target coils into consideration
+  template <class T> __global__ void copy_grappa_coefficients_to_kernel_2d(T* coeffs,
+                                                                           T* kernel,
+                                                                           int source_coils,
+                                                                           int target_coils,
+                                                                           int2 kernel_size,
+                                                                           int acceleration_factor,
+                                                                           int set)
+  { // Copies the coefficients of one set from coeffs into their rows of the combined kernel array.
+    unsigned long idx_in = blockIdx.x*blockDim.x+threadIdx.x;
+    // One thread per coefficient of this set; threads beyond coefficients_in_set do nothing.
+    unsigned int coefficients_in_set = source_coils*kernel_size.x*kernel_size.y*target_coils;
+    // idx_in is unpacked below as (kx, ky, coil, coilg) with kx varying fastest; idx_in_tmp keeps the flat index into coeffs.
+    if (idx_in < coefficients_in_set) {
+      int idx_in_tmp = idx_in;
+      int kx = idx_in%kernel_size.x;
+      idx_in = (idx_in-kx)/kernel_size.x;
+      int ky = idx_in%kernel_size.y;
+      idx_in = (idx_in-ky)/kernel_size.y;
+      int coil = idx_in%source_coils;
+      idx_in = (idx_in-coil)/source_coils;
+      int coilg = idx_in;
+      // Destination layout: [coilg][coil][row][kx] with kernel_size.y*acceleration_factor rows; this set uses row ky*acceleration_factor+set+1.
+      kernel[coilg*source_coils*(kernel_size.y*acceleration_factor)*kernel_size.x +
+             coil*(kernel_size.y*acceleration_factor)*kernel_size.x +
+             (ky*acceleration_factor + set + 1)*kernel_size.x + kx] = coeffs[idx_in_tmp];
+      // Once per coilg (set 0, kx == ky == 0, coil == coilg): set vec[0] (presumably the real part) of the centre element to 1.
+      if ((coil == coilg) && (kx == 0) && (ky == 0) && (set == 0)) {
+        kernel[coilg*source_coils*(kernel_size.y*acceleration_factor)*kernel_size.x +
+               coil*(kernel_size.y*acceleration_factor)*kernel_size.x +
+               ((kernel_size.y>>1)*acceleration_factor)*kernel_size.x + (kernel_size.x>>1) ].vec[0] = 1;
+
+      }
+    }
+  }
+
+  // Kernel: scatters kernel_size.x*kernel_size.y*coils kernel elements into `out` (dims.x*dims.y elements per coil),
+  // mirrored in both directions and centred on (dims.x>>1, dims.y>>1). One thread per kernel element.
+  template <class T> __global__ void copy_grappa_kernel_to_kspace_2d(T* kernel,
+                                                                     T* out,
+                                                                     int2 dims,
+                                                                     int2 kernel_size,
+                                                                     int coils)
+  {
+    const unsigned long tid = blockIdx.x*blockDim.x+threadIdx.x;
+    if (tid >= kernel_size.x*kernel_size.y*coils) return;
+    // Unpack the flat index as (kx, ky, coil) with kx varying fastest.
+    unsigned long rest = tid;
+    const int kx = rest%kernel_size.x;
+    rest /= kernel_size.x;
+    const int ky = rest%kernel_size.y;
+    rest /= kernel_size.y;
+    const int coil = rest;
+    const int src = tid;
+    //Flipping the kernel for conv
+    const int outx = (dims.x>>1) - (kx - (kernel_size.x>>1));
+    const int outy = (dims.y>>1) - (ky - (kernel_size.y>>1));
+    out[coil*dims.x*dims.y + outy*dims.x + outx] = kernel[src];
+  }
+
+  // Kernel: for every coil c, out[c*elements + i] += scale_factor * unmixing[c*elements + i] * conj(csm[i]).
+  // One thread per position i; csm holds a single value per position that is shared by all coils.
+  __global__ void scale_and_add_unmixing_coeffs(complext<float> * unmixing,
+                                                complext<float> * csm,
+                                                complext<float> * out,
+                                                int elements,
+                                                int coils,
+                                                float scale_factor)
+  {
+    const unsigned long tid = blockIdx.x*blockDim.x+threadIdx.x;
+    if (tid >= elements) return;
+    // The conjugated csm value does not depend on the coil, so read it once.
+    const complext<float> csm_conj = conj(csm[tid]);
+    for (int coil = 0; coil < coils; ++coil) {
+      const unsigned long off = coil*elements + tid;
+      out[off] += scale_factor*(unmixing[off]*csm_conj);
+    }
+  }
+
+  // Kernel: for every coil c, out[c*elements + i] = scale_factor * unmixing[c*elements + i].
+  // One thread per position i; the previous content of out is overwritten.
+  __global__ void scale_and_copy_unmixing_coeffs(complext<float> * unmixing,
+                                                 complext<float> * out,
+                                                 int elements,
+                                                 int coils,
+                                                 float scale_factor)
+  {
+    const unsigned long tid = blockIdx.x*blockDim.x+threadIdx.x;
+    if (tid >= elements) return;
+    for (int coil = 0; coil < coils; ++coil) {
+      const unsigned long off = coil*elements + tid;
+      out[off] = scale_factor*unmixing[off];
+    }
+  }
+
+  // Kernel: out[i] = conj(csm[i]) for i < target_elements and out[i] = 0 for target_elements <= i < source_elements.
+  // One thread per output element; threads at or beyond source_elements do nothing.
+  __global__ void conj_csm_coeffs(complext<float> * csm,
+                                  complext<float> * out,
+                                  int source_elements,
+                                  int target_elements)
+  {
+    //TODO: Here we need to have both src_elements and target_elements and we use conj(csm) for all target_elements and 0.0 when element > target_elements
+    const unsigned long tid = blockIdx.x*blockDim.x+threadIdx.x;
+    if (tid >= source_elements) return;
+    // csm is only read below target_elements; the rest of the output range is zero-filled.
+    if (tid < target_elements) {
+      out[tid] = conj(csm[tid]);
+    } else {
+      out[tid] = complext<float> (0.0,0.0);
+    }
+  }
+
+  // Kernel: writes (1,0) to the elements_per_channel entries of channel channel_no inside out.
+  // One thread per entry; nothing outside that channel is touched.
+  __global__ void single_channel_coeffs(complext<float> * out,
+                                        int channel_no,
+                                        int elements_per_channel)
+  {
+    const unsigned long tid = blockIdx.x*blockDim.x+threadIdx.x;
+    if (tid >= elements_per_channel) return;
+    out[tid + channel_no*elements_per_channel] = complext<float>(1.0,0.0);
+  }
+
+
+  template <class T> int htgrappa_calculate_grappa_unmixing(cuNDArray<T>* ref_data,
+                                                            cuNDArray<T>* b1,
+                                                            unsigned int acceleration_factor,
+                                                            std::vector<unsigned int>* kernel_size,
+                                                            cuNDArray<T>* out_mixing_coeff,
+                                                            std::vector< std::pair<unsigned int, unsigned int> >* sampled_region,
+                                                            std::list< unsigned int >* uncombined_channels)
+  {
+    // Computes unmixing coefficients from ref_data and b1 into out_mixing_coeff (2D only). Returns 0 on success, -1 on any failure.
+    if (ref_data->get_number_of_dimensions() != b1->get_number_of_dimensions()) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: Dimensions mismatch" << std::endl;
+      return -1;
+    }
+
+    for (unsigned int i = 0; i < (ref_data->get_number_of_dimensions()-1); i++) {
+      if (ref_data->get_size(i) != b1->get_size(i)) {
+        std::cerr << "htgrappa_calculate_grappa_unmixing: Dimensions mismatch" << std::endl;
+        return -1;
+      }
+    }
+
+    unsigned int source_coils = ref_data->get_size(ref_data->get_number_of_dimensions()-1);
+    unsigned int target_coils = b1->get_size(b1->get_number_of_dimensions()-1);
+    unsigned int elements_per_coil = b1->get_number_of_elements()/target_coils;
+
+    if (target_coils > source_coils) {
+      std::cerr << "target_coils > source_coils" << std::endl;
+      return -1;
+    }
+    // Without acceleration no kernel is estimated: the first output set is conj(b1), zero-padded up to source_coils.
+    if (acceleration_factor == 1) {
+      dim3 blockDim(512,1,1);
+      dim3 gridDim((unsigned int) ceil((1.0f*elements_per_coil*source_coils)/blockDim.x), 1, 1 );
+      if (clear(out_mixing_coeff) < 0) return -1; // zero every output set first (as the accelerated path does); the launch below only covers the first set
+      conj_csm_coeffs<<< gridDim, blockDim >>>( b1->get_data_ptr(),
+                                                out_mixing_coeff->get_data_ptr(),
+                                                out_mixing_coeff->get_number_of_elements(),
+                                                b1->get_number_of_elements());
+
+      if (!uncombined_channels) return 0; // optional argument (defaults to 0 in htgrappa.h): no uncombined sets requested
+      gridDim = dim3((unsigned int) ceil((1.0f*(elements_per_coil))/blockDim.x), 1, 1 );
+      int uncombined_channel_no = 0;
+      for (std::list<unsigned int>::iterator it = uncombined_channels->begin(); it != uncombined_channels->end(); it++ ) {
+        uncombined_channel_no++;
+        //TODO: Adjust pointers to reflect that number of target/source may not be equal
+        single_channel_coeffs<<< gridDim, blockDim >>>( out_mixing_coeff->get_data_ptr() + uncombined_channel_no*source_coils*elements_per_coil,
+                                                        *it,
+                                                        (elements_per_coil));
+      }
+      return 0;
+    }
+
+    if (kernel_size->size() != (ref_data->get_number_of_dimensions()-1)) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: Kernel size does not match the data dimensions" << std::endl;
+      return -1;
+    }
+
+    if (ref_data->get_number_of_dimensions() > 3) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: Not yet implemented for 3D" << std::endl;
+      return -1;
+    }
+
+    //Calculate region of support + offsets
+    std::vector<size_t> rosTmp = *ref_data->get_dimensions();
+
+    std::vector<unsigned int> ros(rosTmp.size());
+    for ( unsigned int ii=0; ii<rosTmp.size(); ii++ )
+      {
+        ros[ii ] = rosTmp[ii];
+      }
+
+    ros.pop_back(); //Remove the number of coils
+    std::vector<unsigned int> ros_offset(ref_data->get_number_of_dimensions(),0);
+    unsigned long int kspace_locations = 1;
+
+    if (sampled_region) {
+      for (unsigned int i = 0; i < ros.size(); i++) {
+        if (i > 0) {
+          ros[i] = (*sampled_region)[i].second-(*sampled_region)[i].first-((*kernel_size)[i]*acceleration_factor);
+        } else {
+          ros[i] = (*sampled_region)[i].second-(*sampled_region)[i].first-(*kernel_size)[i];
+        }
+        ros_offset[i] = (*sampled_region)[i].first+(((*sampled_region)[i].second-(*sampled_region)[i].first-ros[i])>>1);
+        kspace_locations *= ros[i];
+      }
+    } else {
+      for (unsigned int i = 0; i < ros.size(); i++) {
+        if (i > 0) {
+          ros[i] -= ((*kernel_size)[i]*acceleration_factor);
+        } else {
+          ros[i] -= (*kernel_size)[i];
+        }
+        ros_offset[i] = (ref_data->get_size(i)-ros[i])>>1;
+        kspace_locations *= ros[i];
+      }
+    }
+
+    /*
+      for (unsigned int i = 0; i < ros.size(); i++) {
+      std::cout << "ROS[" << i << "] = " << ros[i] << " + " << ros_offset[i] << std::endl;
+      }
+    */
+    // The system matrix has one row per k-space location and one column per (source coil, kernel point); b has one column per target coil.
+    std::vector<size_t> sys_matrix_size;
+    sys_matrix_size.push_back(kspace_locations);
+    sys_matrix_size.push_back(source_coils*(*kernel_size)[0]*(*kernel_size)[1]);
+
+    std::vector<size_t> b_size;
+    b_size.push_back(kspace_locations);
+    b_size.push_back(target_coils);
+
+    cuNDArray<T> system_matrix = cuNDArray<T>(&sys_matrix_size);
+
+    clear(&system_matrix);
+
+    cuNDArray<T> b = cuNDArray<T>(&b_size);
+
+    boost::shared_ptr< std::vector<size_t> > dimTmp = ref_data->get_dimensions();
+    std::vector<unsigned int> dimInt(2, 0);
+    dimInt[0] = (*dimTmp)[0];
+    dimInt[1] = (*dimTmp)[1];
+
+    int2 dims = vec_to_int2(dimInt);
+    int2 dros = vec_to_int2(ros);
+    int2 dros_offset = vec_to_int2(ros_offset);
+    int2 dkernel_size = vec_to_int2(*kernel_size);
+
+    //TODO: Use source coils here
+    int n = source_coils*(*kernel_size)[0]*(*kernel_size)[1];
+    int m = kspace_locations;
+
+    std::vector<size_t> AHA_dims(2,n);
+    cuNDArray<T> AHA = cuNDArray<T>(&AHA_dims);
+    cuNDArray<T> AHA_set0 = cuNDArray<T>(&AHA_dims);
+
+    hoNDArray<T> AHA_host(n, n);
+    float2* pAHA = (float2*) AHA_host.get_data_ptr();
+
+    //TODO: Use target coils here
+    std::vector<size_t> AHrhs_dims;
+    AHrhs_dims.push_back(n);
+    AHrhs_dims.push_back(target_coils);
+
+    cuNDArray<T> AHrhs = cuNDArray<T>(&AHrhs_dims);
+
+    cublasHandle_t handle = *CUBLASContextProvider::instance()->getCublasHandle();
+    /*
+      if (cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS) {
+      std::cerr << "htgrappa_calculate_grappa_unmixing: unable to create cublas handle" << std::endl;
+      return -1;
+
+      }
+    */
+    // gkernel collects the coefficients of all sets: kernel x by (kernel y * acceleration_factor) by source_coils by target_coils.
+    std::vector<size_t> gkernel_dims;
+    gkernel_dims.push_back((*kernel_size)[0]);
+    gkernel_dims.push_back((*kernel_size)[1]*acceleration_factor);
+    gkernel_dims.push_back(source_coils);
+    gkernel_dims.push_back(target_coils);
+    cuNDArray<T> gkernel = cuNDArray<T>(&gkernel_dims);
+    clear(&gkernel);
+
+    //GadgetronTimer timer;
+    // One fit per set; the loop covers sets 0 .. acceleration_factor-2.
+    for (unsigned int set = 0; set < acceleration_factor-1; set++)
+      {
+        //std::cout << "Calculating coefficients for set " << set << std::endl;
+
+        //std::cout << "dros.x = " << dros.x << ", dros.y = " << dros.y << std::endl;
+
+        //std::ostringstream ostr;            // file-name prefix, only needed by the disabled debug dumps below
+        //ostr << "Set_" << set << "_";
+        //std::string appendix = ostr.str();
+
+        dim3 blockDim(512,1,1);
+        dim3 gridDim((unsigned int) ceil((1.0f*kspace_locations)/blockDim.x), 1, 1 );
+
+        form_grappa_system_matrix_kernel_2d<<< gridDim, blockDim >>>( ref_data->get_data_ptr(), dims,
+                                                                      source_coils, target_coils, dros, dros_offset,
+                                                                      dkernel_size, acceleration_factor, set,
+                                                                      system_matrix.get_data_ptr(),
+                                                                      b.get_data_ptr());
+
+        cudaError_t err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Unable to form system matrix: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"A.cplx";
+		    //write_cuNDArray_to_disk(&system_matrix, filename.c_str());
+        //  }
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"b.cplx";
+		    //write_cuNDArray_to_disk(&b, filename.c_str());
+        //  }
+
+        complext<float>  alpha = complext<float>(1);
+        complext<float>  beta = complext<float>(0);
+
+        cublasStatus_t stat;
+        // A^H A is formed and regularized for set 0 only; later sets start from a copy of that matrix.
+        if ( set == 0 )
+        {
+            {
+                //GPUTimer t2("Cgemm call");
+                stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                                                n,n,m,(float2*) &alpha,
+                                                (float2*) system_matrix.get_data_ptr(), m,
+                                                (float2*) system_matrix.get_data_ptr(), m,
+                                                (float2*) &beta, (float2*) AHA.get_data_ptr(), n);
+
+                if (stat != CUBLAS_STATUS_SUCCESS) {
+                    std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to form AHA product using cublas gemm" << std::endl;
+                    std::cerr << "---- cublas error code " << stat << std::endl;
+                    return -1;
+                }
+            }
+
+            {
+                //timer.start("copy AHA to host");
+                if (cudaMemcpy(pAHA, AHA.get_data_ptr(), AHA_host.get_number_of_bytes(), cudaMemcpyDeviceToHost) != cudaSuccess)
+                {
+                    std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to copy AHA to host" << std::endl;
+                    std::cerr << "---- cuda error: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
+                    return -1;
+                }
+                //timer.stop();
+
+                //timer.start("apply the regularization");
+                // apply the regularization
+                double lamda = 0.0005;
+
+                double trA = std::sqrt(pAHA[0].x*pAHA[0].x + pAHA[0].y*pAHA[0].y);
+                size_t c;
+                for ( c=1; c<n; c++ )
+                {
+                    float x = pAHA[c+c*n].x;
+                    float y = pAHA[c+c*n].y;
+                    trA += std::sqrt(x*x+y*y);
+                }
+
+                double value = trA*lamda/n;
+                for ( c=0; c<n; c++ )
+                {
+                    float x = pAHA[c+c*n].x;
+                    float y = pAHA[c+c*n].y;
+                    pAHA[c+c*n].x = std::sqrt(x*x+y*y) + value;
+                    pAHA[c+c*n].y = 0;
+                }
+                //timer.stop();
+
+                //timer.start("copy the AHA to device");
+                if (cudaMemcpy(AHA.get_data_ptr(), pAHA, AHA_host.get_number_of_bytes(), cudaMemcpyHostToDevice) != cudaSuccess)
+                {
+                    std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to copy regularized AHA to device" << std::endl;
+                    std::cerr << "---- cuda error: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
+                    return -1;
+                }
+                //timer.stop();
+            }
+
+            AHA_set0 = AHA;
+        }
+        else
+        {
+            AHA = AHA_set0;
+        }
+
+      //  {
+      //      std::string filename = debugFolder+appendix+"AHA.cplx";
+            //write_cuNDArray_to_disk(&AHA, filename.c_str());
+      //  }
+
+        {
+
+            //GPUTimer timer("GRAPPA cublas gemm");
+            //TODO: Sort out arguments for source and target coils here.
+            stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                    n,target_coils,m,(float2*) &alpha,
+                    (float2*) system_matrix.get_data_ptr(), m,
+                    (float2*) b.get_data_ptr(), m,
+                    (float2*) &beta, (float2*)AHrhs.get_data_ptr(), n);
+
+        }
+
+      //  {
+      //      std::string filename = debugFolder+appendix+"AHrhs.cplx";
+            //write_cuNDArray_to_disk(&AHrhs, filename.c_str());
+      //  }
+
+        if (stat != CUBLAS_STATUS_SUCCESS) {
+            std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to form AHrhs product using cublas gemm" << std::endl;
+            std::cerr << "---- cublas error code " << stat << std::endl;
+            return -1;
+        }
+
+        // Solve AHA * X = AHrhs on the device; the coefficients are read back from AHrhs further down.
+        culaStatus s;
+        /*
+          s = culaInitialize();
+          if(s != culaNoError) {
+          std::cerr << "htgrappa: failed to initialize CULA" << std::endl;
+          return -1;
+          }
+        */
+
+        s = culaDeviceCgels( 'N', n, n, target_coils,
+                             (culaDeviceFloatComplex*)AHA.get_data_ptr(), n,
+                             (culaDeviceFloatComplex*)AHrhs.get_data_ptr(), n);
+
+
+        if (s != culaNoError) {
+          std::cerr << "htgrappa_calculate_grappa_unmixing: linear solve failed" << std::endl;
+          return -1;
+        }
+
+#if 0
+        size_t free = 0, total = 0;
+        cudaMemGetInfo(&free, &total);
+        std::cout << "CUDA Memory: " << free << " (" << total << ")" << std::endl;
+#endif
+        //culaShutdown();
+
+        /*
+          if (cposv_wrapper(&AHA, &AHrhs) < 0) {
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Error calling cgels" << std::endl;
+          return -1;
+          }
+        */
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"AHrhs_solution.cplx";
+		    //write_cuNDArray_to_disk(&AHrhs, filename.c_str());
+        //  }
+
+        gridDim = dim3((unsigned int) ceil((1.0f*n*source_coils)/blockDim.x), 1, 1 );
+
+        //TODO: This should be target coils used as argument here.
+        copy_grappa_coefficients_to_kernel_2d<<< gridDim, blockDim >>>( AHrhs.get_data_ptr(),
+                                                                        gkernel.get_data_ptr(),
+                                                                        source_coils,
+                                                                        target_coils,
+                                                                        dkernel_size,
+                                                                        acceleration_factor,
+                                                                        set);
+
+        //  {
+        //      std::string filename = debugFolder+appendix+"kernel.cplx";
+		    //write_cuNDArray_to_disk(&gkernel, filename.c_str());
+        //  }
+
+        err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Failed to copy calculated coefficients to kernel: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+      }
+
+    //{
+    //    std::string filename = debugFolder+"kernel_all.cplx";
+    //    write_cuNDArray_to_disk(&gkernel, filename.c_str());
+    //}
+
+    //TODO: This should be source coils
+    cuNDArray<T> tmp_mixing = cuNDArray<T>(ref_data->get_dimensions());
+
+    int kernel_elements = gkernel.get_number_of_elements()/target_coils;
+    int total_elements = tmp_mixing.get_number_of_elements()/source_coils;
+    dkernel_size.y *= acceleration_factor;
+
+    std::vector<size_t> ft_dims(2,0);ft_dims[1] = 1;
+    clear(out_mixing_coeff);
+    unsigned int current_uncombined_index = 0;
+    // Per target coil: place its kernel in a full-size grid, inverse FFT it, then accumulate it weighted by conj(b1) of that coil.
+    //TODO: Loop over target coils.
+    for (unsigned int c = 0; c < target_coils; c++)
+      {
+        clear(&tmp_mixing);
+
+        dim3 blockDim(512,1,1);
+        dim3 gridDim((unsigned int) ceil((1.0f*kernel_elements)/blockDim.x), 1, 1 );
+
+        //TODO: Take source and target into consideration
+        copy_grappa_kernel_to_kspace_2d<<< gridDim, blockDim >>>((gkernel.get_data_ptr()+(c*kernel_elements)),
+                                                                 tmp_mixing.get_data_ptr(),
+                                                                 dims,
+                                                                 dkernel_size,
+                                                                 source_coils);
+
+        cudaError_t err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: Unable to pad GRAPPA kernel: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+        cuNDFFT<typename realType<T>::Type>::instance()->ifft(&tmp_mixing, &ft_dims);
+
+        float scale_factor = total_elements;
+
+        gridDim = dim3((unsigned int) ceil(1.0f*total_elements/blockDim.x), 1, 1 );
+        scale_and_add_unmixing_coeffs<<< gridDim, blockDim >>>(tmp_mixing.get_data_ptr(),
+                                                               (b1->get_data_ptr()+ c*total_elements),
+                                                               out_mixing_coeff->get_data_ptr(),
+                                                               total_elements,
+                                                               source_coils,
+                                                               scale_factor);
+        err = cudaGetLastError();
+        if( err != cudaSuccess ){
+          std::cerr << "htgrappa_calculate_grappa_unmixing: scale and add mixing coeffs: " <<
+            cudaGetErrorString(err) << std::endl;
+          return -1;
+        }
+
+        if (uncombined_channels) {
+          std::list<unsigned int>::iterator it = std::find((*uncombined_channels).begin(),(*uncombined_channels).end(),c);
+          if (it != (*uncombined_channels).end()) {
+            current_uncombined_index++;
+            scale_and_copy_unmixing_coeffs<<< gridDim, blockDim >>>(tmp_mixing.get_data_ptr(),
+                                                                    (out_mixing_coeff->get_data_ptr()+current_uncombined_index*total_elements*source_coils),
+                                                                    total_elements,
+                                                                    source_coils,
+                                                                    scale_factor);
+          }
+        }
+
+      }
+
+    //std::cout << "**********cublasDestroy()**************" << std::endl;
+    //cublasDestroy_v2(handle);
+
+    return 0;
+  }
+
+template <class T> int inverse_clib_matrix(cuNDArray<T>* A,
+                                cuNDArray<T>* b,
+                                cuNDArray<T>* coeff,
+                                double lamda)
+{
+    // A: M*N, b: M*K. coeff is re-created as N*K, filled with A^H b and then passed to the solver together with A^H A.
+    // When lamda > 0 the diagonal of A^H A is first replaced by its magnitude plus lamda*(sum of diagonal magnitudes)/N. Returns 0 / -1.
+    size_t M = A->get_size(0);
+    size_t N = A->get_size(1);
+
+    size_t K = b->get_size(1);
+    if (b->get_size(0) != M) { std::cerr << "inverse_clib_matrix: A and b must have the same number of rows" << std::endl; return -1; }
+    std::vector<size_t> AHA_dims(2,N);
+    cuNDArray<T> AHA = cuNDArray<T>(&AHA_dims);
+
+    std::vector<size_t> AHrhs_dims;
+    AHrhs_dims.push_back(N);
+    AHrhs_dims.push_back(K);
+
+    coeff->create(&AHrhs_dims);
+
+    cublasHandle_t handle = *CUBLASContextProvider::instance()->getCublasHandle();
+
+    complext<float>  alpha = complext<float>(1);
+    complext<float>  beta = complext<float>(0);
+
+    //{
+    //    std::string filename = debugFolder+"A.cplx";
+    //    write_cuNDArray_to_disk(A, filename.c_str());
+    //}
+
+    //{
+    //    std::string filename = debugFolder+"b.cplx";
+    //    write_cuNDArray_to_disk(b, filename.c_str());
+    //}
+
+    {
+        //GPUTimer t2("compute AHA ...");
+        cublasStatus_t stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                N,N,M,(float2*) &alpha,
+                (float2*) A->get_data_ptr(), M,
+                (float2*) A->get_data_ptr(), M,
+                (float2*) &beta, (float2*) AHA.get_data_ptr(), N);
+
+        if (stat != CUBLAS_STATUS_SUCCESS)
+        {
+            std::cerr << "inverse_clib_matrix: Failed to form AHA product using cublas gemm" << std::endl;
+            std::cerr << "---- cublas error code " << stat << std::endl;
+            return -1;
+        }
+    }
+
+    //{
+    //    std::string filename = debugFolder+"AHA.cplx";
+    //    write_cuNDArray_to_disk(&AHA, filename.c_str());
+    //}
+
+    {
+        //GPUTimer t2("compute AHrhs ...");
+        cublasStatus_t stat = cublasCgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
+                N,K,M,(float2*) &alpha,
+                (float2*) A->get_data_ptr(), M,
+                (float2*) b->get_data_ptr(), M,
+                (float2*) &beta, (float2*)coeff->get_data_ptr(), N);
+
+        if (stat != CUBLAS_STATUS_SUCCESS)
+        {
+            std::cerr << "inverse_clib_matrix: Failed to form AHrhs product using cublas gemm" << std::endl;
+            std::cerr << "---- cublas error code " << stat << std::endl;
+            return -1;
+        }
+    }
+
+    //{
+    //    std::string filename = debugFolder+"AHrhs.cplx";
+    //    write_cuNDArray_to_disk(coeff, filename.c_str());
+    //}
+
+    // apply the regularization
+    if ( lamda > 0 )
+    {
+        hoNDArray<T> AHA_host(N, N);
+        float2* pAHA = (float2*) AHA_host.get_data_ptr();
+
+        //GadgetronTimer timer;
+
+        //timer.start("copy AHA to host");
+        if (cudaMemcpy(pAHA, AHA.get_data_ptr(), AHA_host.get_number_of_bytes(), cudaMemcpyDeviceToHost) != cudaSuccess)
+        {
+            std::cerr << "inverse_clib_matrix: Failed to copy AHA to host" << std::endl;
+            return -1;
+        }
+        //timer.stop();
+
+        //timer.start("apply the regularization");
+        // apply the regularization
+        double trA = std::sqrt(pAHA[0].x*pAHA[0].x + pAHA[0].y*pAHA[0].y);
+        size_t c;
+        for ( c=1; c<N; c++ )
+        {
+            float x = pAHA[c+c*N].x;
+            float y = pAHA[c+c*N].y;
+            trA += std::sqrt(x*x+y*y);
+        }
+
+        double value = trA*lamda/N;
+        for ( c=0; c<N; c++ )
+        {
+            float x = pAHA[c+c*N].x;
+            float y = pAHA[c+c*N].y;
+            pAHA[c+c*N].x = std::sqrt(x*x+y*y) + value;
+            pAHA[c+c*N].y = 0;
+        }
+        //timer.stop();
+
+        //timer.start("copy the AHA to device");
+        if (cudaMemcpy(AHA.get_data_ptr(), pAHA, AHA_host.get_number_of_bytes(), cudaMemcpyHostToDevice) != cudaSuccess)
+        {
+            std::cerr << "inverse_clib_matrix: Failed to copy regularized AHA to device" << std::endl;
+            return -1;
+        }
+        //timer.stop();
+    }
+
+    culaStatus s;
+    s = culaDeviceCgels( 'N', N, N, K,
+            (culaDeviceFloatComplex*)AHA.get_data_ptr(), N,
+            (culaDeviceFloatComplex*)coeff->get_data_ptr(), N);
+
+
+    //{
+    //    std::string filename = debugFolder+"coeff.cplx";
+    //    write_cuNDArray_to_disk(coeff, filename.c_str());
+    //}
+
+    if (s != culaNoError)
+    {
+        std::cerr << "inverse_clib_matrix: linear solve failed" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+  //Template instantiation
+    template EXPORTGPUPMRI int htgrappa_calculate_grappa_unmixing(cuNDArray<complext<float> >* ref_data,
+                                                                cuNDArray<complext<float> >* b1,
+                                                                unsigned int acceleration_factor,
+                                                                std::vector<unsigned int> *kernel_size,
+                                                                cuNDArray<complext<float> >* out_mixing_coeff,
+                                                                std::vector< std::pair<unsigned int, unsigned int> >* sampled_region,
+                                                                std::list< unsigned int >* uncombined_channels);
+
+    template EXPORTGPUPMRI int inverse_clib_matrix(cuNDArray<complext<float> >* A,
+                                    cuNDArray<complext<float> >* b,
+                                    cuNDArray<complext<float> >* coeff,
+                                    double lamda);
+}
diff --git a/toolboxes/mri/pmri/gpu/htgrappa.h b/toolboxes/mri/pmri/gpu/htgrappa.h
new file mode 100644
index 0000000..7ffce77
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa.h
@@ -0,0 +1,29 @@
+#ifndef HTGRAPPA_H
+#define HTGRAPPA_H
+
+#include "gpupmri_export.h"
+#include "cuNDArray.h"
+
+#include <list>
+#include <utility> // std::pair, used by sampled_region
+#include <vector>  // std::vector, used by kernel_size and sampled_region
+
+namespace Gadgetron
+{
+// Calculates GRAPPA unmixing coefficients into out_mixing_coeff; sampled_region and uncombined_channels are optional (null by default). The caller in htgrappa_test.cpp treats a negative return value as failure.
+template <class T> EXPORTGPUPMRI
+int htgrappa_calculate_grappa_unmixing(cuNDArray<T>* ref_data,
+                                    cuNDArray<T>* b1,
+                                    unsigned int acceleration_factor,
+                                    std::vector<unsigned int>* kernel_size,
+                                    cuNDArray<T>* out_mixing_coeff,
+                                    std::vector< std::pair<unsigned int, unsigned int> >* sampled_region = 0,
+                                    std::list< unsigned int >* uncombined_channels = 0);
+// Solves a linear system on the device; returns 0 on success and -1 on failure. lamda is presumably a regularization weight (confirm in htgrappa.cu).
+template <class T> EXPORTGPUPMRI
+int inverse_clib_matrix(cuNDArray<T>* A,
+                            cuNDArray<T>* b,
+                            cuNDArray<T>* out_mixing_coeff,
+                            double lamda);
+}
+#endif //HTGRAPPA_H
diff --git a/toolboxes/mri/pmri/gpu/htgrappa_test.cpp b/toolboxes/mri/pmri/gpu/htgrappa_test.cpp
new file mode 100644
index 0000000..7b7f01b
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/htgrappa_test.cpp
@@ -0,0 +1,64 @@
+#include <iostream>
+#include <memory>
+
+#include "cuNDArray.h"
+#include "hoNDArray_fileio.h"
+#include "cuNDFFT.h"
+#include "GPUTimer.h"
+#include "htgrappa.h"
+
+using namespace Gadgetron;
+int main(int argc, char** argv)
+{
+  // Command-line check of the GRAPPA unmixing computation: loads time_average.cplx and b1.cplx, computes the coefficients on the GPU and exits with 0 on success or 1 on failure.
+  std::cout << "Simple HTGRAPPA program" << std::endl;
+  {
+    GPUTimer init_time("CUDA Initialization");
+  }
+  GPUTimer total_time("Total time elapsed");
+
+  GPUTimer* timer_ptr = new GPUTimer("Loading data");
+  hoNDArray<cuFloatComplex> time_average_host =
+    read_nd_array<cuFloatComplex>("time_average.cplx");
+  hoNDArray<cuFloatComplex> b1_host =
+    read_nd_array<cuFloatComplex>("b1.cplx");
+
+  cuNDArray<cuFloatComplex> time_average_dev(time_average_host);
+  cuNDArray<cuFloatComplex> b1_dev(b1_host);
+  delete timer_ptr;
+
+  cuNDArray<cuFloatComplex> unmixing_dev;
+  if (!unmixing_dev.create(b1_dev.get_dimensions())) {
+    std::cout << "Unable to allocate memory for GRAPPA unmixing coefficients" << std::endl;
+    return 1; // report the failure to the shell instead of exiting with 0
+  }
+
+  {
+    GPUTimer unmix_timer("GRAPPA Unmixing");
+    std::vector<unsigned int> kernel_size;
+    kernel_size.push_back(5);
+    kernel_size.push_back(4);
+    // NOTE(review): htgrappa.cu explicitly instantiates this function for complext<float> only - confirm that T = cuFloatComplex links.
+    if ( htgrappa_calculate_grappa_unmixing(&time_average_dev,
+                                            &b1_dev,
+                                            4,
+                                            &kernel_size, // the function takes a pointer to the vector
+                                            &unmixing_dev) < 0) {
+      std::cout << "Error calculating unmixing coefficients" << std::endl;
+      return 1; // do not go on to report "Reconstruction done" after a failed computation
+    }
+  }
+
+  /*
+  std::auto_ptr< cuNDArray<float2> > b1 = 
+    estimate_b1_map<uint2, float, float2>(&time_average_dev);
+  */
+
+  timer_ptr = new GPUTimer("Saving data");
+  hoNDArray<cuFloatComplex> average_image = time_average_dev.to_host();
+  write_nd_array<cuFloatComplex>(average_image, "average_image.cplx");
+  delete timer_ptr;
+
+  std::cout << "Reconstruction done" << std::endl;
+  return 0;
+}
diff --git a/toolboxes/mri/pmri/gpu/senseOperator.h b/toolboxes/mri/pmri/gpu/senseOperator.h
new file mode 100644
index 0000000..66c2f2f
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/senseOperator.h
@@ -0,0 +1,48 @@
+/** \file senseOperator.h
+    \brief Base class for all Sense operators
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "gpupmri_export.h"
+
+#include <boost/smart_ptr.hpp>
+#include <iostream>
+#include <stdexcept> // std::runtime_error, thrown by set_csm()
+
+namespace Gadgetron{
+
+  /** Abstract base for Sense operators: stores the coil sensitivity maps (csm) and the coil count derived from them; the multiplications are left to subclasses. */
+  template<class ARRAY_TYPE, unsigned int D> class EXPORTGPUPMRI senseOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+    senseOperator() : linearOperator<ARRAY_TYPE>(), ncoils_(0) {}
+    virtual ~senseOperator() {}
+
+    // Read-only accessors; the coil count is 0 and the csm pointer is empty until set_csm() has succeeded.
+    inline unsigned int get_number_of_coils() const { return ncoils_; }
+    inline boost::shared_ptr<ARRAY_TYPE> get_csm() const { return csm_; }
+
+    // Stores the csm and takes the coil count from its last dimension; throws std::runtime_error if csm is null or does not have D+1 dimensions.
+    virtual void set_csm( boost::shared_ptr<ARRAY_TYPE> csm )
+    {
+      if( !csm.get() || csm->get_number_of_dimensions() != D+1 ) {
+        throw std::runtime_error("Error: senseOperator::set_csm : unexpected csm dimensionality");
+      }
+      csm_ = csm;
+      ncoils_ = csm_->get_size(D);
+    }
+
+    // Pure virtual operator applications, implemented by subclasses (presumably the forward operator and its adjoint - confirm against linearOperator).
+    virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false ) = 0;
+    virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false ) = 0;
+    // Pure virtual coil-sensitivity products, implemented by subclasses.
+    virtual void mult_csm( ARRAY_TYPE* in, ARRAY_TYPE* out ) = 0;
+    virtual void mult_csm_conj_sum( ARRAY_TYPE* in, ARRAY_TYPE* out) = 0;
+
+  protected:
+    unsigned int ncoils_;                 // size of the csm's last dimension, set by set_csm()
+    boost::shared_ptr< ARRAY_TYPE > csm_; // coil sensitivity maps shared with the caller
+  };
+}
diff --git a/toolboxes/mri/pmri/gpu/sense_utilities.cu b/toolboxes/mri/pmri/gpu/sense_utilities.cu
new file mode 100644
index 0000000..02d00be
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/sense_utilities.cu
@@ -0,0 +1,146 @@
+#include "sense_utilities.h"
+#include "vector_td_utilities.h"
+#include <sstream>
+
+namespace Gadgetron{
+
+  // One thread per pixel, blockIdx.y selects the frame: out[pixel,frame,coil] = in[pixel,frame] * csm[pixel,coil] for every coil.
+  template<class REAL> __global__ void
+  mult_csm_kernel( complext<REAL> *in, complext<REAL> *out, complext<REAL> *csm,
+                   size_t image_elements, unsigned int nframes, unsigned int ncoils )
+  {
+    unsigned int pixel = blockIdx.x*blockDim.x+threadIdx.x;
+    if( pixel >= image_elements ) return; // threads past the end of the image have nothing to do
+    const size_t frame_pixel = pixel + blockIdx.y*image_elements; // position of this pixel within its frame
+    complext<REAL> value = in[frame_pixel];
+    for( unsigned int coil=0; coil<ncoils; coil++ )
+      out[frame_pixel + coil*image_elements*nframes] = value * csm[pixel+coil*image_elements];
+  }
+
+  // Host wrapper: writes out[pixel,frame,coil] = in[pixel,frame]*csm[pixel,coil] on the current device. Throws cuda_error on device/launch problems and std::runtime_error on unexpected, empty or undersized arrays.
+  template<class REAL, unsigned int D> void
+  csm_mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, cuNDArray< complext<REAL> > *csm )
+  {
+    int device;
+    if( cudaGetDevice( &device ) != cudaSuccess ){
+      throw cuda_error( "mult_csm: unable to query current device");
+    }
+    if( !in || in->get_device() != device || !out || out->get_device() != device || !csm || csm->get_device() != device ){
+      throw cuda_error("mult_csm: array not residing current device");
+    }
+    if( in->get_number_of_dimensions() < D  || in->get_number_of_dimensions() > D+1 ){
+      throw std::runtime_error("mult_csm: unexpected input dimensionality");
+    }
+    if( in->get_number_of_dimensions() > out->get_number_of_dimensions() ){
+      throw std::runtime_error("mult_csm: input dimensionality cannot exceed output dimensionality");
+    }
+    if( csm->get_number_of_dimensions() != D+1 ) {
+      throw std::runtime_error("mult_csm: input dimensionality of csm not as expected");
+    }
+
+    unsigned int num_image_elements = 1;
+    for( unsigned int d=0; d<D; d++ )
+      num_image_elements *= in->get_size(d);
+    if( num_image_elements == 0 ){ // guards the division below and a zero-sized kernel launch
+      throw std::runtime_error("mult_csm: input image has no elements");
+    }
+    unsigned int num_frames = in->get_number_of_elements() / num_image_elements;
+    unsigned int num_coils = csm->get_size(D);
+    // The kernel indexes csm and out without bounds checks, so reject arrays that are too small for this input.
+    if( csm->get_number_of_elements() < size_t(num_image_elements)*num_coils ||
+        out->get_number_of_elements() < size_t(num_image_elements)*num_frames*num_coils ){
+      throw std::runtime_error("mult_csm: csm or output array too small for the given input");
+    }
+    dim3 blockDim(256);
+    dim3 gridDim((num_image_elements+blockDim.x-1)/blockDim.x, num_frames); // x covers the pixels, y the frames
+    mult_csm_kernel<REAL><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), csm->get_data_ptr(), num_image_elements, num_frames, num_coils );
+
+    cudaError_t err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      std::stringstream ss;
+      ss << "mult_csm: unable to multiply with coil sensitivities: " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+  }
+
+  // One thread per pixel, blockIdx.y selects the frame: out[pixel,frame] = sum over coils of in[pixel,frame,coil] * conj(csm[pixel,coil]).
+  template <class REAL> __global__ void
+  mult_csm_conj_sum_kernel( complext<REAL> *in, complext<REAL> *out, complext<REAL> *csm,
+                            size_t image_elements, unsigned int nframes, unsigned int ncoils )
+  {
+    unsigned int pixel = blockIdx.x*blockDim.x+threadIdx.x;
+    if( pixel >= image_elements ) return; // threads past the end of the image have nothing to do
+    const size_t frame_pixel = pixel+blockIdx.y*image_elements; // position of this pixel within its frame
+    complext<REAL> sum = complext<REAL>(0);
+    for( unsigned int coil = 0; coil < ncoils; coil++ )
+      sum += in[frame_pixel+coil*nframes*image_elements] * conj(csm[pixel+coil*image_elements]);
+    out[frame_pixel] = sum;
+  }
+
+  // Host wrapper: writes out[pixel,frame] = sum over coils of in[pixel,frame,coil]*conj(csm[pixel,coil]) on the current device. Throws cuda_error on device/launch problems and std::runtime_error on unexpected, empty or undersized arrays.
+  template<class REAL, unsigned int D> void
+  csm_mult_MH( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, cuNDArray<complext<REAL> > *csm )
+  {
+    int device;
+    if( cudaGetDevice( &device ) != cudaSuccess ){
+      throw cuda_error("mult_csm_conj_sum: unable to query current device");
+    }
+    if( !in || in->get_device() != device || !out || out->get_device() != device || !csm || csm->get_device() != device ){
+      throw std::runtime_error("mult_csm_conj_sum: array not residing current device");
+    }
+    if( out->get_number_of_dimensions() < D  || out->get_number_of_dimensions() > D+1 ){
+      throw std::runtime_error("mult_csm_conj_sum: unexpected output dimensionality");
+    }
+    if( out->get_number_of_dimensions() > in->get_number_of_dimensions() ){
+      throw std::runtime_error("mult_csm_conj_sum: output dimensionality cannot exceed input dimensionality");
+    }
+    if( csm->get_number_of_dimensions() != D+1 ) {
+      throw std::runtime_error("mult_csm_conj_sum: input dimensionality of csm not as expected");
+    }
+    unsigned int num_image_elements = 1;
+    for( unsigned int d=0; d<D; d++ )
+      num_image_elements *= out->get_size(d);
+    if( num_image_elements == 0 ){ // guards the division below and a zero-sized kernel launch
+      throw std::runtime_error("mult_csm_conj_sum: output image has no elements");
+    }
+    unsigned int num_frames = out->get_number_of_elements() / num_image_elements;
+    unsigned int num_coils = csm->get_size(D);
+    // The kernel indexes csm and in without bounds checks, so reject arrays that are too small for this output.
+    if( csm->get_number_of_elements() < size_t(num_image_elements)*num_coils ||
+        in->get_number_of_elements() < size_t(num_image_elements)*num_frames*num_coils ){
+      throw std::runtime_error("mult_csm_conj_sum: csm or input array too small for the given output");
+    }
+    dim3 blockDim(256);
+    dim3 gridDim((num_image_elements+blockDim.x-1)/blockDim.x, num_frames); // x covers the pixels, y the frames
+    mult_csm_conj_sum_kernel<REAL><<< gridDim, blockDim >>>( in->get_data_ptr(), out->get_data_ptr(), csm->get_data_ptr(), num_image_elements, num_frames, num_coils );
+
+    cudaError_t err = cudaGetLastError();
+    if( err != cudaSuccess ){
+      std::stringstream ss;
+      ss << "mult_csm_conj_sum: unable to combine coils " << cudaGetErrorString(err);
+      throw cuda_error(ss.str());
+    }
+  }
+
+  // Explicit instantiations: csm_mult_M and csm_mult_MH for REAL = float and double, each with D = 1..4.
+
+  template EXPORTGPUPMRI void csm_mult_M<float,1>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_M<float,2>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_M<float,3>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_M<float,4>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+
+  template EXPORTGPUPMRI void csm_mult_M<double,1>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_M<double,2>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_M<double,3>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_M<double,4>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+
+  template EXPORTGPUPMRI void csm_mult_MH<float,1>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<float,2>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<float,3>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<float,4>( cuNDArray< complext<float> >*, cuNDArray< complext<float> >*, cuNDArray< complext<float> >*);
+
+  template EXPORTGPUPMRI void csm_mult_MH<double,1>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<double,2>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<double,3>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+  template EXPORTGPUPMRI void csm_mult_MH<double,4>( cuNDArray< complext<double> >*, cuNDArray< complext<double> >*, cuNDArray< complext<double> >*);
+}
diff --git a/toolboxes/mri/pmri/gpu/sense_utilities.h b/toolboxes/mri/pmri/gpu/sense_utilities.h
new file mode 100644
index 0000000..56e26f7
--- /dev/null
+++ b/toolboxes/mri/pmri/gpu/sense_utilities.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "cuNDArray.h"
+#include "complext.h"
+#include "gpupmri_export.h"
+
+namespace Gadgetron{
+
+// Multiply with coil sensitivities: for every coil, 'out' receives the input image(s) multiplied element-wise by that coil's map.
+// 'in' has D or D+1 dimensions and 'csm' must have D+1 (the last one indexing the coils); throws if the arrays are not on the current device or have unexpected dimensionality.
+
+template< class REAL, unsigned int D> EXPORTGPUPMRI void
+csm_mult_M( cuNDArray< complext<REAL> > *in, 
+	    cuNDArray< complext<REAL> > *out, 
+	    cuNDArray< complext<REAL> > *csm );
+
+
+// Multiply with adjoint of coil sensitivities: 'out' receives the sum over coils of 'in' times the complex conjugate of the coil's map.
+// 'out' has D or D+1 dimensions and 'csm' must have D+1 (the last one indexing the coils); throws if the arrays are not on the current device or have unexpected dimensionality.
+
+template< class REAL, unsigned int D> EXPORTGPUPMRI void
+csm_mult_MH( cuNDArray< complext<REAL> > *in, 
+	     cuNDArray< complext<REAL> > *out, 
+	     cuNDArray< complext<REAL> > *csm );
+}
diff --git a/toolboxes/nfft/CMakeLists.txt b/toolboxes/nfft/CMakeLists.txt
new file mode 100644
index 0000000..2244056
--- /dev/null
+++ b/toolboxes/nfft/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(CUDA_FOUND) # the gpu subdirectory is only configured when CUDA was found
+  add_subdirectory(gpu)
+endif()
diff --git a/toolboxes/nfft/gpu/CMakeLists.txt b/toolboxes/nfft/gpu/CMakeLists.txt
new file mode 100644
index 0000000..332daed
--- /dev/null
+++ b/toolboxes/nfft/gpu/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Builds and installs the gpunfft shared library from the CUDA sources listed below.
+if(WIN32)
+  add_definitions(-D__BUILD_GADGETRON_GPUNFFT__)
+  add_definitions(-D_USE_MATH_DEFINES)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif()
+
+# Header search paths: CUDA, Boost and the toolboxes this library uses.
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+# The public headers are listed together with the two CUDA sources.
+cuda_add_library(gpunfft SHARED
+  cuNFFT.h
+  cuNFFTOperator.h
+  gpunfft_export.h
+  cuNFFT.cu
+  cuNFFTOperator.cu
+  )
+
+target_link_libraries(
+  gpunfft
+  gpucore
+  ${Boost_LIBRARIES}
+  ${FFTW3_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  ${CUDA_CUFFT_LIBRARIES}
+  ${CUDA_CUBLAS_LIBRARIES}
+  )
+
+install(TARGETS gpunfft DESTINATION lib)
+
+install(FILES
+  cuNFFT.h
+  cuNFFTOperator.h
+  gpunfft_export.h
+  DESTINATION include)
diff --git a/toolboxes/nfft/gpu/KaiserBessel_kernel.cu b/toolboxes/nfft/gpu/KaiserBessel_kernel.cu
new file mode 100644
index 0000000..4ac3433
--- /dev/null
+++ b/toolboxes/nfft/gpu/KaiserBessel_kernel.cu
@@ -0,0 +1,127 @@
+//
+// Kaiser-Bessel convolution kernels
+//
+
+// Double precision bessi0: returns exactly 1 for x == 0, otherwise the negated ratio of two polynomials in z = x*x. Presumably the order-zero modified Bessel function used by the Kaiser-Bessel kernel (inferred from its name and use).
+__inline__ __device__ double
+bessi0(double x)
+{
+   if (x == 0.0)
+      return 1.0;
+
+   const double z = x * x;
+
+   // Numerator polynomial in z, evaluated in nested (Horner) form
+   const double numerator = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* 
+                     (z* 0.210580722890567e-22  + 0.380715242345326e-19 ) +
+                         0.479440257548300e-16) + 0.435125971262668e-13 ) +
+                         0.300931127112960e-10) + 0.160224679395361e-7  ) +
+                         0.654858370096785e-5)  + 0.202591084143397e-2  ) +
+                         0.463076284721000e0)   + 0.754337328948189e2   ) +
+                         0.830792541809429e4)   + 0.571661130563785e6   ) +
+                         0.216415572361227e8)   + 0.356644482244025e9   ) +
+                         0.144048298227235e10);
+
+   // Denominator polynomial in z (cubic)
+   const double denominator = (z*(z*(z-0.307646912682801e4)+
+                       0.347626332405882e7)-0.144048298227235e10);
+
+   // The sign flip is part of the approximation: the two constant terms are equal in magnitude and opposite in sign
+   return -numerator/denominator;
+}
+
+// Single precision bessi0: returns exactly 1 for x == 0, otherwise the negated ratio of two polynomials in z = x*x. Presumably the order-zero modified Bessel function used by the Kaiser-Bessel kernel (inferred from its name and use).
+__inline__ __device__ float
+bessi0(float x)
+{
+   if (x == 0.0f)
+      return 1.0f;
+
+   const float z = x * x;
+
+   // Numerator polynomial in z, evaluated in nested (Horner) form
+   const float numerator = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* 
+                     (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
+                         0.479440257548300e-16f) + 0.435125971262668e-13f ) +
+                         0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
+                         0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
+                         0.463076284721000e0f)   + 0.754337328948189e2f   ) +
+                         0.830792541809429e4f)   + 0.571661130563785e6f   ) +
+                         0.216415572361227e8f)   + 0.356644482244025e9f   ) +
+                         0.144048298227235e10f);
+
+   // Denominator polynomial in z (cubic)
+   const float denominator = (z*(z*(z-0.307646912682801e4f)+
+                       0.347626332405882e7f)-0.144048298227235e10f);
+
+   // The sign flip is part of the approximation: the two constant terms are equal in magnitude and opposite in sign
+   return -numerator/denominator;
+}
+
+
+// Kaiser Bessel according to Beatty et. al. IEEE TMI 2005;24(6):799-808.
+// There is a slight difference wrt Jackson's formulation, IEEE TMI 1991;10(3):473-478.
+// Double precision: returns matrix_size_os*bessi0(beta*sqrt(1-(2*u*one_over_W)^2))*one_over_W, or 0 when the square root argument is negative.
+__inline__ __device__ double
+KaiserBessel( double u, double matrix_size_os, double one_over_W, double beta )
+{
+  const double scaled = 2.0*u*one_over_W;
+  const double radicand = 1.0-scaled*scaled;
+  // Rounding at the kernel edge can make the radicand marginally negative; sqrt would then return NaN, so give such cells zero weight.
+  if( radicand < 0.0 ) return 0.0; // a NaN radicand fails this test and still propagates as NaN
+  const double arg = beta*std::sqrt(radicand);
+  return matrix_size_os*bessi0(arg)*one_over_W;
+}
+
+__inline__ __device__ float
+KaiserBessel( float u, float matrix_size_os, float one_over_W, float beta )
+{
+  // Single precision: returns matrix_size_os*bessi0(beta*sqrt(1-(2*u*one_over_W)^2))*one_over_W, or 0 when the square root argument is negative.
+  const float scaled = 2.0f*u*one_over_W;
+  const float radicand = 1.0f-scaled*scaled;
+  if( radicand < 0.0f ) return 0.0f; // rounding at the kernel edge would otherwise give sqrt(<0) = NaN; a NaN radicand still propagates
+  const float arg = beta*std::sqrt(radicand);
+  return matrix_size_os*bessi0(arg)*one_over_W;
+}
+
+//
+// Below the intended interface
+//
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,1> &u, const Gadgetron::vector_td<REAL,1> &matrix_size_os, 
+              REAL one_over_W, typename reald<REAL,1>::Type beta )
+{
+  // 1D: forwards the single component to the scalar overload
+  return KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] );
+}
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,2> &u, const Gadgetron::vector_td<REAL,2> &matrix_size_os, 
+              REAL one_over_W, typename reald<REAL,2>::Type beta )
+{
+  // 2D: product of the scalar kernel evaluated along x and y
+  return KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] ) *
+         KaiserBessel( u.vec[1], matrix_size_os.vec[1], one_over_W, beta[1] );
+}
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,3> &u, const Gadgetron::vector_td<REAL,3> &matrix_size_os, 
+              REAL one_over_W, typename reald<REAL,3>::Type beta )
+{
+  // 3D: product of the scalar kernel evaluated along x, y and z
+  return KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] ) *
+         KaiserBessel( u.vec[1], matrix_size_os.vec[1], one_over_W, beta[1] ) *
+         KaiserBessel( u.vec[2], matrix_size_os.vec[2], one_over_W, beta[2] );
+}
+
+template<class REAL> __inline__ __device__ REAL
+KaiserBessel( const Gadgetron::vector_td<REAL,4> &u, const Gadgetron::vector_td<REAL,4> &matrix_size_os, 
+              REAL one_over_W, typename reald<REAL,4>::Type beta )
+{
+  // 4D: product of the scalar kernel evaluated along x, y, z and w. matrix_size_os is only read, so it is taken by const reference like in the 1D-3D overloads.
+  return KaiserBessel( u.vec[0], matrix_size_os.vec[0], one_over_W, beta[0] ) *
+         KaiserBessel( u.vec[1], matrix_size_os.vec[1], one_over_W, beta[1] ) *
+         KaiserBessel( u.vec[2], matrix_size_os.vec[2], one_over_W, beta[2] ) *
+         KaiserBessel( u.vec[3], matrix_size_os.vec[3], one_over_W, beta[3] );
+}
diff --git a/toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu b/toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu
new file mode 100644
index 0000000..b34655b
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_C2NC_conv_kernel.cu
@@ -0,0 +1,249 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+*/
+
+//
+// There is no header file accompanying this kernel, so it makes most sense to read the code/file from the end and upwards
+//
+
+//
+// Transfer result from shared memory to global memory.
+// For every batch, the two values this thread accumulated in shared memory are written to 'samples' (added to the stored value when 'accumulate' is set).
+
+template<class REAL> __inline__ __device__ void 
+NFFT_output( unsigned int number_of_samples, unsigned int number_of_batches, complext<REAL> *samples,
+	     unsigned int double_warp_size_power, unsigned int globalThreadId, unsigned int sharedMemFirstSampleIdx, bool accumulate )
+{
+  
+  REAL *shared_mem = (REAL*) _shared_mem; // _shared_mem is not declared in this file; it is viewed here as an array of REAL
+  
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    complext<REAL>sample_value;
+    sample_value.vec[0] = shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)]; // first component of this batch
+    sample_value.vec[1] = shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)+warpSize]; // second component, stored warpSize entries later
+
+    unsigned int out_idx = (batch*gridDim.y+blockIdx.y)*number_of_samples + globalThreadId; // output is ordered by batch, then blockIdx.y, then sample
+
+    if( accumulate ) sample_value += samples[out_idx];
+    samples[out_idx] = sample_value;
+  }
+}
+
+template<unsigned int D> __inline__ __device__ static void
+resolve_wrap( vector_td<int,D> &grid_position, vector_td<unsigned int,D> &matrix_size_os )
+{ // Moves grid_position (in place) back onto the oversampled grid; presumably assumes it is at most one grid period out of range - TODO confirm
+  vector_td<int,D> zero(0);
+  grid_position += vector_less(grid_position, zero)*matrix_size_os; // shift components below zero up by the grid size
+  grid_position -= vector_greater_equal(grid_position, matrix_size_os)* matrix_size_os; // shift components at or past the grid size down by it
+}
+
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_iterate_body( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+		   vector_td<unsigned int, D> matrix_size_os, unsigned int number_of_batches, complext<REAL> *image,
+		   unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+		   vector_td<REAL,D> sample_position, vector_td<int,D> grid_position )
+{
+  // Adds one grid cell's weighted value to this thread's per-batch accumulators in shared memory. 'alpha' and 'W' are not used in this body.
+  // Calculate the distance between current sample and the grid cell
+  vector_td<REAL,D> grid_position_real = vector_td<REAL,D>(grid_position);
+  const vector_td<REAL,D> delta = abs(sample_position-grid_position_real);
+  const vector_td<REAL,D> half_W_vec(half_W );
+  
+  // If cell too distant from sample then move on to the next cell
+  if( weak_greater( delta, half_W_vec ))
+    return;
+
+  // Compute convolution weight.
+  const REAL weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+
+  // Safety measure. We have occasionally observed a NaN from the KaiserBessel computation
+  if( !isfinite(weight) )
+    return;
+
+  // Resolve wrapping of grid position
+  resolve_wrap<D>( grid_position, matrix_size_os);
+
+  REAL *shared_mem = (REAL*) _shared_mem; // _shared_mem is not declared in this file; it is viewed here as an array of REAL
+  
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    
+    // Read the grid cell value from global memory
+    const complext<REAL> grid_value = 
+      image[ (batch*gridDim.y+blockIdx.y)*prod(matrix_size_os) + co_to_idx<D>( vector_td<unsigned int, D>(grid_position), matrix_size_os ) ];
+    
+    // Add 'weight*grid_value' to the samples in shared memory
+    shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)] += (weight*grid_value.vec[0]);
+    shared_mem[sharedMemFirstSampleIdx+(batch<<double_warp_size_power)+warpSize] += (weight*grid_value.vec[1]);
+  }
+}
+
+//
+// This method is deliberately overloaded per dimensionality (rather than templatized on it) to improve performance of the loop iteration. 1D variant.
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,1>::Type alpha, typename reald<REAL,1>::Type beta, REAL W, 
+	      vector_td<unsigned int,1> matrix_size_os, unsigned int number_of_batches, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,1> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,1> sample_position, vector_td<int,1> lower_limit, vector_td<int,1> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){ // both limits are inclusive
+    
+    const intd<1>::Type grid_position(x);
+    
+    NFFT_iterate_body<REAL,1>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+			       one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+  }
+}
+
+//
+// This method is deliberately overloaded per dimensionality (rather than templatized on it) to improve performance of the loop iteration. 2D variant.
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,2>::Type alpha, typename reald<REAL,2>::Type beta, REAL W, 
+	      vector_td<unsigned int,2> matrix_size_os, unsigned int number_of_batches, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,2> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,2> sample_position, vector_td<int,2> lower_limit, vector_td<int,2> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){ // both limits are inclusive
+    for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){ // x is the innermost loop
+      
+      const intd<2>::Type grid_position(x,y);
+      
+      NFFT_iterate_body<REAL,2>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+				 one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+    }
+  }
+}
+
+//
+// This method is deliberately overloaded per dimensionality (rather than templatized on it) to improve performance of the loop iteration. 3D variant.
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,3>::Type alpha, typename reald<REAL,3>::Type beta, REAL W, 
+	      vector_td<unsigned int,3> matrix_size_os, unsigned int number_of_batches, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,3> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,3> sample_position, vector_td<int,3> lower_limit, vector_td<int,3> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int z = lower_limit.vec[2]; z<=upper_limit.vec[2]; z++ ){ // both limits are inclusive
+    for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+      for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){ // x is the innermost loop
+	
+	const intd<3>::Type grid_position(x,y,z);
+	
+	NFFT_iterate_body<REAL,3>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+				   one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+      }
+    }
+  }
+}
+
+//
+// This method is deliberately overloaded per dimensionality (rather than templatized on it) to improve performance of the loop iteration. 4D variant.
+//
+
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,4>::Type alpha, typename reald<REAL,4>::Type beta, REAL W, 
+	      vector_td<unsigned int,4> matrix_size_os, unsigned int number_of_batches, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,4> matrix_size_os_real, unsigned int sharedMemFirstSampleIdx,
+	      vector_td<REAL,4> sample_position, vector_td<int,4> lower_limit, vector_td<int,4> upper_limit )
+{
+  // Iterate through all grid cells influencing the corresponding sample
+  for( int w = lower_limit.vec[3]; w<=upper_limit.vec[3]; w++ ){ // both limits are inclusive
+    for( int z = lower_limit.vec[2]; z<=upper_limit.vec[2]; z++ ){
+      for( int y = lower_limit.vec[1]; y<=upper_limit.vec[1]; y++ ){
+	for( int x = lower_limit.vec[0]; x<=upper_limit.vec[0]; x++ ){ // x is the innermost loop
+	  
+	  const intd<4>::Type grid_position(x,y,z,w);
+	  
+	  NFFT_iterate_body<REAL,4>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, half_W, 
+				     one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, grid_position );
+	}
+      }
+    }
+  }
+}
+
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_convolve( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+	       vector_td<unsigned int, D> matrix_size_os, vector_td<unsigned int, D> matrix_size_wrap, 
+	       unsigned int number_of_samples, unsigned int number_of_batches, vector_td<REAL,D> *traj_positions, complext<REAL> *image,
+	       unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real,
+	       unsigned int globalThreadId, unsigned int sharedMemFirstSampleIdx )
+{
+  // Looks up this thread's sample position, derives the range of grid cells within half a kernel width and hands the range to NFFT_iterate.
+  // Sample position to convolve onto
+  // Computed in preprocessing, which included a wrap zone. Remove this wrapping.
+  const vector_td<REAL,D> half_wrap_real = vector_td<REAL,D>(matrix_size_wrap>>1);
+  const vector_td<REAL,D> sample_position = traj_positions[globalThreadId+blockIdx.y*number_of_samples]-half_wrap_real; // blockIdx.y selects the set of number_of_samples positions
+  
+  // Half the kernel width
+  const vector_td<REAL,D> half_W_vec( half_W );
+  
+  // Limits of the subgrid to consider
+  const vector_td<int,D> lower_limit = vector_td<int,D>( ceil(sample_position-half_W_vec));
+  const vector_td<int,D> upper_limit = vector_td<int,D>( floor(sample_position+half_W_vec));
+
+  // Accumulate contributions from the grid
+  NFFT_iterate<REAL>( alpha, beta, W, matrix_size_os, number_of_batches, image, double_warp_size_power, 
+		      half_W, one_over_W, matrix_size_os_real, sharedMemFirstSampleIdx, sample_position, lower_limit, upper_limit );
+}
+
+//
+// kernel main
+// One thread per sample: zeroes its shared memory accumulators, gathers the weighted grid values (NFFT_convolve) and writes them to 'samples' (NFFT_output).
+
+template<class REAL, unsigned int D> __global__ void
+NFFT_convolve_kernel( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+		      vector_td<unsigned int, D> matrix_size_os, vector_td<unsigned int, D> matrix_size_wrap,
+		      unsigned int number_of_samples, unsigned int number_of_batches, 
+		      vector_td<REAL,D> *traj_positions, complext<REAL> *image, complext<REAL> *samples,
+		      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, bool accumulate, vector_td<REAL,D> matrix_size_os_real )
+{
+
+  // Global thread number	
+  const unsigned int globalThreadId = (blockIdx.x*blockDim.x+threadIdx.x);
+
+  // Check if we are within bounds
+  if( globalThreadId >= number_of_samples )
+    return;
+  
+  // Number of reals to compute/output per thread
+  const unsigned int num_reals = number_of_batches<<1; // two reals (one complex value) per batch
+  
+  // All shared memory reals corresponding to domain 'threadIdx.x' are located in bank threadIdx.x%warp_size to limit bank conflicts
+  const unsigned int scatterSharedMemStart = (threadIdx.x/warpSize)*warpSize;
+  const unsigned int scatterSharedMemStartOffset = threadIdx.x&(warpSize-1); // a faster way of saying (threadIdx.x%warpSize) 
+  const unsigned int sharedMemFirstSampleIdx = scatterSharedMemStart*num_reals + scatterSharedMemStartOffset;
+
+  REAL *shared_mem = (REAL*) _shared_mem; // _shared_mem is not declared in this file; it is viewed here as an array of REAL
+  const REAL zero = REAL(0);
+
+  // Initialize shared memory
+  for( unsigned int i=0; i<num_reals; i++ )
+    shared_mem[sharedMemFirstSampleIdx+warpSize*i] = zero; // this thread's reals are spaced warpSize entries apart
+  
+  // Compute NFFT using arbitrary sample trajectories
+  NFFT_convolve<REAL,D>( alpha, beta, W, matrix_size_os, matrix_size_wrap, number_of_samples, number_of_batches, 
+			 traj_positions, image, double_warp_size_power, half_W, one_over_W, 
+			 matrix_size_os_real, globalThreadId, sharedMemFirstSampleIdx );
+  
+  // Output k-space image to global memory
+  NFFT_output<REAL>( number_of_samples, number_of_batches, samples, double_warp_size_power, globalThreadId, sharedMemFirstSampleIdx, accumulate );
+}
+
diff --git a/toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu b/toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu
new file mode 100644
index 0000000..d1b49de
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_NC2C_atomic_conv_kernel.cu
@@ -0,0 +1,227 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+
+  Notice:
+  This version of the code uses atomic writes and thus differs from the two references above.
+*/
+
+//
+// There is no header file accompanying this kernel, so it makes most sense to read the code/file from the end and upwards
+//
+
+//
+// Inner-most loop body: adds the contribution of one sample to one candidate grid cell.
+// Returns without writing if the cell is too distant from the sample or the weight is not finite.
+//
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_iterate_body( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, 
+		   REAL W, vector_td<unsigned int, D> matrix_size_os, 
+		   unsigned int number_of_batches, complext<REAL> *samples, complext<REAL> *image,
+		   unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real, 
+		   unsigned int frame, unsigned int num_frames,
+		   unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+		   vector_td<REAL,D> sample_position, vector_td<int,D> grid_position )
+{
+  // Calculate the distance between current sample and the grid cell
+  vector_td<REAL,D> grid_position_real = vector_td<REAL,D>(grid_position);
+  const vector_td<REAL,D> delta = abs(sample_position-grid_position_real);
+  const vector_td<REAL,D> half_W_vec( half_W );
+
+  // If cell too distant from sample then move on to the next cell
+  if( weak_greater( delta, half_W_vec ))
+    return;
+
+  // Compute convolution weight.
+  const REAL weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+
+  // Safety measure. We have occasionally observed a NaN from the KaiserBessel computation
+  if( !isfinite(weight) )
+    return;
+
+  // Resolve wrapping of grid position
+  resolve_wrap<D>( grid_position, matrix_size_os );
+
+  // Batch-invariant terms, computed once instead of once per batch
+  const unsigned int num_cells_per_frame = prod(matrix_size_os);
+  const unsigned int cell_idx_in_frame = co_to_idx<D>( vector_td<unsigned int, D>(grid_position), matrix_size_os );
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    // Read the sample value of this batch from global memory
+    complext<REAL> sample_value = samples[sample_idx_in_batch+batch*num_samples_per_batch];
+    // Determine the grid cell idx (grid layout: batch-major, then frame, then cell)
+    const unsigned int grid_idx = (batch*num_frames+frame)*num_cells_per_frame + cell_idx_in_frame;
+
+    // Atomic update of real and imaginary component ('image' is addressed as interleaved REALs)
+    atomicAdd( &(((REAL*)image)[(grid_idx<<1)+0]), weight*real(sample_value) );
+    atomicAdd( &(((REAL*)image)[(grid_idx<<1)+1]), weight*imag(sample_value) );
+  }
+}
+
+//
+// 1D overload: calls NFFT_iterate_body for every grid cell in [lower_limit; upper_limit] (inclusive).
+// Deliberately overloaded per dimension (rather than templatized) to improve performance of the loop.
+//
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,1>::Type alpha, typename reald<REAL,1>::Type beta, 
+	      REAL W, vector_td<unsigned int,1> matrix_size_os, 
+	      unsigned int number_of_batches, complext<REAL> *samples, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, 
+	      vector_td<REAL,1> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,1> sample_position, vector_td<int,1> lower_limit, vector_td<int,1> upper_limit )
+{
+  // Short aliases for the inclusive subgrid bounds
+  const vector_td<int,1> &lo = lower_limit, &hi = upper_limit;
+
+  for( int cx = lo.vec[0]; cx <= hi.vec[0]; ++cx ){
+    const intd<1>::Type cell(cx);
+    NFFT_iterate_body<REAL,1>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image,
+                               double_warp_size_power, half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+                               num_samples_per_batch, sample_idx_in_batch, sample_position, cell );
+  }
+}
+
+//
+// 2D overload: calls NFFT_iterate_body for every grid cell in [lower_limit; upper_limit] (inclusive).
+// Deliberately overloaded per dimension (rather than templatized) to improve performance of the loop.
+//
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,2>::Type alpha, typename reald<REAL,2>::Type beta, 
+	      REAL W, vector_td<unsigned int,2> matrix_size_os, 
+	      unsigned int number_of_batches, complext<REAL> *samples, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, 
+	      vector_td<REAL,2> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,2> sample_position, vector_td<int,2> lower_limit, vector_td<int,2> upper_limit )
+{
+  // Short aliases for the inclusive subgrid bounds; x is the innermost (fastest) loop
+  const vector_td<int,2> &lo = lower_limit, &hi = upper_limit;
+
+  for( int cy = lo.vec[1]; cy <= hi.vec[1]; ++cy ){
+    for( int cx = lo.vec[0]; cx <= hi.vec[0]; ++cx ){
+      const intd<2>::Type cell(cx,cy);
+      NFFT_iterate_body<REAL,2>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image,
+                                 double_warp_size_power, half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+                                 num_samples_per_batch, sample_idx_in_batch, sample_position, cell );
+    }
+  }
+}
+
+//
+// 3D overload: calls NFFT_iterate_body for every grid cell in [lower_limit; upper_limit] (inclusive).
+// Deliberately overloaded per dimension (rather than templatized) to improve performance of the loop.
+//
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,3>::Type alpha, typename reald<REAL,3>::Type beta, 
+	      REAL W, vector_td<unsigned int,3> matrix_size_os, 
+	      unsigned int number_of_batches, complext<REAL> *samples, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, 
+	      vector_td<REAL,3> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 	      
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,3> sample_position, vector_td<int,3> lower_limit, vector_td<int,3> upper_limit )
+{
+  // Short aliases for the inclusive subgrid bounds; x is the innermost (fastest) loop
+  const vector_td<int,3> &lo = lower_limit, &hi = upper_limit;
+
+  for( int cz = lo.vec[2]; cz <= hi.vec[2]; ++cz ){
+    for( int cy = lo.vec[1]; cy <= hi.vec[1]; ++cy ){
+      for( int cx = lo.vec[0]; cx <= hi.vec[0]; ++cx ){
+        const intd<3>::Type cell(cx,cy,cz);
+        NFFT_iterate_body<REAL,3>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image,
+                                   double_warp_size_power, half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+                                   num_samples_per_batch, sample_idx_in_batch, sample_position, cell );
+      }
+    }
+  }
+}
+
+//
+// 4D overload: calls NFFT_iterate_body for every grid cell in [lower_limit; upper_limit] (inclusive).
+// Deliberately overloaded per dimension (rather than templatized) to improve performance of the loop.
+//
+template<class REAL> __inline__ __device__ void
+NFFT_iterate( typename reald<REAL,4>::Type alpha, typename reald<REAL,4>::Type beta, 
+	      REAL W, vector_td<unsigned int,4> matrix_size_os, 
+	      unsigned int number_of_batches, complext<REAL> *samples, complext<REAL> *image,
+	      unsigned int double_warp_size_power, REAL half_W, REAL one_over_W,
+	      vector_td<REAL,4> matrix_size_os_real, 
+	      unsigned int frame, unsigned int num_frames, 
+	      unsigned int num_samples_per_batch, unsigned int sample_idx_in_batch, 
+	      vector_td<REAL,4> sample_position, vector_td<int,4> lower_limit, vector_td<int,4> upper_limit )
+{
+  // Short aliases for the inclusive subgrid bounds; x is the innermost (fastest) loop
+  const vector_td<int,4> &lo = lower_limit, &hi = upper_limit;
+
+  for( int cw = lo.vec[3]; cw <= hi.vec[3]; ++cw ){
+    for( int cz = lo.vec[2]; cz <= hi.vec[2]; ++cz ){
+      for( int cy = lo.vec[1]; cy <= hi.vec[1]; ++cy ){
+        for( int cx = lo.vec[0]; cx <= hi.vec[0]; ++cx ){
+          const intd<4>::Type cell(cx,cy,cz,cw);
+          NFFT_iterate_body<REAL,4>( alpha, beta, W, matrix_size_os, number_of_batches, samples, image,
+                                     double_warp_size_power, half_W, one_over_W, matrix_size_os_real, frame, num_frames,
+                                     num_samples_per_batch, sample_idx_in_batch, sample_position, cell );
+        }
+      }
+    }
+  }
+}
+
+//
+// kernel main: each thread handles one sample and adds it to the grid cells within half a
+// kernel width of the sample position, using atomic additions (see NFFT_iterate_body).
+//
+template<class REAL, unsigned int D> __global__ void
+NFFT_H_atomic_convolve_kernel( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+			       vector_td<unsigned int, D> matrix_size_os, vector_td<unsigned int, D> matrix_size_wrap,
+			       unsigned int num_samples_per_frame, unsigned int num_batches, 
+			       vector_td<REAL,D> *traj_positions, complext<REAL> *samples, complext<REAL> *image,
+			       unsigned int double_warp_size_power, REAL half_W, REAL one_over_W,
+			       vector_td<REAL,D> matrix_size_os_real )
+{
+  // A runtime check will prevent this kernel from being run for compute models 1.x,
+  // for which the body below is compiled out.
+
+#if(__CUDA_ARCH__>=200)
+
+  // One thread per sample of the frame selected by blockIdx.y
+  const unsigned int idx_in_frame = blockIdx.x*blockDim.x+threadIdx.x;
+
+  // Threads past the last sample of the frame have nothing to do
+  if( idx_in_frame >= num_samples_per_frame )
+    return;
+
+  // The trajectory and sample arrays are indexed frame after frame
+  const unsigned int frame = blockIdx.y;
+  const unsigned int num_frames = gridDim.y;
+  const unsigned int num_samples_per_batch = num_samples_per_frame*num_frames;
+  const unsigned int sample_idx_in_batch = frame*num_samples_per_frame+idx_in_frame;
+
+  // Sample position computed in preprocessing includes a wrap zone. Remove this wrapping.
+  const vector_td<REAL,D> half_wrap( matrix_size_wrap>>1 );
+  const vector_td<REAL,D> sample_position = traj_positions[sample_idx_in_batch]-half_wrap;
+
+  // Inclusive integer bounds of the subgrid within half a kernel width of the sample
+  const vector_td<REAL,D> reach( half_W );
+  const vector_td<int,D> lower_limit( ceil(sample_position-reach) );
+  const vector_td<int,D> upper_limit( floor(sample_position+reach) );
+
+  // Output to the grid
+  NFFT_iterate<REAL>( alpha, beta, W, matrix_size_os, num_batches, samples, image,
+                      double_warp_size_power, half_W, one_over_W, matrix_size_os_real,
+                      frame, num_frames, num_samples_per_batch, sample_idx_in_batch,
+                      sample_position, lower_limit, upper_limit );
+
+#endif
+}
diff --git a/toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu b/toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu
new file mode 100644
index 0000000..f3164df
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_NC2C_conv_kernel.cu
@@ -0,0 +1,140 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+*/
+
+//
+// There is no header file accompanying this kernel, so it makes most sense to read the code/file from the end and upwards
+//
+
+//
+// Transfer result from shared memory to global memory.
+// Per batch, one complex value is read from '_shared_mem' and stored in 'image'.
+//
+template<class REAL> __inline__ __device__ void 
+NFFT_H_output( unsigned int number_of_batches, complext<REAL>*image,
+	       unsigned int double_warp_size_power, unsigned int number_of_domains, 
+	       unsigned int globalThreadId, unsigned int sharedMemFirstCellIdx )
+{
+  // View the shared memory buffer as an array of REALs
+  REAL *shared_mem = (REAL*) _shared_mem;
+  // Real part at offset (batch<<double_warp_size_power), imaginary part 'warpSize' entries later
+  for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+    complext<REAL>cell_coefficient;
+    cell_coefficient.vec[0] = shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)];
+    cell_coefficient.vec[1] = shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)+warpSize];
+    image[(batch*gridDim.y+blockIdx.y)*number_of_domains+globalThreadId] = cell_coefficient;
+  }
+}
+
+// Accumulates, in shared memory, the weighted samples listed in this thread's bucket onto its grid cell.
+template<class REAL, unsigned int D> __inline__ __device__ void
+NFFT_H_convolve( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W, 
+		 unsigned int number_of_samples, unsigned int number_of_batches, unsigned int number_of_domains,
+		 vector_td<REAL,D> *traj_positions, complext<REAL>*samples, unsigned int *tuples_last, unsigned int *bucket_begin, unsigned int *bucket_end,
+		 unsigned int double_warp_size_power, REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real, 
+		 unsigned int globalThreadId, vector_td<unsigned int,D> domainPos, unsigned int sharedMemFirstCellIdx )
+{
+  // The accumulators live in the shared memory buffer, viewed as REALs
+  REAL *shared_mem = (REAL*) _shared_mem;
+
+  // Cell position as reald
+  vector_td<REAL,D> cell_pos = vector_td<REAL,D>( domainPos );
+  
+  // Convolve samples onto the domain (shared memory)
+  const unsigned int frame_offset = blockIdx.y*number_of_domains;
+  for( unsigned int i=bucket_begin[globalThreadId+frame_offset]; i<bucket_end[globalThreadId+frame_offset]; i++ )
+    {
+      // TODO (from the original author): safety precaution; tuples_last[i] is read unchecked
+      unsigned int sampleIdx = tuples_last[i];
+
+      // TODO (from the original author): safety precaution; sampleIdx is used unchecked
+      vector_td<REAL,D> sample_pos = traj_positions[sampleIdx];
+      
+      // Calculate the distance between the cell and the sample
+      vector_td<REAL,D> delta = abs(sample_pos-cell_pos);
+      vector_td<REAL,D> half_W_vec( half_W );
+  
+      // Check if sample will contribute
+      if( weak_greater(delta, half_W_vec ))
+	continue;
+      
+      // Compute convolution weight in REAL precision (a 'float' here would truncate it for REAL=double)
+      const REAL weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+      
+      // Safety measure: skip samples whose weight is not finite
+      if( !isfinite(weight) )
+      	continue;
+      
+      // Apply Kaiser-Bessel filter to input images
+      for( unsigned int batch=0; batch<number_of_batches; batch++ ){
+	// Sample value of this batch
+	complext<REAL>sample_val = samples[sampleIdx+batch*gridDim.y*number_of_samples];
+
+	// Apply filter to shared memory domain. 
+	shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)] += (weight*sample_val.vec[0]);
+	shared_mem[sharedMemFirstCellIdx+(batch<<double_warp_size_power)+warpSize] += (weight*sample_val.vec[1]);
+      }
+    }
+}
+
+//
+// kernel main: one thread per grid cell ("domain") of the frame selected by blockIdx.y
+//
+
+template<class REAL, unsigned int D> __global__ void
+NFFT_H_convolve_kernel( typename reald<REAL,D>::Type alpha, typename reald<REAL,D>::Type beta, REAL W,
+			vector_td<unsigned int,D> domain_count_grid, unsigned int number_of_samples, unsigned int number_of_batches,
+			vector_td<REAL,D> *traj_positions, complext<REAL>*image, complext<REAL>*samples,
+			unsigned int *tuples_last, unsigned int *bucket_begin, unsigned int *bucket_end, unsigned int double_warp_size_power, 
+			REAL half_W, REAL one_over_W, vector_td<REAL,D> matrix_size_os_real )
+{
+  
+  // Global thread index
+  const unsigned int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // Number of domains
+  const unsigned int number_of_domains = prod(domain_count_grid);
+
+  // Check if we are within bounds
+  if( index >= number_of_domains )
+    return;
+  
+  // Mapped global thread index (actually we don't use a map currently)
+  const unsigned int domainIdx = index; 
+
+  // Compute global domain position
+  const vector_td<unsigned int,D> domainPos = idx_to_co<D>( domainIdx, domain_count_grid );
+	
+  // Number of reals accumulated per thread (real and imaginary part for each batch)
+  const unsigned int num_reals = number_of_batches<<1;
+
+  // All shared memory reals corresponding to domain 'threadIdx.x' are located in bank threadIdx.x%warp_size to limit bank conflicts
+  const unsigned int scatterSharedMemStart = (threadIdx.x/warpSize)*warpSize;
+  const unsigned int scatterSharedMemStartOffset = threadIdx.x&(warpSize-1); // a faster way of saying (threadIdx.x%warpSize) 
+  const unsigned int sharedMemFirstCellIdx = scatterSharedMemStart*num_reals + scatterSharedMemStartOffset;
+
+  REAL *shared_mem = (REAL*) _shared_mem;
+
+  // Initialize shared memory: zero this thread's num_reals accumulators (stride warpSize)
+  for( unsigned int i=0; i<num_reals; i++ )
+    shared_mem[sharedMemFirstCellIdx+warpSize*i] = REAL(0);
+  
+  // Accumulate the samples of this cell's bucket (arbitrary sample trajectories).
+  NFFT_H_convolve<REAL, D>
+    ( alpha, beta, W, number_of_samples, number_of_batches, number_of_domains,
+      traj_positions, samples, tuples_last, bucket_begin, bucket_end,
+      double_warp_size_power, half_W, one_over_W,  matrix_size_os_real, index, domainPos, sharedMemFirstCellIdx );
+  
+  // Write the accumulated cell values to global memory ('image')
+  NFFT_H_output<REAL>( number_of_batches, image, double_warp_size_power, number_of_domains, index, sharedMemFirstCellIdx );
+}
diff --git a/toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu b/toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu
new file mode 100644
index 0000000..95ecbcc
--- /dev/null
+++ b/toolboxes/nfft/gpu/NFFT_preprocess_kernel.cu
@@ -0,0 +1,171 @@
+//
+// NFFT_H preprocessing kernels
+//
+
+// Functor converting an input trajectory in [-1/2;1/2] to [0;matrix_size_os+matrix_size_wrap]:
+// each position is multiplied component-wise by 'matrix' and then offset by 'bias'.
+template<class REAL, unsigned int D> struct trajectory_scale
+{
+  typename reald<REAL,D>::Type matrix, bias;
+
+  // Construct from the scale ('m') and offset ('b') vectors
+  trajectory_scale( const typename reald<REAL,D>::Type &m, const typename reald<REAL,D>::Type &b )
+    : matrix(m), bias(b) {}
+
+  // Apply the scaling to a single trajectory position
+  __host__ __device__
+  typename reald<REAL,D>::Type operator()(const typename reald<REAL,D>::Type &in) const {
+    return component_wise_mul<REAL,D>(in,matrix)+bias;
+  }
+};
+
+// Functor: number of integer grid cells within half_W of position 'p', multiplied over all D dimensions.
+template<class REAL, unsigned int D> struct compute_num_cells_per_sample
+{
+  __host__ __device__
+  compute_num_cells_per_sample(REAL _half_W) : half_W(_half_W) {}
+  
+  __host__ __device__
+  unsigned int operator()(typename reald<REAL,D>::Type p) const
+  {
+    unsigned int num_cells = 1;
+    for( unsigned int dim=0; dim<D; dim++ ){
+      // Read the component through 'vec' so it is interpreted as REAL (a float* cast is wrong for REAL=double)
+      const unsigned int upper_limit = (unsigned int)floor(p.vec[dim]+half_W);
+      const unsigned int lower_limit = (unsigned int)ceil(p.vec[dim]-half_W);
+      num_cells *= (upper_limit-lower_limit+1);
+    }
+    return num_cells;
+  }
+  REAL half_W;
+};
+
+// 1D: writes one (grid cell index, sample index) pair for every cell within half_W of position 'p'.
+template<class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,1>::Type &p, typename uintd<1>::Type &matrix_size_os, typename uintd<1>::Type &matrix_size_wrap, 
+	      REAL half_W, unsigned int *write_offsets, unsigned int *tuples_first, unsigned int *tuples_last )
+{
+  // Inclusive cell range (NOTE(review): the unsigned casts assume non-negative bounds; confirm)
+  const unsigned int x_first = (unsigned int)ceil(p.vec[0]-half_W);
+  const unsigned int x_last = (unsigned int)floor(p.vec[0]+half_W);
+  // Output starts at write_offsets[sample_idx-1] (0 for the first sample)
+  unsigned int out = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  const unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int x=x_first; x<=x_last; x++ ){
+    typename uintd<1>::Type co; co.vec[0] = x;
+    tuples_first[out] = co_to_idx<1>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+    tuples_last[out++] = sample_idx;
+  }
+}
+
+// 2D: writes one (grid cell index, sample index) pair for every cell within half_W of position 'p'.
+template<class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,2>::Type &p, typename uintd<2>::Type &matrix_size_os, typename uintd<2>::Type &matrix_size_wrap, 
+	      REAL half_W, unsigned int *write_offsets, unsigned int *tuples_first, unsigned int *tuples_last )
+{
+  // Inclusive cell range per dimension (NOTE(review): the unsigned casts assume non-negative bounds; confirm)
+  const unsigned int x_first = (unsigned int)ceil(p.vec[0]-half_W);
+  const unsigned int x_last = (unsigned int)floor(p.vec[0]+half_W);
+  const unsigned int y_first = (unsigned int)ceil(p.vec[1]-half_W);
+  const unsigned int y_last = (unsigned int)floor(p.vec[1]+half_W);
+  // Output starts at write_offsets[sample_idx-1] (0 for the first sample)
+  unsigned int out = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  const unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int y=y_first; y<=y_last; y++ ){
+    for( unsigned int x=x_first; x<=x_last; x++ ){
+      typename uintd<2>::Type co; co.vec[0] = x; co.vec[1] = y;
+      tuples_first[out] = co_to_idx<2>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+      tuples_last[out++] = sample_idx;
+    }
+  }
+}
+
+// 3D: writes one (grid cell index, sample index) pair for every cell within half_W of position 'p'.
+template <class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,3>::Type &p, typename uintd<3>::Type &matrix_size_os, typename uintd<3>::Type &matrix_size_wrap, 
+	      REAL half_W, unsigned int *write_offsets, unsigned int *tuples_first, unsigned int *tuples_last )
+{
+  // Inclusive cell range per dimension (NOTE(review): the unsigned casts assume non-negative bounds; confirm)
+  const unsigned int x_first = (unsigned int)ceil(p.vec[0]-half_W);
+  const unsigned int x_last = (unsigned int)floor(p.vec[0]+half_W);
+  const unsigned int y_first = (unsigned int)ceil(p.vec[1]-half_W);
+  const unsigned int y_last = (unsigned int)floor(p.vec[1]+half_W);
+  const unsigned int z_first = (unsigned int)ceil(p.vec[2]-half_W);
+  const unsigned int z_last = (unsigned int)floor(p.vec[2]+half_W);
+  // Output starts at write_offsets[sample_idx-1] (0 for the first sample)
+  unsigned int out = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  const unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int z=z_first; z<=z_last; z++ ){
+    for( unsigned int y=y_first; y<=y_last; y++ ){
+      for( unsigned int x=x_first; x<=x_last; x++ ){
+        typename uintd<3>::Type co; co.vec[0] = x; co.vec[1] = y; co.vec[2] = z;
+        tuples_first[out] = co_to_idx<3>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+        tuples_last[out++] = sample_idx;
+      }
+    }
+  }
+}
+
+// 4D: writes one (grid cell index, sample index) pair for every cell within half_W of position 'p'.
+template <class REAL> __inline__ __device__ void
+output_pairs( unsigned int sample_idx, unsigned int frame, 
+	      typename reald<REAL,4>::Type &p, typename uintd<4>::Type &matrix_size_os, typename uintd<4>::Type &matrix_size_wrap, 
+	      REAL half_W, unsigned int *write_offsets, unsigned int *tuples_first, unsigned int *tuples_last )
+{
+  // Inclusive cell range per dimension (NOTE(review): the unsigned casts assume non-negative bounds; confirm)
+  const unsigned int x_first = (unsigned int)ceil(p.vec[0]-half_W);
+  const unsigned int x_last = (unsigned int)floor(p.vec[0]+half_W);
+  const unsigned int y_first = (unsigned int)ceil(p.vec[1]-half_W);
+  const unsigned int y_last = (unsigned int)floor(p.vec[1]+half_W);
+  const unsigned int z_first = (unsigned int)ceil(p.vec[2]-half_W);
+  const unsigned int z_last = (unsigned int)floor(p.vec[2]+half_W);
+  const unsigned int w_first = (unsigned int)ceil(p.vec[3]-half_W);
+  const unsigned int w_last = (unsigned int)floor(p.vec[3]+half_W);
+  // Output starts at write_offsets[sample_idx-1] (0 for the first sample)
+  unsigned int out = (sample_idx==0) ? 0 : write_offsets[sample_idx-1];
+  const unsigned int frame_offset = frame*prod(matrix_size_os+matrix_size_wrap);
+  for( unsigned int w=w_first; w<=w_last; w++ ){
+    for( unsigned int z=z_first; z<=z_last; z++ ){
+      for( unsigned int y=y_first; y<=y_last; y++ ){
+        for( unsigned int x=x_first; x<=x_last; x++ ){
+          typename uintd<4>::Type co; co.vec[0] = x; co.vec[1] = y; co.vec[2] = z; co.vec[3] = w;
+          tuples_first[out] = co_to_idx<4>(co, matrix_size_os+matrix_size_wrap)+frame_offset;
+          tuples_last[out++] = sample_idx;
+        }
+      }
+    }
+  }
+}
+
+template<class REAL, unsigned int D> __global__ void
+write_pairs_kernel( typename uintd<D>::Type matrix_size_os, typename uintd<D>::Type matrix_size_wrap, unsigned int num_samples_per_frame, REAL half_W, 
+		    typename reald<REAL,D>::Type *traj_positions, unsigned int *write_offsets, unsigned int *tuples_first, unsigned int *tuples_last )
+{
+  // One thread per sample of the frame selected by blockIdx.y; surplus threads exit immediately
+  const unsigned int idx_in_frame = blockIdx.x*blockDim.x + threadIdx.x;
+  const unsigned int frame = blockIdx.y;
+  if( idx_in_frame >= num_samples_per_frame )
+    return;
+
+  // Sample index across all frames
+  const unsigned int sample_idx = idx_in_frame + frame*num_samples_per_frame;
+  typename reald<REAL,D>::Type p = traj_positions[sample_idx];
+  output_pairs<REAL>( sample_idx, frame, p, matrix_size_os, matrix_size_wrap, half_W, write_offsets, tuples_first, tuples_last );
+}
+
+template <class REAL, unsigned int D> void 
+write_pairs( typename uintd<D>::Type matrix_size_os, typename uintd<D>::Type matrix_size_wrap, unsigned int num_samples_per_frame, unsigned int num_frames, REAL W, 
+	     typename reald<REAL,D>::Type *traj_positions, unsigned int *write_offsets, unsigned int *tuples_first, unsigned int *tuples_last )
+{
+  // Launch configuration: 256 threads per block, enough blocks in x to cover a frame, one grid row per frame
+  const dim3 threads(256);
+  const dim3 blocks((int)ceil((double)num_samples_per_frame/(double)threads.x), num_frames);
+
+  // The kernel expects half the kernel width
+  write_pairs_kernel<REAL,D><<< blocks, threads >>>
+    ( matrix_size_os, matrix_size_wrap, num_samples_per_frame, REAL(0.5)*W, traj_positions, write_offsets, tuples_first, tuples_last );
+  CHECK_FOR_CUDA_ERROR();
+}
diff --git a/toolboxes/nfft/gpu/cuNFFT.cu b/toolboxes/nfft/gpu/cuNFFT.cu
new file mode 100644
index 0000000..e3d3b14
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFT.cu
@@ -0,0 +1,1457 @@
+/*
+  CUDA implementation of the NFFT.
+
+  -----------
+
+  Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+  T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+  IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+
+  Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+  T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+  IEEE Transactions on Medical Imaging 2009; 28(12): 1974-1985. 
+*/
+
+// Includes - Gadgetron
+#include "cuNFFT.h"
+#include "cuNDFFT.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "vector_td_utilities.h"
+#include "vector_td_io.h"
+#include "cudaDeviceManager.h"
+#include "check_CUDA.h"
+
+// Includes - CUDA
+#include <device_functions.h>
+#include <math_constants.h>
+#include <cufft.h>
+
+// Includes - Thrust
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+#include <thrust/binary_search.h>
+
+// Includes - stdlibs
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include <stdexcept>
+
+using namespace std;
+using namespace thrust;
+using namespace Gadgetron;
+
+// Kernel configuration  
+#define NFFT_MAX_COILS_COMPUTE_1x    8
+#define NFFT_MAX_COILS_COMPUTE_2x   16
+#define NFFT_THREADS_PER_KERNEL    192
+
+// Reference to shared memory
+extern __shared__ char _shared_mem[];
+
+// Includes containing the NFFT convolution implementation
+#include "KaiserBessel_kernel.cu"
+#include "NFFT_C2NC_conv_kernel.cu"
+#include "NFFT_NC2C_conv_kernel.cu"
+#include "NFFT_NC2C_atomic_conv_kernel.cu"
+#include "NFFT_preprocess_kernel.cu"
+
+// Placeholder type for unused template parameters (default function template arguments would require C++0x)
+typedef float dummy;
+
+// Declaration of the NC2C convolution; the ATOMICS template parameter distinguishes the atomic/non-atomic variants.
+// We would love to hide this inside the class, but the compiler core dumps on us when we try...
+//
+template<class REAL, unsigned int D, bool ATOMICS> struct _convolve_NFFT_NC2C{
+  static bool apply( cuNFFT_plan<REAL,D,ATOMICS> *plan, 
+                     cuNDArray<complext<REAL> > *in, 
+                     cuNDArray<complext<REAL> > *out, 
+                     bool accumulate );
+};
+
+// Common multi-device handling: prepare
+//
+// Records the current CUDA device in *old_device and makes 'device' current when they differ.
+// For each non-null input, *inN_int receives the input itself if it already reports 'device',
+// otherwise a newly allocated copy, which restore() later deletes.
+// Throws cuda_error if the device cannot be queried or set; otherwise returns true.
+template<class I1, class I2, class I3>
+static bool prepare( int device, int *old_device, 
+                     cuNDArray<I1> *in1,       cuNDArray<I1> **in1_int,
+                     cuNDArray<I2> *in2 = 0x0, cuNDArray<I2> **in2_int = 0x0,
+                     cuNDArray<I3> *in3 = 0x0, cuNDArray<I3> **in3_int = 0x0 )
+{
+  // Remember which device is current on entry
+  if( cudaGetDevice(old_device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT : unable to get device no");
+  }
+
+  // Switch devices only when necessary
+  const bool must_switch = ( device != *old_device );
+  if( must_switch && cudaSetDevice(device) != cudaSuccess ) {
+    throw cuda_error("Error : cuNFFT : unable to set device no");
+  }
+
+  // Transfer arrays to compute device if necessary
+  if( in1 ){
+    const bool on_device = ( device == in1->get_device() );
+    *in1_int = on_device ? in1 : new cuNDArray<I1>(*in1); // the copy is the device transfer
+  }
+
+  if( in2 ){
+    const bool on_device = ( device == in2->get_device() );
+    *in2_int = on_device ? in2 : new cuNDArray<I2>(*in2); // the copy is the device transfer
+  }
+
+  if( in3 ){
+    const bool on_device = ( device == in3->get_device() );
+    *in3_int = on_device ? in3 : new cuNDArray<I3>(*in3); // the copy is the device transfer
+  }
+
+  return true;
+}  
+
+// Common multi-device handling: restore
+//
+// Counterpart of prepare(): assigns *in1_int to *out when their devices differ, deletes the internal
+// arrays whose device differs from that of their originals, and makes 'old_device' current again.
+template<class I1, class I2, class I3>
+static bool restore( int old_device, cuNDArray<I1> *out, 
+                     cuNDArray<I1> *in1, cuNDArray<I1> *in1_int,
+                     cuNDArray<I2> *in2 = 0x0, cuNDArray<I2> *in2_int = 0x0,
+                     cuNDArray<I3> *in3 = 0x0, cuNDArray<I3> *in3_int = 0x0 )
+{
+  // Copy the result back first, while in1_int still exists (device transfer by assignment)
+  if( in1 && out && out->get_device() != in1_int->get_device() )
+    *out = *in1_int;
+
+  // An internal array needs deletion only if it was created in ::prepare(), which is
+  // recognized by its device differing from that of the array it was made from
+  if( in1 && in1->get_device() != in1_int->get_device() )
+    delete in1_int;
+  if( in2 && in2->get_device() != in2_int->get_device() )
+    delete in2_int;
+  if( in3 && in3->get_device() != in3_int->get_device() )
+    delete in3_int;
+
+  // Get current Cuda device
+  int current_device;
+  if( cudaGetDevice(&current_device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT : unable to get device no");
+  }
+
+  // Restore old device
+  const bool must_switch = ( current_device != old_device );
+  if( must_switch && cudaSetDevice(old_device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT : unable to restore device no");
+  }
+
+  return true;
+}
+
+
+//
+// Public class methods
+//
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::cuNFFT_plan()
+{
+  // Minimal initialization only; setup() is not called here (preprocess() requires it to have been invoked)
+  barebones();
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::cuNFFT_plan( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W, int device )
+{
+  // Minimal initialization first, so that setup() starts from a defined state
+  barebones();
+
+  // Setup plan; setup() throws on invalid arguments or CUDA device errors
+  setup( matrix_size, matrix_size_os, W, device );
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::~cuNFFT_plan()
+{
+  wipe(NFFT_WIPE_ALL); // presumably releases everything the plan holds (wipe() is defined elsewhere)
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W, int _device )
+{
+  // (Re)initialize the plan for the given sizes, kernel width W and device; start by freeing memory
+  wipe(NFFT_WIPE_ALL);
+
+  //
+  // Select the device: a negative _device means "use the device that is currently active"
+  //
+
+  if( _device<0 ){
+    if( cudaGetDevice( &device ) != cudaSuccess ){
+      throw cuda_error("Error: cuNFFT_plan::setup: unable to determine device properties.");
+    }
+  }
+  else
+    device = _device;
+
+  // The convolution does not work properly for very small convolution kernel widths
+  // (experimentally observed limit)
+
+  if( W < REAL(1.8) ) {
+    throw std::runtime_error("Error: the convolution kernel width for the cuNFFT plan is too small.");
+  }
+
+  typename uint64d<D>::Type vec_warp_size( (size_t)(cudaDeviceManager::Instance()->warp_size(device)) );
+
+  //
+  // Check input against certain requirements: both matrix sizes must be multiples of the warp size
+  //
+  
+  if( sum(matrix_size%vec_warp_size) || sum(matrix_size_os%vec_warp_size) ){
+    //std::cout << "Matrix size: " << matrix_size << std::endl;
+    //std::cout << "Matrix size os: " << matrix_size_os << std::endl;
+    //std::cout << "Warp size: " << vec_warp_size << std::endl;
+    throw std::runtime_error("Error: Illegal matrix size for the cuNFFT plan (not a multiple of the warp size)");
+  }
+
+  //
+  // Setup private variables
+  //
+
+  this->matrix_size = matrix_size;
+  this->matrix_size_os = matrix_size_os;
+
+  REAL W_half = REAL(0.5)*W;
+  vector_td<REAL,D> W_vec(W_half);
+
+  // Wrap zone: ceil(W/2) in every dimension, doubled by the shift below
+  matrix_size_wrap = vector_td<size_t,D>( ceil(W_vec) );
+  matrix_size_wrap<<=1; 
+  
+  alpha = vector_td<REAL,D>(matrix_size_os) / vector_td<REAL,D>(matrix_size);
+  
+  typename reald<REAL,D>::Type ones(1);
+  if( weak_less( alpha, ones ) ){
+    throw std::runtime_error("Error: cuNFFT : Illegal oversampling ratio suggested");
+  }
+
+  this->W = W;
+  
+  // Compute Kaiser-Bessel beta
+  compute_beta();
+  
+  int device_no_old;
+  if (cudaGetDevice(&device_no_old) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::setup: unable to get device no");
+  }  
+  if( device != device_no_old && cudaSetDevice(device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::setup: unable to set device");
+  }  
+
+  // Calculate deapodization filter (with the plan's device made current above)
+  compute_deapodization_filter();
+  
+  initialized = true;
+
+  if( device != device_no_old && cudaSetDevice(device_no_old) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::setup: unable to restore device");
+  }
+}
+
+//
+// Preprocess a trajectory so the plan can subsequently grid to/from it.
+//
+// trajectory : sample positions; every coordinate must lie in [-1/2;1/2]. The size of the first
+//              dimension is taken as #samples per frame, the remaining elements as frames.
+// mode       : NFFT_PREP_C2NC only needs the rescaled trajectory. Any other mode additionally
+//              builds the sorted (grid cell, sample) lookup tables, unless ATOMICS is set.
+//
+// Throws std::runtime_error on invalid input and cuda_error on device errors.
+// NOTE(review): an exception thrown between prepare() and restore() still leaves the plan's
+// device active for the caller.
+//
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory, NFFT_prep_mode mode )
+{
+  if( !trajectory || trajectory->get_number_of_elements()==0 ){
+    throw std::runtime_error("Error: cuNFFT_plan::preprocess: invalid trajectory");
+  }
+  
+  if( !initialized ){
+    throw std::runtime_error("Error: cuNFFT_plan::preprocess: cuNFFT_plan::setup must be invoked prior to preprocessing.");
+  }
+
+  // Drop the results of any earlier preprocessing
+  wipe(NFFT_WIPE_PREPROCESSING);
+
+  cuNDArray<typename reald<REAL,D>::Type> *trajectory_int;
+  int old_device;
+
+  if( !prepare<typename reald<REAL,D>::Type,dummy,dummy>(device, &old_device, trajectory, &trajectory_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::preprocess: device preparation error.");
+  }
+    
+  number_of_samples = trajectory_int->get_size(0);
+  number_of_frames = trajectory_int->get_number_of_elements()/number_of_samples;
+
+  // Make sure that the trajectory values are within range [-1/2;1/2].
+  // The array is scanned as a flat list of REALs (D per trajectory element).
+  thrust::pair< thrust::device_ptr<REAL>, thrust::device_ptr<REAL> > mm_pair = 
+    thrust::minmax_element( device_pointer_cast<REAL>((REAL*)trajectory_int->get_data_ptr()), 
+                            device_pointer_cast<REAL>(((REAL*)trajectory_int->get_data_ptr())+trajectory_int->get_number_of_elements()*D ));
+  
+  if( *mm_pair.first < REAL(-0.5) || *mm_pair.second > REAL(0.5) ){
+    throw std::runtime_error("Error: cuNFFT::preprocess : trajectory out of range [-1/2;1/2]");
+  }
+  
+  // Make Thrust device vector of trajectory and samples
+  device_vector< vector_td<REAL,D> > trajectory_positions_in
+    ( device_pointer_cast< vector_td<REAL,D> >(trajectory_int->get_data_ptr()), 
+      device_pointer_cast< vector_td<REAL,D> >(trajectory_int->get_data_ptr()+trajectory_int->get_number_of_elements() ));
+  
+  trajectory_positions = new device_vector< vector_td<REAL,D> >( trajectory_int->get_number_of_elements() );
+
+  CHECK_FOR_CUDA_ERROR();
+
+  vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>( matrix_size_os );
+  vector_td<REAL,D> matrix_size_os_plus_wrap_real = vector_td<REAL,D>( (matrix_size_os+matrix_size_wrap)>>1 );
+
+  // convert input trajectory in [-1/2;1/2] to [0;matrix_size_os]
+  thrust::transform( trajectory_positions_in.begin(), trajectory_positions_in.end(), trajectory_positions->begin(), 
+                     trajectory_scale<REAL,D>(matrix_size_os_real, matrix_size_os_plus_wrap_real) );
+  
+  CHECK_FOR_CUDA_ERROR();
+
+  if( !( mode == NFFT_PREP_C2NC || ATOMICS )){
+
+    // allocate storage for and compute temporary prefix-sum variable (#cells influenced per sample)
+    device_vector<unsigned int> c_p_s(trajectory_int->get_number_of_elements());
+    device_vector<unsigned int> c_p_s_ps(trajectory_int->get_number_of_elements());
+    CHECK_FOR_CUDA_ERROR();
+    
+    REAL half_W = REAL(0.5)*W;
+    thrust::plus<unsigned int> binary_op;
+    thrust::transform(trajectory_positions->begin(), trajectory_positions->end(), c_p_s.begin(), compute_num_cells_per_sample<REAL,D>(half_W));
+    inclusive_scan( c_p_s.begin(), c_p_s.end(), c_p_s_ps.begin(), binary_op ); // prefix sum
+    
+    // Build the vector of (grid_idx, sample_idx) tuples. Actually kept in two separate vectors.
+    unsigned int num_pairs = c_p_s_ps.back();
+    c_p_s.clear();
+
+    // The grid indices are only needed inside this block, so they live in an automatic
+    // object: it is released at the end of the block and also when an exception passes through.
+    device_vector<unsigned int> tuples_first(num_pairs);
+    tuples_last = new device_vector<unsigned int>(num_pairs);
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    // Fill tuple vector
+    write_pairs<REAL,D>( vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), number_of_samples, number_of_frames, W,
+                         raw_pointer_cast(&(*trajectory_positions)[0]), raw_pointer_cast(&c_p_s_ps[0]), 
+                         raw_pointer_cast(&tuples_first[0]), raw_pointer_cast(&(*tuples_last)[0]) );
+    c_p_s_ps.clear();
+
+    // Sort by grid indices
+    sort_by_key( tuples_first.begin(), tuples_first.end(), tuples_last->begin() );
+    
+    // each bucket_begin[i] indexes the first element of bucket i's list of points
+    // each bucket_end[i] indexes one past the last element of bucket i's list of points
+
+    // One bucket per cell of the oversampled grid including its wrap zone, per frame
+    const size_t num_buckets = number_of_frames*prod(matrix_size_os+matrix_size_wrap);
+
+    bucket_begin = new device_vector<unsigned int>(num_buckets);
+    bucket_end   = new device_vector<unsigned int>(num_buckets);
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    // find the beginning of each bucket's list of points
+    counting_iterator<unsigned int> search_begin(0);
+    lower_bound(tuples_first.begin(), tuples_first.end(), search_begin, search_begin + num_buckets, bucket_begin->begin() );
+    
+    // find the end of each bucket's list of points
+    upper_bound(tuples_first.begin(), tuples_first.end(), search_begin, search_begin + num_buckets, bucket_end->begin() );
+  }
+
+  preprocessed_C2NC = true;
+
+  if( mode != NFFT_PREP_C2NC )
+    preprocessed_NC2C = true;
+
+  if( !restore<typename reald<REAL,D>::Type,dummy,dummy>(old_device, trajectory, trajectory, trajectory_int) ){
+    throw cuda_error("Error: cuNFFT_plan::preprocess: unable to restore compute device.");
+  }
+}
+
+//
+// Run a complete NFFT (convolution, FFT and deapodization) in the direction given by 'mode'.
+//
+// For the C2NC modes 'in' is the image and 'out' receives the samples; for the NC2C modes
+// 'in' holds the samples and 'out' receives the image. The image may have either the plan's
+// matrix size or its oversampled matrix size; in the former case a temporary oversampled
+// image is used and padded/cropped as needed. 'dcw' (optional) is multiplied onto a copy of
+// the samples in the NC2C modes.
+//
+// Throws std::runtime_error on an unknown mode or inconsistent arguments and cuda_error on
+// device errors.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                                                 cuNDArray<REAL> *dcw, NFFT_comp_mode mode )
+{  
+  // Classify the mode by direction of the gridding step
+  const bool image_is_input  = ( mode == NFFT_FORWARDS_C2NC || mode == NFFT_BACKWARDS_C2NC );
+  const bool image_is_output = ( mode == NFFT_FORWARDS_NC2C || mode == NFFT_BACKWARDS_NC2C );
+
+  if( !image_is_input && !image_is_output ){
+    throw std::runtime_error("Error: cuNFFT_plan::compute: unknown mode");
+  }
+
+  unsigned char components;
+  if( image_is_input )
+    components = _NFFT_CONV_C2NC + _NFFT_FFT + _NFFT_DEAPODIZATION;
+  else
+    components = _NFFT_CONV_NC2C + _NFFT_FFT + _NFFT_DEAPODIZATION;
+
+  // Caller-side arrays, named by role
+  cuNDArray<complext<REAL> > *image_arg   = image_is_input ? in  : out;
+  cuNDArray<complext<REAL> > *samples_arg = image_is_input ? out : in;
+
+  // Validity checks
+  check_consistency( samples_arg, image_arg, dcw, components );
+
+  cuNDArray<complext<REAL> > *in_int = 0x0, *out_int = 0x0;
+  cuNDArray<REAL> *dcw_int = 0x0;
+  int old_device;
+
+  if( !prepare<complext<REAL>, complext<REAL>, REAL>
+      (device, &old_device, in, &in_int, out, &out_int, dcw, &dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::compute: device preparation error.");
+  }
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t,D>( *image_arg->get_dimensions() );
+  const bool oversampled_image = (image_dims==matrix_size_os);
+
+  // Dimensions of an oversampled image carrying the same batch dimensions as the given one
+  vector<size_t> vec_dims = to_std_vector(matrix_size_os);
+  for( unsigned int d=D; d<image_arg->get_number_of_dimensions(); d++ )
+    vec_dims.push_back(image_arg->get_size(d));
+
+  if( image_is_input ){
+
+    // Image -> samples: operate on an oversampled image (the input itself or a padded copy)
+    cuNDArray<complext<REAL> > *working_image = in_int;
+
+    if( !oversampled_image ){
+      working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+      pad<complext<REAL>, D>( in_int, working_image );
+    }
+
+    if( mode == NFFT_FORWARDS_C2NC )
+      compute_NFFT_C2NC( working_image, out_int );
+    else
+      compute_NFFTH_C2NC( working_image, out_int );
+
+    if( !oversampled_image ){
+      delete working_image;
+    }
+  }
+  else{
+
+    // Samples -> image
+
+    // Density compensation is applied to a copy so the caller's samples stay untouched
+    cuNDArray<complext<REAL> > *working_samples = in_int;
+
+    if( dcw_int ){
+      working_samples = new cuNDArray<complext<REAL> >(*in_int);
+      *working_samples *= *dcw_int;
+    }
+
+    // Grid onto an oversampled image (the output itself or a temporary)
+    cuNDArray<complext<REAL> > *working_image = out_int;
+
+    if( !oversampled_image ){
+      working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+    }
+
+    if( mode == NFFT_FORWARDS_NC2C )
+      compute_NFFT_NC2C( working_samples, working_image );
+    else
+      compute_NFFTH_NC2C( working_samples, working_image );
+
+    if( !oversampled_image ){
+      crop<complext<REAL>, D>( (matrix_size_os-matrix_size)>>1, working_image, out_int );
+      delete working_image;
+    }
+
+    if( dcw_int ){
+      delete working_samples;
+    }
+  }
+
+  if( !restore<complext<REAL> ,complext<REAL> ,REAL>
+      (old_device, out, out, out_int, in, in_int, dcw, dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::compute: unable to restore compute device.");
+  }
+  
+  CHECK_FOR_CUDA_ERROR();
+}
+
+//
+// Apply the forward NFFT (C2NC) followed by the adjoint NFFT (NC2C) to an image.
+//
+// in           : input image, either of the plan's matrix size or its oversampled size.
+//                An oversampled input is used as the working buffer and is overwritten.
+// out          : receives the result; must hold as many elements as 'in'
+// dcw          : optional density compensation weights applied between the two transforms
+// halfway_dims : dimensions of the intermediate samples array
+//
+// Throws std::runtime_error on invalid arguments and cuda_error on device errors.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::mult_MH_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                                                   cuNDArray<REAL> *dcw, std::vector<size_t> halfway_dims )
+{
+  // Validity checks
+  
+  unsigned char components = _NFFT_CONV_C2NC + _NFFT_CONV_NC2C + _NFFT_FFT + _NFFT_DEAPODIZATION;
+  
+  // Both arrays are dereferenced below, before check_consistency() gets a chance to look at them
+  if( !in || !out ){
+    throw std::runtime_error("Error: cuNFFT_plan::mult_MH_M: 0x0 input/output not accepted");
+  }
+
+  if( in->get_number_of_elements() != out->get_number_of_elements() ){
+    throw std::runtime_error("Error: cuNFFT_plan::mult_MH_M: in/out image sizes mismatch");
+  }
+  
+  // Held by a smart pointer so the intermediate samples are released on every exit path
+  boost::shared_ptr< cuNDArray<complext<REAL> > > working_samples( new cuNDArray<complext<REAL> >(&halfway_dims) );
+
+  check_consistency( working_samples.get(), in, dcw, components );
+  
+  cuNDArray<complext<REAL> > *in_int = 0x0;
+  cuNDArray<complext<REAL> > *out_int = 0x0;
+  cuNDArray<REAL> *dcw_int = 0x0;
+  int old_device;
+  
+  if( !prepare<complext<REAL>, complext<REAL>, REAL>
+      (device, &old_device, in, &in_int, out, &out_int, dcw, &dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::mult_MH_M: device preparation error.");
+  }
+  
+  cuNDArray<complext<REAL> > *working_image = 0x0;
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t,D>(*in->get_dimensions()); 
+  bool oversampled_image = (image_dims==matrix_size_os); 
+ 
+  // Dimensions of an oversampled image carrying the same batch dimensions as 'in'
+  vector<size_t> vec_dims = to_std_vector(matrix_size_os); 
+  for( unsigned int d=D; d<in->get_number_of_dimensions(); d++ )
+    vec_dims.push_back(in->get_size(d));
+  
+  if( !oversampled_image ){
+    working_image = new cuNDArray<complext<REAL> >(&vec_dims);
+    pad<complext<REAL>, D>( in_int, working_image );
+  }
+  else{
+    working_image = in_int;
+  }
+  
+  compute_NFFT_C2NC( working_image, working_samples.get() );
+  
+  // Density compensation
+  if( dcw_int ){
+    *working_samples *= *dcw_int;
+  }
+    
+  compute_NFFTH_NC2C( working_samples.get(), working_image );
+    
+  // The samples are no longer needed; free them while the plan's device is still active
+  working_samples.reset();
+    
+  if( !oversampled_image ){
+    crop<complext<REAL>, D>( (matrix_size_os-matrix_size)>>1, working_image, out_int );
+    delete working_image; working_image = 0x0;
+  }
+  else if( out_int != working_image ){
+    // The result was computed in place in the (oversampled) input buffer;
+    // it still has to be delivered to the output array.
+    *out_int = *working_image;
+  }
+        
+  if( !restore<complext<REAL> ,complext<REAL> ,REAL>
+      (old_device, out, out, out_int, in, in_int, dcw, dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::mult_MH_M: unable to restore compute device.");
+  }
+    
+  CHECK_FOR_CUDA_ERROR();
+}
+
+//
+// Run only the convolution (gridding) step of the NFFT.
+//
+// mode == NFFT_CONV_C2NC : 'in' is the image, 'out' receives the samples
+// mode == NFFT_CONV_NC2C : 'in' holds the samples, 'out' receives the image; 'dcw' (optional)
+//                          is multiplied onto a copy of the samples first
+//
+// The image must have the plan's oversampled matrix size. 'accumulate' is forwarded to the
+// convolution routines.
+//
+// All argument validation happens before the compute device is switched, so an invalid call
+// leaves the active device untouched.
+// Throws std::runtime_error on invalid arguments and cuda_error on device errors.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::convolve( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+                                                  cuNDArray<REAL> *dcw, NFFT_conv_mode mode, bool accumulate )
+{
+  if( mode != NFFT_CONV_C2NC && mode != NFFT_CONV_NC2C ){
+    throw std::runtime_error( "Error: cuNFFT_plan::convolve: unknown mode.");
+  }
+
+  unsigned char components;
+  cuNDArray<complext<REAL> > *samples, *image;
+
+  if( mode == NFFT_CONV_C2NC ){
+    components = _NFFT_CONV_C2NC;
+    image = in; samples = out;
+  }
+  else{
+    components = _NFFT_CONV_NC2C;
+    image = out; samples = in;
+  }
+
+  check_consistency( samples, image, dcw, components );
+
+  // Unlike compute(), no padding/cropping is done here
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t, D>( *image->get_dimensions() ); 
+  bool oversampled_image = (image_dims==matrix_size_os); 
+  
+  if( !oversampled_image ){
+    throw std::runtime_error("Error: cuNFFT_plan::convolve: ERROR: oversampled image not provided as input.");
+  }
+
+  cuNDArray<complext<REAL> > *in_int = 0x0, *out_int = 0x0;
+  cuNDArray<REAL> *dcw_int = 0x0;
+  int old_device;
+  
+  if( !prepare<complext<REAL>, complext<REAL>, REAL>
+      (device, &old_device, in, &in_int, out, &out_int, dcw, &dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::convolve: device preparation error.");
+  }
+
+  if( mode == NFFT_CONV_C2NC ){
+    convolve_NFFT_C2NC( in_int, out_int, accumulate );
+  }
+  else{
+    if( dcw_int ){
+      // Density compensation on a scoped copy: the caller's samples stay untouched and
+      // the copy is released when this block is left, normally or by exception.
+      cuNDArray<complext<REAL> > weighted_samples(*in_int);
+      weighted_samples *= *dcw_int;
+      _convolve_NFFT_NC2C<REAL,D,ATOMICS>::apply( this, &weighted_samples, out_int, accumulate );
+    }
+    else{
+      _convolve_NFFT_NC2C<REAL,D,ATOMICS>::apply( this, in_int, out_int, accumulate );
+    }
+  }
+
+  if( !restore<complext<REAL>, complext<REAL>, REAL>
+      (old_device, out, out, out_int, in, in_int, dcw, dcw_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::convolve: unable to restore compute device.");
+  }
+}
+
+//
+// Fourier transform the first D dimensions of 'data' in place.
+//
+// mode     : NFFT_FORWARDS runs the forward transform, any other value the inverse
+// do_scale : forwarded to the inverse transform only
+//
+// Throws cuda_error if the compute device cannot be prepared or restored.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::fft(cuNDArray<complext<REAL> > *data, NFFT_fft_mode mode, bool do_scale )
+{
+  cuNDArray<complext<REAL> > *data_int = 0x0;
+  int old_device;
+  
+  // Report device problems the same way compute() does instead of ignoring them
+  if( !prepare<complext<REAL>,dummy,dummy>( device, &old_device, data, &data_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::fft: device preparation error.");
+  }
+  
+  // Transform dimensions 0..D-1; any further dimensions are left alone
+  typename uint64d<D>::Type _dims_to_transform = counting_vec<size_t,D>();
+  vector<size_t> dims_to_transform = to_std_vector( _dims_to_transform );
+  
+  if( mode == NFFT_FORWARDS ){
+    cuNDFFT<REAL>::instance()->fft( data_int, &dims_to_transform );
+  }
+  else{
+    cuNDFFT<REAL>::instance()->ifft( data_int, &dims_to_transform, do_scale );
+  }
+
+  if( !restore<complext<REAL> ,dummy,dummy>(old_device, data, data, data_int) ){
+    throw cuda_error("Error: cuNFFT_plan::fft: unable to restore compute device.");
+  }
+}
+
+//
+// Multiply an oversampled image by the plan's deapodization filter, in place.
+//
+// The image size is validated before the compute device is switched, so an invalid call
+// leaves the active device untouched.
+// Throws std::runtime_error if the plan is not set up or the image does not have the
+// oversampled matrix size, and cuda_error on device errors.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::deapodize( cuNDArray<complext<REAL> > *image )
+{
+  unsigned char components;
+  components = _NFFT_FFT;
+  check_consistency( 0x0, image, 0x0, components );
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t, D>(*image->get_dimensions()); 
+  bool oversampled_image = (image_dims==matrix_size_os); 
+  
+  if( !oversampled_image ){
+    throw std::runtime_error( "Error: cuNFFT_plan::deapodize: ERROR: oversampled image not provided as input.");
+  }
+
+  cuNDArray<complext<REAL> > *image_int = 0x0;
+  int old_device;
+  
+  if( !prepare<complext<REAL>,dummy,dummy>(device, &old_device, image, &image_int ) ){
+    throw cuda_error("Error: cuNFFT_plan::deapodize: device preparation error.");
+  }
+
+  *image_int *= *deapodization_filter;
+    
+  if( !restore<complext<REAL> ,dummy,dummy>(old_device, image, image, image_int) ){
+    throw cuda_error("Error: cuNFFT_plan::deapodize: unable to restore compute device.");
+  }
+}
+
+//
+// Private class methods
+//
+
+//
+// Verify that the plan state and the given arrays allow the requested NFFT components
+// (a bitwise combination of the _NFFT_* flags) to run. Throws std::runtime_error on the
+// first violated requirement; returns normally otherwise.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::check_consistency( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image,
+                                                           cuNDArray<REAL> *weights, unsigned char components )
+{
+  // Decode the component flags once
+  const bool wants_c2nc = (components & _NFFT_CONV_C2NC) != 0;
+  const bool wants_nc2c = (components & _NFFT_CONV_NC2C) != 0;
+  const bool wants_convolution = wants_c2nc || wants_nc2c;
+  const bool wants_image_only_step = (components & _NFFT_FFT) || (components & _NFFT_DEAPODIZATION);
+
+  //
+  // Plan state
+  //
+
+  if( !initialized ){
+    throw std::runtime_error( "Error: cuNFFT_plan: Unable to proceed without setup.");
+  }
+
+  if( wants_c2nc && !preprocessed_C2NC ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to compute NFFT before preprocessing.");
+  }
+
+  // With atomics the C2NC preprocessing is sufficient for NC2C as well
+  const bool nc2c_ready = preprocessed_NC2C || (preprocessed_C2NC && ATOMICS);
+  if( wants_nc2c && !nc2c_ready ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to compute NFFT before preprocessing.");
+  }
+
+  //
+  // Required arrays
+  //
+
+  if( wants_convolution && ( !image || !samples ) ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to process 0x0 input/output.");
+  }
+
+  if( wants_image_only_step && !image ){
+    throw std::runtime_error("Error: cuNFFT_plan: Unable to process 0x0 input.");
+  }
+
+  //
+  // Image geometry: the leading D dimensions must equal either matrix size of the plan
+  //
+
+  if( image->get_number_of_dimensions() < D ){
+    throw std::runtime_error("Error: cuNFFT_plan: Number of image dimensions mismatch the plan.");
+  }    
+
+  typename uint64d<D>::Type image_dims = from_std_vector<size_t,D>( *image->get_dimensions() );
+  const bool matches_oversampled = (image_dims==matrix_size_os);
+  const bool matches_plain = (image_dims == matrix_size);
+
+  if( !matches_oversampled && !matches_plain ){
+    throw std::runtime_error("Error: cuNFFT_plan: Image dimensions mismatch.");
+  }
+
+  //
+  // Batch bookkeeping between samples and image
+  //
+
+  if( wants_convolution ){    
+
+    if( (samples->get_number_of_elements() == 0) || (samples->get_number_of_elements() % (number_of_frames*number_of_samples)) ){
+      throw std::runtime_error("Error: cuNFFT_plan: The number of samples is not a multiple of #samples/frame x #frames as requested through preprocessing");
+    }
+
+    unsigned int sample_batches = samples->get_number_of_elements()/(number_of_frames*number_of_samples);
+
+    // Everything beyond the first D image dimensions counts as frames x batches
+    unsigned int image_batches = 1;
+    for( unsigned int dim=D; dim<image->get_number_of_dimensions(); dim++ ){
+      image_batches *= image->get_size(dim);
+    }
+    image_batches /= number_of_frames;
+
+    if( sample_batches != image_batches ){
+      printf("\ncuNFFT::check_consistency() failed:\n#elements in the samples array: %ld.\n#samples from preprocessing: %d.\n#frames from preprocessing: %d.\nLeading to %d batches in the samples array.\nThe number of batches in the image array is %d.\n",samples->get_number_of_elements(), number_of_samples, number_of_frames, sample_batches, image_batches );
+      fflush(stdout);
+      throw std::runtime_error("Error: cuNFFT_plan: Number of batches mismatch between samples and image arrays");
+    }
+  }
+
+  //
+  // Density compensation weights: one per sample of a frame, or one per sample of all frames
+  //
+
+  if( wants_nc2c && weights ){
+    if( weights->get_number_of_elements() == 0 ||
+        ( weights->get_number_of_elements() != number_of_samples &&
+          weights->get_number_of_elements() != number_of_frames*number_of_samples ) ){
+      throw std::runtime_error("Error: cuNFFT_plan: The number of weights should match #samples/frame x #frames as requested through preprocessing");
+    }
+  }
+}
+
+//
+// Minimal initialization: reset the state flags, matrix sizes and member pointers, and
+// record the currently active CUDA device. Throws cuda_error if the device cannot be queried.
+//
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::barebones()
+{	
+  // State flags; these guard every access to the member pointers below
+  initialized = false;
+  preprocessed_C2NC = false;
+  preprocessed_NC2C = false;
+
+  // Matrix sizes
+  clear(matrix_size);
+  clear(matrix_size_os);
+
+  // Preprocessing results
+  trajectory_positions = 0x0;
+  tuples_last = 0x0;
+  bucket_begin = 0x0;
+  bucket_end = 0x0;
+
+  // Default to the device that is active right now
+  const cudaError_t status = cudaGetDevice(&device);
+  if( status != cudaSuccess ){
+    throw cuda_error("Error: cuNFFT_plan::barebones:: unable to get device no");
+  }
+}
+
+//
+// Release resources held by the plan.
+//
+// mode == NFFT_WIPE_ALL additionally drops the deapodization filter and marks the plan as
+// not set up; the preprocessing results are released for every mode.
+// The plan's device is made active while releasing and the previous device is restored after.
+// Throws cuda_error if the device cannot be queried or switched.
+//
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::wipe( NFFT_wipe_mode mode )
+{
+  // Get current Cuda device
+  int old_device;
+  if( cudaGetDevice(&old_device) != cudaSuccess ) {
+    throw cuda_error("Error: cuNFFT_plan::wipe: unable to get device no");
+  }
+
+  if( device != old_device && cudaSetDevice(device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::wipe: unable to set device no");
+  }
+
+  if( mode==NFFT_WIPE_ALL && initialized ){
+    deapodization_filter.reset();
+    initialized = false;
+  }
+    
+  // Pointers are reset after deletion so that no dangling pointer survives;
+  // their validity then no longer depends on the flags alone.
+  if( preprocessed_NC2C ){
+    delete tuples_last;  tuples_last = 0x0;
+    delete bucket_begin; bucket_begin = 0x0;
+    delete bucket_end;   bucket_end = 0x0;
+  }
+  
+  if( preprocessed_C2NC || preprocessed_NC2C ){
+    delete trajectory_positions; trajectory_positions = 0x0;
+    preprocessed_C2NC = preprocessed_NC2C = false;
+  }
+
+  if( device != old_device && cudaSetDevice(old_device) != cudaSuccess) {
+    throw cuda_error("Error: cuNFFT_plan::wipe: unable to restore device no");
+  }
+}
+
+//
+// Compute the Kaiser-Bessel beta parameter for every dimension from the kernel width W
+// and the oversampling factor alpha, according to the formula provided in
+// Beatty et. al. IEEE TMI 2005;24(6):799-808.
+//
+template<class REAL, unsigned int D, bool ATOMICS> 
+void Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_beta()
+{	
+  for( unsigned int dim=0; dim<D; dim++ ){
+    const REAL a = alpha[dim];
+    const REAL a_minus_half = a-REAL(0.5);
+    const REAL radicand = (W*W)/(a*a)*a_minus_half*a_minus_half-REAL(0.8);
+    beta[dim] = (M_PI*std::sqrt(radicand));
+  }
+}
+
+//
+// Grid fictitious trajectory with a single sample at the origin.
+//
+// One thread per cell of the oversampled grid: the cell receives the Kaiser-Bessel weight of
+// its distance to the grid centre, or zero when that distance exceeds half the kernel width
+// (as decided by weak_greater). The weight is stored as the real part of image_os[idx].
+//
+
+template<class REAL, unsigned int D> __global__ void
+compute_deapodization_filter_kernel( typename uintd<D>::Type matrix_size_os, typename reald<REAL,D>::Type matrix_size_os_real, 
+                                     REAL W, REAL half_W, REAL one_over_W, 
+                                     typename reald<REAL,D>::Type beta, complext<REAL> *image_os )
+{
+  const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  const unsigned int num_elements = prod(matrix_size_os);
+
+  // Surplus threads of the last block have no cell to fill
+  if( idx >= num_elements )
+    return;
+
+  // Position of this thread's cell and of the single sample ("origin")
+  const typename uintd<D>::Type cell_pos = idx_to_co<D>(idx, matrix_size_os);
+  const vector_td<REAL,D> sample_pos = REAL(0.5)*matrix_size_os_real;
+
+  // Per-dimension distance between the cell and the sample
+  vector_td<REAL,D> cell_pos_real = vector_td<REAL,D>(cell_pos);
+  const typename reald<REAL,D>::Type delta = abs(sample_pos-cell_pos_real);
+
+  // Convolution weight: zero outside the kernel support, Kaiser-Bessel inside
+  const REAL zero = REAL(0);
+  const vector_td<REAL,D> half_W_vec( half_W );
+
+  REAL weight = zero;
+  if( !weak_greater( delta, half_W_vec ) ){
+    weight = KaiserBessel<REAL>( delta, matrix_size_os_real, one_over_W, beta );
+  }
+
+  // Output weight as a purely real complex number
+  complext<REAL> result;
+  result.vec[0] = weight; 
+  result.vec[1] = zero;
+  image_os[idx] = result;
+}
+
+//
+// Function to calculate the deapodization filter:
+// grid a single sample at the centre of the oversampled matrix, Fourier transform the
+// result and store its element-wise reciprocal in 'deapodization_filter'.
+//
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_deapodization_filter()
+{
+  // Allocate the filter with the oversampled matrix size
+  std::vector<size_t> filter_dims = to_std_vector(matrix_size_os);
+  deapodization_filter.reset( new cuNDArray<complext<REAL> > );
+  deapodization_filter->create(&filter_dims);
+
+  vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>(matrix_size_os);
+  
+  // One thread per grid cell, 256 threads per block, enough blocks to cover the grid
+  dim3 dimBlock( 256 );
+  dim3 dimGrid( (prod(matrix_size_os)+dimBlock.x-1)/dimBlock.x );
+
+  // Invoke kernel
+  compute_deapodization_filter_kernel<REAL,D><<<dimGrid, dimBlock>>> 
+    ( vector_td<unsigned int,D>(matrix_size_os), matrix_size_os_real, W, REAL(0.5)*W, REAL(1)/W, beta, deapodization_filter->get_data_ptr() );
+
+  CHECK_FOR_CUDA_ERROR();
+  
+  cuNDArray<complext<REAL> > *filter = deapodization_filter.get();
+
+  // FFT (unscaled)
+  fft( filter, NFFT_BACKWARDS, false );
+  
+  // Reciprocal
+  reciprocal_inplace( filter );
+}
+
+//
+// Forward NFFT, image -> samples: deapodize, forward FFT, convolve onto the trajectory.
+// The image is modified in place. Private method, arguments are not re-validated here.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFT_C2NC( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples )
+{
+  this->deapodize( image );                                 // 1. deapodization
+  this->fft( image, NFFT_FORWARDS );                        // 2. FFT
+  this->convolve( image, samples, 0x0, NFFT_CONV_C2NC );    // 3. convolution
+}
+
+//
+// Adjoint NFFT, samples -> image: convolve onto the grid, backward FFT, deapodize.
+// Private method, arguments are not re-validated here.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFTH_NC2C( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image )
+{
+  this->convolve( samples, image, 0x0, NFFT_CONV_NC2C );    // 1. convolution
+  this->fft( image, NFFT_BACKWARDS );                       // 2. FFT
+  this->deapodize( image );                                 // 3. deapodization
+}
+
+//
+// Adjoint NFFT, image -> samples: deapodize, backward FFT, convolve onto the trajectory.
+// The image is modified in place. Private method, arguments are not re-validated here.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFTH_C2NC( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples )
+{
+  this->deapodize( image );                                 // 1. deapodization
+  this->fft( image, NFFT_BACKWARDS );                       // 2. FFT
+  this->convolve( image, samples, 0x0, NFFT_CONV_C2NC );    // 3. convolution
+}
+
+//
+// Forward NFFT, samples -> image: convolve onto the grid, forward FFT, deapodize.
+// Private method, arguments are not re-validated here.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::compute_NFFT_NC2C( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image )
+{
+  this->convolve( samples, image, 0x0, NFFT_CONV_NC2C );    // 1. convolution
+  this->fft( image, NFFT_FORWARDS );                        // 2. FFT
+  this->deapodize( image );                                 // 3. deapodization
+}
+
+//
+// Convolution from the oversampled grid onto the preprocessed trajectory (image -> samples).
+//
+// The batches found in the image beyond its first D dimensions are processed in chunks of at
+// most max_coils per kernel launch. Private method, arguments are not re-validated here.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::convolve_NFFT_C2NC( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples, bool accumulate )
+{
+  // Number of batches: product of the trailing image dimensions divided by #frames
+  unsigned int num_batches = 1;
+  for( unsigned int dim=D; dim<image->get_number_of_dimensions(); dim++ )
+    num_batches *= image->get_size(dim);
+  num_batches /= number_of_frames;
+  
+  //
+  // Setup grid and threads
+  //
+
+  const size_t threads_per_block = NFFT_THREADS_PER_KERNEL;
+
+  unsigned int max_coils;
+  if( cudaDeviceManager::Instance()->major_version(device) == 1 )
+    max_coils = NFFT_MAX_COILS_COMPUTE_1x;
+  else
+    max_coils = NFFT_MAX_COILS_COMPUTE_2x;
+  
+  // We can (only) convolve max_coils batches per run due to shared memory issues. 
+  const unsigned int coils_requested = num_batches;
+  const bool has_partial_run = ( (coils_requested%max_coils) != 0 );
+  const unsigned int num_repetitions = coils_requested/max_coils + ( has_partial_run ? 1 : 0 );
+
+  // Coils handled by every run but the last, and by the last run
+  unsigned int coils_per_run = max_coils;
+  unsigned int coils_last_run;
+  if( num_repetitions==1 ){
+    coils_per_run = coils_requested;
+    coils_last_run = coils_requested;
+  }
+  else{
+    coils_last_run = coils_requested - (num_repetitions-1)*coils_per_run;
+  }
+
+  // Block and Grid dimensions
+  dim3 dimBlock( (unsigned int)threads_per_block );
+  dim3 dimGrid( (number_of_samples+dimBlock.x-1)/dimBlock.x, number_of_frames );
+
+  // Calculate how much shared memory to use per thread
+  const size_t bytes_per_thread = coils_per_run * sizeof( vector_td<REAL,D> );
+  const size_t bytes_per_thread_last = coils_last_run * sizeof( vector_td<REAL,D> );
+
+  // Count how often twice the warp size can be halved before reaching one
+  unsigned int double_warp_size_power=0;
+  unsigned int remaining = cudaDeviceManager::Instance()->warp_size(device)<<1;
+  while( remaining!=1 ){
+    remaining>>=1;
+    double_warp_size_power++;
+  }
+  
+  vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>( matrix_size_os );
+
+  //
+  // Invoke kernel, one launch per chunk of coils
+  //
+
+  for( unsigned int repetition = 0; repetition<num_repetitions; repetition++ ){
+
+    const bool is_last_run = ( repetition==num_repetitions-1 );
+    const unsigned int coils_this_run = is_last_run ? coils_last_run : coils_per_run;
+    const size_t shared_bytes = is_last_run ? dimBlock.x*bytes_per_thread_last : dimBlock.x*bytes_per_thread;
+
+    NFFT_convolve_kernel<REAL,D>
+      <<<dimGrid, dimBlock, shared_bytes>>>
+      ( alpha, beta, W, vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), number_of_samples,
+        coils_this_run, 
+        raw_pointer_cast(&(*trajectory_positions)[0]), 
+        image->get_data_ptr()+repetition*prod(matrix_size_os)*number_of_frames*coils_per_run,
+        samples->get_data_ptr()+repetition*number_of_samples*number_of_frames*coils_per_run, 
+        double_warp_size_power, REAL(0.5)*W, REAL(1)/(W), accumulate, matrix_size_os_real );
+
+    CHECK_FOR_CUDA_ERROR();    
+  }
+}
+
+//
+// Convolution from the trajectory onto the grid: forwards to the _convolve_NFFT_NC2C
+// specialization selected by <REAL,D,ATOMICS>, passing this plan along.
+//
+// NOTE(review): the arguments are handed to apply() positionally, and the specialization
+// visible below declares them as (plan, samples, image, accumulate). The parameter named
+// 'image' here is therefore received as the samples array and 'samples' as the image --
+// the names in this definition look swapped; confirm against the class declaration.
+//
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::convolve_NFFT_NC2C( cuNDArray<complext<REAL> > *image, cuNDArray<complext<REAL> > *samples, bool accumulate )
+{
+  _convolve_NFFT_NC2C<REAL,D,ATOMICS>::apply( this, image, samples, accumulate );
+}
+
+// NC2C convolution (non-Cartesian samples to Cartesian image) launching the atomic kernel; float only.
+// apply() always returns true; unsupported hardware is reported by throwing cuda_error.
+template<unsigned int D> struct
+_convolve_NFFT_NC2C<float,D,true>{ // True: use atomic operations variant
+  static bool apply( cuNFFT_plan<float,D,true> *plan, 
+                     cuNDArray<complext<float> > *samples, 
+                     cuNDArray<complext<float> > *image, 
+                     bool accumulate )
+  {   
+    // Bring in some variables from the plan
+    
+    unsigned int device = plan->device;
+    unsigned int number_of_frames = plan->number_of_frames;
+    unsigned int number_of_samples = plan->number_of_samples;
+    typename uint64d<D>::Type matrix_size_os = plan->matrix_size_os;
+    typename uint64d<D>::Type matrix_size_wrap = plan->matrix_size_wrap;
+    typename reald<float,D>::Type alpha = plan->alpha;
+    typename reald<float,D>::Type beta = plan->beta;
+    float W = plan->W;
+    thrust::device_vector< typename reald<float,D>::Type > *trajectory_positions = plan->trajectory_positions;    
+
+    //
+    // Atomic operations are only supported in compute model 2.0 and up
+    //
+
+    if( cudaDeviceManager::Instance()->major_version(device) == 1 ){
+      throw cuda_error("Error: Atomic NC2C NFFT only supported on device with compute model 2.0 or higher");
+    }
+    
+    const unsigned int warp_size = cudaDeviceManager::Instance()->warp_size(device);
+    // Check if warp_size is a power of two. We do some modulus tricks in the kernels that depend on this...
+    if( !((warp_size & (warp_size-1)) == 0 ) ){
+      throw cuda_error("cuNFFT: unsupported hardware (warpSize is not a power of two)");
+    }
+    
+    unsigned int num_batches = 1;
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+      num_batches *= image->get_size(d);
+    num_batches /= number_of_frames;
+    
+    //
+    //  Setup grid and threads
+    //
+    
+    const size_t threads_per_block = NFFT_THREADS_PER_KERNEL;
+    const unsigned int max_coils = NFFT_MAX_COILS_COMPUTE_2x;
+    
+    // We can (only) convolve domain_size_coils batches per run due to shared memory issues. 
+    unsigned int domain_size_coils_desired = num_batches;
+    unsigned int num_repetitions = domain_size_coils_desired/max_coils + 
+      ( ((domain_size_coils_desired%max_coils)==0) ? 0 : 1 );
+    unsigned int domain_size_coils = (num_repetitions==1) ? domain_size_coils_desired : max_coils;
+    unsigned int domain_size_coils_tail = (num_repetitions==1) ? domain_size_coils_desired : domain_size_coils_desired - (num_repetitions-1)*domain_size_coils;
+    
+    // Block and Grid dimensions
+    dim3 dimBlock( (unsigned int)threads_per_block ); 
+    dim3 dimGrid( (number_of_samples+dimBlock.x-1)/dimBlock.x, number_of_frames );
+    
+    // Calculate how much shared memory to use per thread
+    size_t bytes_per_thread = domain_size_coils * sizeof( vector_td<float,D> );
+    size_t bytes_per_thread_tail = domain_size_coils_tail * sizeof( vector_td<float,D> );
+    
+    // double_warp_size_power = log2( 2*warp_size ), found by repeated halving
+    unsigned int double_warp_size_power=0, tmp = warp_size<<1;
+    while(tmp!=1){
+      tmp>>=1;
+      double_warp_size_power++;
+    }
+    
+    vector_td<float,D> matrix_size_os_real = vector_td<float,D>( matrix_size_os );
+    
+    if( !accumulate ){
+      clear(image);
+    }
+    
+    //
+    // Invoke kernel
+    //
+    
+    for( unsigned int repetition = 0; repetition<num_repetitions; repetition++ ){
+      
+      NFFT_H_atomic_convolve_kernel<float,D>
+        <<<dimGrid, dimBlock, (repetition==num_repetitions-1) ? dimBlock.x*bytes_per_thread_tail : dimBlock.x*bytes_per_thread>>>
+        ( alpha, beta, W, vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), number_of_samples,
+          (repetition==num_repetitions-1) ? domain_size_coils_tail : domain_size_coils,
+          raw_pointer_cast(&(*trajectory_positions)[0]), 
+          samples->get_data_ptr()+repetition*number_of_samples*number_of_frames*domain_size_coils,
+          image->get_data_ptr()+repetition*prod(matrix_size_os)*number_of_frames*domain_size_coils,
+          double_warp_size_power, float(0.5)*W, float(1)/(W), matrix_size_os_real );
+    }
+    
+    CHECK_FOR_CUDA_ERROR();
+   
+    return true;
+  }
+};
+
+template<unsigned int D> struct
+_convolve_NFFT_NC2C<double,D,true>{ // True: use atomic operations variant
+  // Deliberately empty (no apply()): atomics don't exist for doubles, so this gives a compile error if you actually try to use it.
+};
+
+// Non-atomic NC2C convolution: convolves into a temporary grid that includes the wrap zone and then folds it onto 'image' via plan->image_wrap().
+template<class REAL, unsigned int D> struct
+_convolve_NFFT_NC2C<REAL,D,false>{ // False: use non-atomic operations variant
+  static void apply( cuNFFT_plan<REAL,D,false> *plan,
+                     cuNDArray<complext<REAL> > *samples, 
+                     cuNDArray<complext<REAL> > *image, 
+                     bool accumulate )
+  {
+    // Bring in some variables from the plan
+    
+    unsigned int device = plan->device;
+    unsigned int number_of_frames = plan->number_of_frames;
+    unsigned int number_of_samples = plan->number_of_samples;
+    typename uint64d<D>::Type matrix_size_os = plan->matrix_size_os;
+    typename uint64d<D>::Type matrix_size_wrap = plan->matrix_size_wrap;
+    typename reald<REAL,D>::Type alpha = plan->alpha;
+    typename reald<REAL,D>::Type beta = plan->beta;
+    REAL W = plan->W;
+    thrust::device_vector< typename reald<REAL,D>::Type > *trajectory_positions = plan->trajectory_positions;    
+    thrust::device_vector<unsigned int> *tuples_last = plan->tuples_last;
+    thrust::device_vector<unsigned int> *bucket_begin = plan->bucket_begin;
+    thrust::device_vector<unsigned int> *bucket_end = plan->bucket_end;
+
+    // private method - no consistency check of the arguments. We trust in ourselves. Only the hardware is validated:
+    const unsigned int warp_size = cudaDeviceManager::Instance()->warp_size(device);
+    // Check if warp_size is a power of two. We do some modulus tricks in the kernels that depend on this...
+    if( !((warp_size & (warp_size-1)) == 0 ) ){
+      throw cuda_error("cuNFFT: unsupported hardware (warpSize is not a power of two)");
+    }
+    unsigned int num_batches = 1;
+    for( unsigned int d=D; d<image->get_number_of_dimensions(); d++ )
+      num_batches *= image->get_size(d);
+    num_batches /= number_of_frames;
+    
+    //
+    // Setup grid and threads
+    //
+    
+    const size_t threads_per_block = NFFT_THREADS_PER_KERNEL;
+    unsigned int max_coils;
+    
+    if( cudaDeviceManager::Instance()->major_version(device) == 1 ){
+      max_coils = NFFT_MAX_COILS_COMPUTE_1x;
+    }
+    else{
+      max_coils = NFFT_MAX_COILS_COMPUTE_2x;
+    }
+    
+    // We can (only) convolve domain_size_coils batches per run due to shared memory issues. 
+    unsigned int domain_size_coils_desired = num_batches;
+    unsigned int num_repetitions = domain_size_coils_desired/max_coils + 
+      ( ((domain_size_coils_desired%max_coils)==0) ? 0 : 1 );
+    unsigned int domain_size_coils = (num_repetitions==1) ? domain_size_coils_desired : max_coils;
+    unsigned int domain_size_coils_tail = (num_repetitions==1) ? domain_size_coils_desired : domain_size_coils_desired - (num_repetitions-1)*domain_size_coils;
+    
+    // Block and Grid dimensions
+    dim3 dimBlock( (unsigned int)threads_per_block ); 
+    dim3 dimGrid( (prod(matrix_size_os+matrix_size_wrap)+dimBlock.x-1)/dimBlock.x, number_of_frames );
+    
+    // Calculate how much shared memory to use per thread
+    size_t bytes_per_thread = domain_size_coils * sizeof( vector_td<REAL,D> );
+    size_t bytes_per_thread_tail = domain_size_coils_tail * sizeof( vector_td<REAL,D> );
+    
+    // double_warp_size_power = log2( 2*warp_size ), found by repeated halving
+    unsigned int double_warp_size_power=0, tmp = warp_size<<1;
+    while(tmp!=1){
+      tmp>>=1;
+      double_warp_size_power++;
+    }
+    
+    vector_td<REAL,D> matrix_size_os_real = vector_td<REAL,D>( matrix_size_os );
+    
+    // Define temporary image that includes a wrapping zone
+    cuNDArray<complext<REAL> > _tmp;
+    
+    vector<size_t> vec_dims = to_std_vector(matrix_size_os+matrix_size_wrap); 
+    if( number_of_frames > 1 )
+      vec_dims.push_back(number_of_frames);
+    if( num_batches > 1 ) 
+      vec_dims.push_back(num_batches);
+    
+    _tmp.create(&vec_dims);
+    
+    //
+    // Invoke kernel
+    //
+    
+    for( unsigned int repetition = 0; repetition<num_repetitions; repetition++ ){
+      
+      NFFT_H_convolve_kernel<REAL,D>
+        <<<dimGrid, dimBlock, (repetition==num_repetitions-1) ? dimBlock.x*bytes_per_thread_tail : dimBlock.x*bytes_per_thread>>>
+        ( alpha, beta, W, vector_td<unsigned int,D>(matrix_size_os+matrix_size_wrap), number_of_samples,
+          (repetition==num_repetitions-1) ? domain_size_coils_tail : domain_size_coils, 
+          raw_pointer_cast(&(*trajectory_positions)[0]), 
+          _tmp.get_data_ptr()+repetition*prod(matrix_size_os+matrix_size_wrap)*number_of_frames*domain_size_coils,
+          samples->get_data_ptr()+repetition*number_of_samples*number_of_frames*domain_size_coils, 
+          raw_pointer_cast(&(*tuples_last)[0]), raw_pointer_cast(&(*bucket_begin)[0]), raw_pointer_cast(&(*bucket_end)[0]),
+          double_warp_size_power, REAL(0.5)*W, REAL(1)/(W), matrix_size_os_real );
+    }
+    
+    CHECK_FOR_CUDA_ERROR();
+    
+    plan->image_wrap( &_tmp, image, accumulate );
+  }
+};
+
+// Image wrap kernels
+
+// Folds the wrap zone of the padded grid 'in' periodically onto the oversampled image 'out'; one thread per output element, blockIdx.y selects the image.
+template<class REAL, unsigned int D> __global__ void
+image_wrap_kernel( typename uintd<D>::Type matrix_size_os, typename uintd<D>::Type matrix_size_wrap, bool accumulate,
+                   complext<REAL> *in, complext<REAL> *out )
+{
+  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  const unsigned int num_elements_per_image_src = prod(matrix_size_os+matrix_size_wrap);
+  const unsigned int image_offset_src = blockIdx.y*num_elements_per_image_src;
+  
+  const typename uintd<D>::Type co = idx_to_co<D>(idx, matrix_size_os);
+  const typename uintd<D>::Type half_wrap = matrix_size_wrap>>1;
+  
+  // Make "boolean" vectors denoting whether wrapping needs to be performed in a given direction (forwards/backwards)
+  vector_td<bool,D> B_l = vector_less( co, half_wrap );
+  vector_td<bool,D> B_r = vector_greater_equal( co, matrix_size_os-half_wrap );
+  
+  complext<REAL>  result = in[co_to_idx<D>(co+half_wrap, matrix_size_os+matrix_size_wrap) + image_offset_src];
+
+  if( sum(B_l+B_r) > 0 ){
+    // Fold back the wrapping zone onto the image ("periodically")
+    //
+    // There are 2^D-1 ways to pick combinations of dimensions in D-dimensional space, e.g. 
+    // 
+    //  { x, y, xy } in 2D
+    //  { x, y, z, xy, xz, yz, xyz } in 3D
+    //
+    // Every "letter" in each combination provides two possible wraps (either end of the dimension)
+    // 
+    // For every 2^D-1 combinations DO
+    //   - find the number of dimensions, d, in the combination
+    //   - create 2^(d) stride vectors and test for wrapping using the 'B'-vectors above.
+    //   - accumulate the contributions
+    // 
+    //   The following code represents dimensions as bits in a char.
+    //
+    
+    for( unsigned char combination = 1; combination < (1<<D); combination++ ){
+    
+      // Find d
+      unsigned char d = 0;
+      for( unsigned char i=0; i<D; i++ )
+        d += ((combination & (1<<i)) > 0 );
+       
+      // Create stride vector for each wrapping test
+      for( unsigned char s = 0; s < (1<<d); s++ ){
+        
+        // Target for stride
+        typename intd<D>::Type stride;
+        char wrap_requests = 0;
+        char skipped_dims = 0;
+	
+        // Fill dimensions of the stride
+        for( unsigned char i=1; i<D+1; i++ ){
+          const unsigned char dim_bit = (unsigned char)(1<<(i-1)); // bit of dimension 'i-1'; using 'i' itself as the mask is wrong for i>2
+          // Is the stride dimension present in the current combination?
+          if( dim_bit & combination ){
+    
+            // A zero bit in s indicates "check for left wrap" and a one bit is interpreted as "check for right wrap" 
+            // ("left/right" for the individual dimension meaning wrapping on either side of the dimension).
+    
+            if( dim_bit & (s<<(skipped_dims)) ){
+              if( B_r.vec[i-1] ){ // Wrapping required 
+                set( stride, (size_t)(i-1), (int)(-1) );
+                wrap_requests++;
+              }
+              else
+                set( stride, i-1, (int)0 );
+            }
+            else{ 
+              if( B_l.vec[i-1] ){ // Wrapping required 
+                set( stride, i-1, (int)1 );
+                wrap_requests++;
+              }
+              else
+                set( stride, i-1, (int)0 );
+            }
+          }
+          else{
+            // Do not test for wrapping in dimension 'i-1' (for this combination)
+            set( stride, i-1, (int)0 );
+            skipped_dims++;
+          }
+        }
+	
+        // Now it is time to do the actual wrapping (if needed)
+        if( wrap_requests == d ){
+          typename intd<D>::Type src_co_int = vector_td<int,D>(co+half_wrap);
+          typename intd<D>::Type matrix_size_os_int = vector_td<int,D>(matrix_size_os);
+          typename intd<D>::Type co_offset_int = src_co_int + component_wise_mul<int,D>(stride,matrix_size_os_int);
+          typename uintd<D>::Type co_offset = vector_td<unsigned int,D>(co_offset_int);
+          result += in[co_to_idx<D>(co_offset, matrix_size_os+matrix_size_wrap) + image_offset_src];
+          break; // only one stride per combination can contribute (e.g. one edge, one corner)
+        } 
+      } 
+    }
+  }
+  
+  // Output
+  const unsigned int image_offset_tgt = blockIdx.y*prod(matrix_size_os);
+  if( accumulate ) result += out[idx+image_offset_tgt];
+  out[idx+image_offset_tgt] = result;
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> void
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::image_wrap( cuNDArray<complext<REAL> > *source, cuNDArray<complext<REAL> > *target, bool accumulate )
+{
+  // Dimensions of 'source' beyond the first D enumerate the images: frames times batches
+  unsigned int num_batches = 1;
+  for( unsigned int dim=D; dim<source->get_number_of_dimensions(); dim++ )
+    num_batches *= source->get_size(dim);
+  num_batches /= number_of_frames;
+
+  // One thread per oversampled image element (x), one grid row per image (y)
+  const unsigned int threads = 256;
+  const unsigned int num_blocks = prod(matrix_size_os)/threads;
+  dim3 dimBlock( threads ), dimGrid( num_blocks, number_of_frames*num_batches );
+
+  // The kernel performs no bounds check, so the image must split into whole blocks
+  if( (prod(matrix_size_os)%threads) != 0 ) {
+  	std::stringstream msg;
+  	msg << "Error: cuNFFT : the number of oversampled image elements must be a multiplum of the block size: " << threads;
+    throw std::runtime_error(msg.str());
+  }
+
+  image_wrap_kernel<REAL,D><<<dimGrid, dimBlock>>>
+    ( vector_td<unsigned int,D>(matrix_size_os), vector_td<unsigned int,D>(matrix_size_wrap), accumulate, source->get_data_ptr(), target->get_data_ptr() );
+  
+  CHECK_FOR_CUDA_ERROR();
+}	
+
+template<class REAL, unsigned int D, bool ATOMICS> typename uint64d<D>::Type
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::get_matrix_size()
+{
+  return matrix_size; // copy of the plan's (non-oversampled) matrix size
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> typename uint64d<D>::Type
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::get_matrix_size_os()
+{
+  return matrix_size_os; // copy of the plan's oversampled matrix size
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> REAL 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::get_W()
+{
+  return W; // convolution kernel width
+}
+
+template<class REAL, unsigned int D, bool ATOMICS> unsigned int 
+Gadgetron::cuNFFT_plan<REAL,D,ATOMICS>::get_device()
+{
+  return device; // NOTE(review): 'device' is a signed int member; the conversion assumes it is non-negative here - confirm
+}
+
+
+//
+// Explicit template instantiations (double precision is instantiated without atomics only)
+//
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 1, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 1, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 1, false >;
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 2, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 2, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 2, false >;
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 3, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 3, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 3, false >;
+
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 4, true >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< float, 4, false >;
+template class EXPORTGPUNFFT Gadgetron::cuNFFT_plan< double, 4, false >;
diff --git a/toolboxes/nfft/gpu/cuNFFT.h b/toolboxes/nfft/gpu/cuNFFT.h
new file mode 100644
index 0000000..2c400a0
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFT.h
@@ -0,0 +1,284 @@
+/** \file cuNFFT.h
+    \brief Cuda implementation of the non-Cartesian FFT
+
+    Reference information on the CUDA/GPU implementation of the NFFT can be found in the papers
+    
+    Accelerating the Non-equispaced Fast Fourier Transform on Commodity Graphics Hardware.
+    T.S. Sørensen, T. Schaeffter, K.Ø. Noe, M.S. Hansen. 
+    IEEE Transactions on Medical Imaging 2008; 27(4):538-547.
+    
+    Real-time Reconstruction of Sensitivity Encoded Radial Magnetic Resonance Imaging Using a Graphics Processing Unit.
+    T.S. Sørensen, D. Atkinson, T. Schaeffter, M.S. Hansen.
+    IEEE Transactions on Medical Imaging 2009; 28(12):1974-1985. 
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "vector_td.h"
+#include "complext.h"
+#include "gpunfft_export.h"
+
+#include <thrust/device_vector.h>
+#include <boost/shared_ptr.hpp>
+
+template<class REAL, unsigned int D, bool ATOMICS> struct _convolve_NFFT_NC2C;
+
+namespace Gadgetron{
+
+  /** \class cuNFFT_plan
+      \brief Cuda implementation of the non-Cartesian FFT
+
+      ------------------------------
+      --- NFFT class declaration ---
+      ------------------------------      
+      REAL:  desired precision : float or double
+      D:  dimensionality : { 1,2,3,4 }
+      ATOMICS: use atomic device memory transactions : { true, false }
+      
+      For the tested hardware the implementation using atomic operations is slower as its non-atomic counterpart.
+      However, using atomic operations has the advantage of not requiring any pre-processing.
+      As the preprocessing step can be quite costly in terms of memory usage,
+      the atomic mode can be necessary for very large images or for 3D/4D volumes.
+      Notice: currently no devices support atomics operations in double precision.
+  */
+  template< class REAL, unsigned int D, bool ATOMICS = false > class EXPORTGPUNFFT cuNFFT_plan
+  {
+  
+  public: // Main interface
+    
+    /** 
+	Default constructor
+    */
+    cuNFFT_plan();
+
+    /**
+       Constructor defining the required NFFT parameters.
+       \param matrix_size the matrix size to use for the NFFT. Define as a multiple of 32.
+       \param matrix_size_os intermediate oversampled matrix size. Define as a multiple of 32.
+       The ratio between matrix_size_os and matrix_size define the oversampling ratio for the NFFT implementation.
+       Use an oversampling ratio between 1 and 2. The higher ratio the better quality results, 
+       however at the cost of increased execution times. 
+       \param W the convolution window size used in the NFFT implementation. 
+       The larger W the better quality at the cost of increased runtime.
+       \param device the device (GPU id) to use for the NFFT computation. 
+       The default value of -1 indicates that the currently active device is used.
+     */
+    cuNFFT_plan( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os,
+	       REAL W, int device = -1 );
+
+    /**
+       Destructor
+    */
+    virtual ~cuNFFT_plan();
+
+    /** 
+	Enum to specify the desired mode for cleaning up when using the wipe() method.
+    */
+    enum NFFT_wipe_mode { 
+      NFFT_WIPE_ALL, /**< delete all internal memory. */
+      NFFT_WIPE_PREPROCESSING /**< delete internal memory holding the preprocessing data structures. */
+    };
+
+    /** 
+	Clear internal storage
+	\param mode enum defining the wipe mode
+    */
+    void wipe( NFFT_wipe_mode mode );
+
+    /** 
+	Setup the plan. Please see the constructor taking similar arguments for a parameter description.
+    */
+    void setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os,
+		REAL W, int device = -1 );
+
+    /**
+      Enum to specify the preprocessing mode.
+    */
+    enum NFFT_prep_mode { 
+      NFFT_PREP_C2NC, /**< preprocess to perform a Cartesian to non-Cartesian NFFT. */
+      NFFT_PREP_NC2C, /**< preprocess to perform a non-Cartesian to Cartesian NFFT. */
+      NFFT_PREP_ALL /**< preprocess to perform NFFTs in both directions. */
+    };
+
+    /**
+       Perform NFFT preprocessing for a given trajectory.
+       \param trajectory the NFFT non-Cartesian trajectory normalized to the range [-1/2;1/2]. 
+       \param mode enum specifying the preprocessing mode
+    */
+    void preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory, NFFT_prep_mode mode );
+
+    /**
+       Enum defining the desired NFFT operation
+    */
+    enum NFFT_comp_mode { 
+      NFFT_FORWARDS_C2NC, /**< forwards NFFT Cartesian to non-Cartesian. */
+      NFFT_FORWARDS_NC2C, /**< forwards NFFT non-Cartesian to Cartesian. */
+      NFFT_BACKWARDS_C2NC, /**< backwards NFFT Cartesian to non-Cartesian. */
+      NFFT_BACKWARDS_NC2C /**< backwards NFFT non-Cartesian to Cartesian. */
+    };
+
+    /**
+      Execute the NFFT.
+      \param[in] in the input array.
+      \param[out] out the output array.
+      \param[in] dcw optional density compensation weights weighing the input samples according to the sampling density. 
+      If a 0x0-pointer is provided no density compensation is used.
+      \param mode enum specifying the mode of operation.
+    */
+    void compute( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+		  cuNDArray<REAL> *dcw, NFFT_comp_mode mode );
+
+    /**
+      Execute an NFFT iteration (from Cartesian image space to non-Cartesian Fourier space and back to Cartesian image space).
+      \param[in] in the input array.
+      \param[out] out the output array.
+      \param[in] dcw optional density compensation weights weighing the input samples according to the sampling density. 
+      If a 0x0-pointer is provided no density compensation is used.
+      \param[in] halfway_dims specifies the dimensions of the intermediate Fourier space (codomain).
+    */
+    void mult_MH_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out,
+		    cuNDArray<REAL> *dcw, std::vector<size_t> halfway_dims );
+  
+  public: // Utilities
+  
+    /**
+      Enum specifying the direction of the NFFT standalone convolution
+    */
+    enum NFFT_conv_mode { 
+      NFFT_CONV_C2NC, /**< convolution: Cartesian to non-Cartesian. */
+      NFFT_CONV_NC2C /**< convolution: non-Cartesian to Cartesian. */
+    };
+    
+    /**
+       Perform "standalone" convolution
+       \param[in] in the input array.
+       \param[out] out the output array.
+       \param[in] dcw optional density compensation weights.
+       \param[in] mode enum specifying the mode of the convolution
+       \param[in] accumulate specifies whether the result is added to the output (accumulation) or if the output is overwritten.
+    */
+    void convolve( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, cuNDArray<REAL> *dcw,
+		   NFFT_conv_mode mode, bool accumulate = false );
+    
+    /**
+       Enum specifying the direction of the NFFT standalone FFT.
+    */
+    enum NFFT_fft_mode { 
+      NFFT_FORWARDS, /**< forwards FFT. */
+      NFFT_BACKWARDS /**< backwards FFT. */
+    };
+
+    /**
+       Cartesian FFT. For completeness, just invokes the cuNDFFT class.
+       \param[in,out] data the data for the inplace FFT.
+       \param mode enum specifying the direction of the FFT.
+       \param do_scale boolean specifying whether FFT normalization is desired.
+    */
+    void fft( cuNDArray<complext<REAL> > *data, NFFT_fft_mode mode, bool do_scale = true );
+  
+    /**
+       NFFT deapodization.
+       \param[in,out] image the image to be deapodized (inplace).
+    */
+    void deapodize( cuNDArray<complext<REAL> > *image );
+
+  public: // Setup queries
+    
+    /**
+       Get the matrix size.
+    */
+    typename uint64d<D>::Type get_matrix_size();
+
+    /**
+       Get the oversampled matrix size.
+    */
+    typename uint64d<D>::Type get_matrix_size_os();
+
+    /**
+       Get the convolution kernel size
+    */
+    REAL get_W();
+    
+    /**
+       Get the assigned device id
+    */
+    unsigned int get_device();
+    
+  public: 
+
+    // Custom operators new/delete for windows memory handling across dll boundaries
+    void* operator new (size_t bytes) { return ::new char[bytes]; }
+    void operator delete (void *ptr) { delete [] static_cast <char *> (ptr); } 
+    void * operator new(size_t s, void * p) { return p; }
+
+    friend struct _convolve_NFFT_NC2C<REAL,D,ATOMICS>;
+  
+  private: // Internal to the implementation
+
+    // Validate setup / arguments
+    enum NFFT_components { _NFFT_CONV_C2NC = 1, _NFFT_CONV_NC2C = 2, _NFFT_FFT = 4, _NFFT_DEAPODIZATION = 8 };
+    void check_consistency( cuNDArray<complext<REAL> > *samples, cuNDArray<complext<REAL> > *image,
+			    cuNDArray<REAL> *dcw, unsigned char components );
+
+    // Shared barebones constructor
+    void barebones();
+    
+    // Compute beta control parameter for Kaiser-Bessel kernel
+    void compute_beta();
+
+    // Compute deapodization filter
+    void compute_deapodization_filter();
+
+    // Dedicated computes
+    void compute_NFFT_C2NC( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+    void compute_NFFT_NC2C( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+    void compute_NFFTH_NC2C( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+    void compute_NFFTH_C2NC( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out );
+
+    // Dedicated convolutions
+    void convolve_NFFT_C2NC( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate );
+    void convolve_NFFT_NC2C( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate );
+  
+    // Internal utility
+    void image_wrap( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate );
+
+  private:
+    
+    typename uint64d<D>::Type matrix_size;          // Matrix size
+    typename uint64d<D>::Type matrix_size_os;       // Oversampled matrix size
+    typename uint64d<D>::Type matrix_size_wrap;     // Wrap size at border
+
+    typename reald<REAL,D>::Type alpha;           // Oversampling factor (for each dimension)
+    typename reald<REAL,D>::Type beta;            // Kaiser-Bessel convolution kernel control parameter
+
+    REAL W;                                       // Kernel width in oversampled grid
+
+    unsigned int number_of_samples;               // Number of samples per frame per coil
+    unsigned int number_of_frames;                // Number of frames per reconstruction
+    
+    int device;                                   // Associated device id
+
+    //
+    // Internal data structures for convolution and deapodization
+    //
+
+    boost::shared_ptr< cuNDArray<complext<REAL> > > deapodization_filter;
+   
+    thrust::device_vector< typename reald<REAL,D>::Type > *trajectory_positions;
+    thrust::device_vector<unsigned int> *tuples_last;
+    thrust::device_vector<unsigned int> *bucket_begin, *bucket_end;
+
+    //
+    // State variables
+    //
+
+    bool preprocessed_C2NC, preprocessed_NC2C;
+    bool initialized;
+  };
+
+  // Partial specialization for double precision with atomics, made abstract through a pure virtual member so that
+  // trying to create such a plan causes a compile error - since this is not supported on the device
+  template< unsigned int D> class EXPORTGPUNFFT cuNFFT_plan<double,D,true>{ 
+    virtual void atomics_not_supported_for_type_double() = 0; };
+}
diff --git a/toolboxes/nfft/gpu/cuNFFTOperator.cu b/toolboxes/nfft/gpu/cuNFFTOperator.cu
new file mode 100644
index 0000000..60a8db1
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFTOperator.cu
@@ -0,0 +1,113 @@
+#include "cuNFFTOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+
+namespace Gadgetron{
+
+  // Forwards NFFT (Cartesian to non-Cartesian) of 'in' into 'out'; with 'accumulate' the result is added to 'out'.
+  template<class REAL, unsigned int D> void
+  cuNFFTOperator<REAL,D>::mult_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate )
+  {
+    if( !in || !out ){
+      throw std::runtime_error("cuNFFTOperator::mult_M : 0x0 input/output not accepted");
+    }
+
+    // When accumulating, compute into a scratch array; the smart pointer releases it even if the plan throws
+    boost::shared_ptr< cuNDArray<complext<REAL> > > scratch;
+    cuNDArray<complext<REAL> > *tmp_out = out;
+
+    if( accumulate ){
+      scratch.reset( new cuNDArray<complext<REAL> >(out->get_dimensions()) );
+      tmp_out = scratch.get();
+    }
+  
+    plan_->compute( in, tmp_out, dcw_.get(), cuNFFT_plan<REAL,D>::NFFT_FORWARDS_C2NC );
+
+    if( accumulate ){
+      *out += *tmp_out;
+    }
+  }
+
+  // Backwards NFFT (non-Cartesian to Cartesian) of 'in' into 'out'; with 'accumulate' the result is added to 'out'.
+  template<class REAL, unsigned int D> void
+  cuNFFTOperator<REAL,D>::mult_MH( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate )
+  {
+    if( !in || !out ){
+      throw std::runtime_error("cuNFFTOperator::mult_MH : 0x0 input/output not accepted");
+    }
+
+    // When accumulating, compute into a scratch array; the smart pointer releases it even if the plan throws
+    boost::shared_ptr< cuNDArray<complext<REAL> > > scratch;
+    cuNDArray<complext<REAL> > *tmp_out = out;
+    if( accumulate ){
+      scratch.reset( new cuNDArray<complext<REAL> >(out->get_dimensions()) );
+      tmp_out = scratch.get();
+    }
+
+    plan_->compute( in, tmp_out, dcw_.get(), cuNFFT_plan<REAL,D>::NFFT_BACKWARDS_NC2C );
+
+    if( accumulate ){
+      *out += *tmp_out;
+    }
+  }
+
+  // Runs the plan's mult_MH_M with the operator's codomain dimensions; with 'accumulate' the result is added to 'out'.
+  template<class REAL, unsigned int D> void
+  cuNFFTOperator<REAL,D>::mult_MH_M( cuNDArray<complext<REAL> > *in, cuNDArray<complext<REAL> > *out, bool accumulate )
+  {
+    if( !in || !out ){
+      throw std::runtime_error("cuNFFTOperator::mult_MH_M : 0x0 input/output not accepted");
+    }
+    
+    boost::shared_ptr< std::vector<size_t> > codomain_dims = this->get_codomain_dimensions();
+    if( codomain_dims.get() == 0x0 || codomain_dims->size() == 0 ){
+      throw std::runtime_error("cuNFFTOperator::mult_MH_M : operator codomain dimensions not set");
+    }
+
+    // When accumulating, compute into a scratch array; the smart pointer releases it even if the plan throws
+    boost::shared_ptr< cuNDArray<complext<REAL> > > scratch;
+    cuNDArray<complext<REAL> > *tmp_out = out;
+    
+    if( accumulate ){
+      scratch.reset( new cuNDArray<complext<REAL> >(out->get_dimensions()) );
+      tmp_out = scratch.get();
+    }
+    
+    plan_->mult_MH_M( in, tmp_out, dcw_.get(), *codomain_dims );
+    
+    if( accumulate ){
+      *out += *tmp_out;
+    } 
+  }
+  
+  template<class REAL, unsigned int D> void
+  cuNFFTOperator<REAL,D>::setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W )
+  {  // Forwards the NFFT parameters to the plan; no device is passed, so the plan's default device argument applies.
+    plan_->setup( matrix_size, matrix_size_os, W );  
+  }
+
+  template<class REAL, unsigned int D> void
+  cuNFFTOperator<REAL,D>::preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory ) 
+  {
+    if( trajectory == 0x0 ){
+      throw std::runtime_error("cuNFFTOperator::preprocess : 0x0 trajectory provided.");
+    }
+    // Preprocess the plan for NFFTs in both directions (NFFT_PREP_ALL)
+    plan_->preprocess( trajectory, cuNFFT_plan<REAL,D>::NFFT_PREP_ALL );
+  }
+  
+  //
+  // Explicit instantiations: float and double precision in 1 to 4 dimensions
+  //
+
+  template class EXPORTGPUNFFT cuNFFTOperator<float,1>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,2>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,3>;
+  template class EXPORTGPUNFFT cuNFFTOperator<float,4>;
+
+  template class EXPORTGPUNFFT cuNFFTOperator<double,1>;
+  template class EXPORTGPUNFFT cuNFFTOperator<double,2>;
+  template class EXPORTGPUNFFT cuNFFTOperator<double,3>;
+  template class EXPORTGPUNFFT cuNFFTOperator<double,4>;
+}
diff --git a/toolboxes/nfft/gpu/cuNFFTOperator.h b/toolboxes/nfft/gpu/cuNFFTOperator.h
new file mode 100644
index 0000000..f01e31c
--- /dev/null
+++ b/toolboxes/nfft/gpu/cuNFFTOperator.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "linearOperator.h"
+#include "cuNFFT.h"
+#include "gpunfft_export.h"
+
+namespace Gadgetron{
+
+  template<class REAL, unsigned int D> class EXPORTGPUNFFT cuNFFTOperator : public linearOperator<cuNDArray< complext<REAL> > >
+  {  
+  public:
+    // Linear operator wrapping a cuNFFT_plan; the constructor creates a default-constructed plan.
+    cuNFFTOperator() : linearOperator<cuNDArray< complext<REAL> > >() {
+      plan_ = boost::shared_ptr< cuNFFT_plan<REAL, D> >( new cuNFFT_plan<REAL, D>() );
+    }
+  
+    virtual ~cuNFFTOperator() {}
+    // Density compensation weights handed to the plan on every multiplication
+    virtual void set_dcw( boost::shared_ptr< cuNDArray<REAL> > dcw ) { dcw_ = dcw; }
+    inline boost::shared_ptr< cuNDArray<REAL> > get_dcw() { return dcw_; }
+    // Shared access to the underlying plan
+    inline boost::shared_ptr< cuNFFT_plan<REAL, D> > get_plan() { return plan_; }
+    // Both are forwarded to the plan; preprocess() prepares NFFTs in both directions
+    virtual void setup( typename uint64d<D>::Type matrix_size, typename uint64d<D>::Type matrix_size_os, REAL W );
+    virtual void preprocess( cuNDArray<typename reald<REAL,D>::Type> *trajectory );
+    // mult_M: forwards NFFT (Cartesian to non-Cartesian); mult_MH: backwards NFFT (non-Cartesian to Cartesian); mult_MH_M: the plan's combined iteration
+    virtual void mult_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false );
+    virtual void mult_MH( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false );
+    virtual void mult_MH_M( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out, bool accumulate = false );
+
+    virtual boost::shared_ptr< linearOperator< cuNDArray< complext<REAL>  > > > clone(){
+      return linearOperator< cuNDArray<complext<REAL> > >::clone(this);
+    }
+
+  protected:
+    boost::shared_ptr< cuNFFT_plan<REAL, D> > plan_; // the NFFT plan all multiplications are delegated to
+    boost::shared_ptr< cuNDArray<REAL> > dcw_; // density compensation weights (may be unset)
+  };
+}
diff --git a/toolboxes/nfft/gpu/gpunfft_export.h b/toolboxes/nfft/gpu/gpunfft_export.h
new file mode 100644
index 0000000..28f9752
--- /dev/null
+++ b/toolboxes/nfft/gpu/gpunfft_export.h
@@ -0,0 +1,19 @@
+/** \file gpunfft_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUNFFT_EXPORT_H_
+#define GPUNFFT_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUNFFT__) || defined (gpunfft_EXPORTS)
+#define EXPORTGPUNFFT __declspec(dllexport)
+#else
+#define EXPORTGPUNFFT __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUNFFT
+#endif
+
+
+#endif /* GPUNFFT_EXPORT_H_ */
diff --git a/toolboxes/operators/CMakeLists.txt b/toolboxes/operators/CMakeLists.txt
new file mode 100644
index 0000000..b90e16b
--- /dev/null
+++ b/toolboxes/operators/CMakeLists.txt
@@ -0,0 +1,31 @@
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  )
+
+install(FILES 	
+  generalOperator.h
+  linearOperator.h
+  identityOperator.h
+  diagonalOperator.h
+  encodingOperatorContainer.h
+  multiplicationOperatorContainer.h
+  FFTOperator.h
+  imageOperator.h
+  encodedImageOperator.h
+  partialDerivativeOperator.h
+  convolutionOperator.h
+  laplaceOperator.h
+  downsampleOperator.h
+  upsampleOperator.h
+  tvPicsOperator.h
+  DESTINATION include)
+
+IF (ARMADILLO_FOUND)
+  add_subdirectory(cpu)
+ENDIF (ARMADILLO_FOUND)
+
+IF (CUDA_FOUND)
+  add_subdirectory(gpu)
+ENDIF (CUDA_FOUND)
diff --git a/toolboxes/operators/FFTOperator.h b/toolboxes/operators/FFTOperator.h
new file mode 100644
index 0000000..4760b72
--- /dev/null
+++ b/toolboxes/operators/FFTOperator.h
@@ -0,0 +1,70 @@
+/** \file FFTOperator.h
+    \brief Device independent implementation of the FFT operator.
+
+    The file FFTOperator.h is a device-independent implementation of an operator performing a Cartesian FFT.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoFFTOperator(/.h) for a cpu instantiated operator using the hoNDArray class
+    - the class(/file) cuFFTOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE, class FFT> class FFTOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+    
+    FFTOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~FFTOperator() {}
+    // Forward transform: out = fft(in), or out += fft(in) when 'accumulate' is set. Throws std::runtime_error on a null pointer.
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( in == 0x0 || out == 0x0 ){
+        throw std::runtime_error("Error: FFTOperator::mult_M(): illegal array pointer provided");
+      }
+      
+      if( accumulate ){
+        ARRAY_TYPE tmp(in);   // transform a copy so that 'in' is left untouched
+        FFT::instance()->fft(&tmp);
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        FFT::instance()->fft(out);
+      }
+    }
+    // Inverse transform: out = ifft(in), or out += ifft(in) when 'accumulate' is set. Throws std::runtime_error on a null pointer.
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( in == 0x0 || out == 0x0 ){
+        throw std::runtime_error("Error: FFTOperator::mult_MH(): illegal array pointer provided");
+      }
+      
+      if( accumulate ){
+        ARRAY_TYPE tmp(in);   // transform a copy so that 'in' is left untouched
+        FFT::instance()->ifft(&tmp);
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        FFT::instance()->ifft(out);
+      }
+    }
+    // Implemented as the identity (no transform is run): out = in, or out += in when 'accumulate' is set.
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( in == 0x0 || out == 0x0 ){
+        throw std::runtime_error("Error: FFTOperator::mult_MH_M(): illegal array pointer provided");
+      }
+      if( accumulate ) *out += *in; else *out = *in;
+    }
+    // Delegates to linearOperator::clone(), passing this instance.
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+    {
+      return linearOperator<ARRAY_TYPE>::clone(this);
+    }
+  };
+}
diff --git a/toolboxes/operators/convolutionOperator.h b/toolboxes/operators/convolutionOperator.h
new file mode 100644
index 0000000..a81cc96
--- /dev/null
+++ b/toolboxes/operators/convolutionOperator.h
@@ -0,0 +1,220 @@
+/** \file convolutionOperator.h
+    \brief Base class for all convolution operators.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td_utilities.h"
+
+#include <boost/smart_ptr.hpp>
+#include <vector>
+
+namespace Gadgetron{
+
+  template <class COMPLEX_ARRAY_TYPE, unsigned int D> class convolutionOperator : public linearOperator<COMPLEX_ARRAY_TYPE>
+  {  
+  protected:
+    typedef typename COMPLEX_ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+    
+  public:
+    
+    convolutionOperator() : linearOperator<COMPLEX_ARRAY_TYPE>() {}
+    virtual ~convolutionOperator() {}
+    
+    // Set the convolution kernel: stores operator_fft(forward) of a copy of the input plus its origin_mirror()ed counterpart
+    virtual void set_kernel( COMPLEX_ARRAY_TYPE *image_space_kernel )
+    {     
+      if (!image_space_kernel) throw std::runtime_error("convolutionOperator: null pointer kernel provided");
+      boost::shared_ptr<COMPLEX_ARRAY_TYPE> freq_kernel( new COMPLEX_ARRAY_TYPE(*image_space_kernel) );
+      operator_fft( true, freq_kernel.get() );
+      
+      boost::shared_ptr<COMPLEX_ARRAY_TYPE> freq_kernel_adjoint( new COMPLEX_ARRAY_TYPE(freq_kernel->get_dimensions()) );
+      origin_mirror( freq_kernel.get(), freq_kernel_adjoint.get() );
+      kernel_ = freq_kernel; // both members are committed only after every step above succeeded
+      adjoint_kernel_ = freq_kernel_adjoint;
+    }
+    
+    // Apply image operators
+    //
+    
+    virtual void mult_MH_M( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out, bool accumulate = false )
+    {    
+      if( !kernel_.get() ){
+        throw std::runtime_error( "convolutionOperator::mult_MH_M failed : kernel is not set");
+      }
+    
+      if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+        throw std::runtime_error( "convolutionOperator::mult_MH_M failed : in/out image dimensions mismatch");
+      }
+      
+      bool use_oversampling;
+      if( in->get_number_of_elements() == kernel_->get_number_of_elements() )
+        use_oversampling = false;
+      else if( (in->get_number_of_elements()<<D) == kernel_->get_number_of_elements() )
+        use_oversampling = true;
+      else{
+        throw std::runtime_error( "convolutionOperator::mult_MH_M failed : in/out image dimensions mismatch the kernel");
+      }
+      
+      // Working buffer: a scratch array owned by 'scratch' (released on every exit path), or 'out' itself
+      boost::shared_ptr<COMPLEX_ARRAY_TYPE> scratch;
+      if( use_oversampling ){
+        scratch = boost::shared_ptr<COMPLEX_ARRAY_TYPE>( new COMPLEX_ARRAY_TYPE(kernel_->get_dimensions()) );
+        pad<ELEMENT_TYPE,D>( in, scratch.get() );
+      }
+      else if( accumulate ){
+        scratch = boost::shared_ptr<COMPLEX_ARRAY_TYPE>( new COMPLEX_ARRAY_TYPE(*in) );
+      }
+      else{ 
+        *out = *in;
+      } 
+      COMPLEX_ARRAY_TYPE *tmp_out = scratch.get() ? scratch.get() : out;
+
+      // Forwards fft
+      operator_fft( true, tmp_out );
+
+      // Multiply
+      *tmp_out *= *kernel_;
+      *tmp_out *= *adjoint_kernel_;
+
+      // Inverse fft
+      operator_fft( false, tmp_out );
+
+      // Write back; an oversampled result is cropped first (into a temporary when accumulating)
+      if( use_oversampling && accumulate ){
+        COMPLEX_ARRAY_TYPE cropped( out->get_dimensions() );
+        operator_crop( tmp_out, &cropped );
+        *out += cropped;
+      }
+      else if( use_oversampling ){
+        operator_crop( tmp_out, out );
+      }
+      else if( accumulate ) *out += *tmp_out;
+    }
+    
+    // Same pipeline as mult_MH_M, multiplying by kernel_ only
+    virtual void mult_M( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !kernel_.get() ){
+        throw std::runtime_error("convolutionOperator::mult_M failed : kernel is not set");
+      }
+    
+      if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+        throw std::runtime_error( "convolutionOperator::mult_M failed : in/out image dimensions mismatch");
+      }
+
+      bool use_oversampling;
+      if( in->get_number_of_elements() == kernel_->get_number_of_elements() )
+        use_oversampling = false;
+      else if( (in->get_number_of_elements()<<D) == kernel_->get_number_of_elements() )
+        use_oversampling = true;
+      else{
+        throw std::runtime_error( "convolutionOperator::mult_M failed : in/out image dimensions mismatch the kernel");
+      }
+    
+      // Working buffer: a scratch array owned by 'scratch' (released on every exit path), or 'out' itself
+      boost::shared_ptr<COMPLEX_ARRAY_TYPE> scratch;
+      if( use_oversampling ){
+        scratch = boost::shared_ptr<COMPLEX_ARRAY_TYPE>( new COMPLEX_ARRAY_TYPE(kernel_->get_dimensions()) );
+        pad<ELEMENT_TYPE,D>( in, scratch.get() );
+      }
+      else if( accumulate ){
+        scratch = boost::shared_ptr<COMPLEX_ARRAY_TYPE>( new COMPLEX_ARRAY_TYPE(*in) );
+      }
+      else{ 
+        *out = *in;
+      } 
+      COMPLEX_ARRAY_TYPE *tmp_out = scratch.get() ? scratch.get() : out;
+
+      // Forwards fft
+      operator_fft( true, tmp_out );
+
+      // Multiply
+      *tmp_out *= *kernel_;
+
+      // Inverse fft
+      operator_fft( false, tmp_out );
+
+      // Write back; an oversampled result is cropped first (into a temporary when accumulating)
+      if( use_oversampling && accumulate ){
+        COMPLEX_ARRAY_TYPE cropped( out->get_dimensions() );
+        operator_crop( tmp_out, &cropped );
+        *out += cropped;
+      }
+      else if( use_oversampling ){
+        operator_crop( tmp_out, out );
+      }
+      else if( accumulate ) *out += *tmp_out;
+    }
+    // Same pipeline as mult_M, multiplying by adjoint_kernel_ instead of kernel_
+    virtual void mult_MH( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !adjoint_kernel_.get() ){
+        throw std::runtime_error("convolutionOperator::mult_MH failed : kernel is not set");
+      }
+      
+      if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+        throw std::runtime_error("convolutionOperator::mult_MH failed : in/out image dimensions mismatch");
+      }
+
+      bool use_oversampling;
+      if( in->get_number_of_elements() == adjoint_kernel_->get_number_of_elements() )
+        use_oversampling = false;
+      else if( (in->get_number_of_elements()<<D) == adjoint_kernel_->get_number_of_elements() )
+        use_oversampling = true;
+      else{
+        throw std::runtime_error( "convolutionOperator::mult_MH failed : in/out image dimensions mismatch the kernel");
+      }
+      
+      // Working buffer: a scratch array owned by 'scratch' (released on every exit path), or 'out' itself
+      boost::shared_ptr<COMPLEX_ARRAY_TYPE> scratch;
+      if( use_oversampling ){
+        scratch = boost::shared_ptr<COMPLEX_ARRAY_TYPE>( new COMPLEX_ARRAY_TYPE(adjoint_kernel_->get_dimensions()) );
+        pad<ELEMENT_TYPE,D>( in, scratch.get() );
+      }
+      else if( accumulate ){
+        scratch = boost::shared_ptr<COMPLEX_ARRAY_TYPE>( new COMPLEX_ARRAY_TYPE(*in) );
+      }
+      else{ 
+        *out = *in;
+      } 
+      COMPLEX_ARRAY_TYPE *tmp_out = scratch.get() ? scratch.get() : out;
+      
+      // Forwards fft
+      operator_fft( true, tmp_out );
+
+      // Multiply
+      *tmp_out *= *adjoint_kernel_;
+
+      // Inverse fft
+      operator_fft( false, tmp_out );
+
+      // Write back; an oversampled result is cropped first (into a temporary when accumulating)
+      if( use_oversampling && accumulate ){
+        COMPLEX_ARRAY_TYPE cropped( out->get_dimensions() );
+        operator_crop( tmp_out, &cropped );
+        *out += cropped;
+      }
+      else if( use_oversampling ){
+        operator_crop( tmp_out, out );
+      }
+      else if( accumulate ) *out += *tmp_out;
+    }
+
+  protected:
+  
+    virtual void operator_fft( bool forwards_transform, COMPLEX_ARRAY_TYPE *image ) = 0;    
+    virtual void origin_mirror( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out ) = 0;
+
+    virtual void operator_crop( COMPLEX_ARRAY_TYPE *in, COMPLEX_ARRAY_TYPE *out ){
+      typename uint64d<D>::Type offset = from_std_vector<size_t,D>(*(in->get_dimensions().get()))>>2;
+      crop<ELEMENT_TYPE,D>( offset, in, out );
+    }
+    
+  private:
+    boost::shared_ptr<COMPLEX_ARRAY_TYPE> kernel_;
+    boost::shared_ptr<COMPLEX_ARRAY_TYPE> adjoint_kernel_;
+  };
+}
diff --git a/toolboxes/operators/cpu/CMakeLists.txt b/toolboxes/operators/cpu/CMakeLists.txt
new file mode 100644
index 0000000..e8f00e3
--- /dev/null
+++ b/toolboxes/operators/cpu/CMakeLists.txt
@@ -0,0 +1,22 @@
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUOPERATORS__)
+endif (WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+#target_link_libraries(cpuoperators  
+#  )
+
+#install(TARGETS cpuoperators DESTINATION lib)
+
+install(FILES 	
+  hoIdentityOperator.h
+  hoImageOperator.h
+  hoFFTOperator.h
+  hoPartialDerivativeOperator.h
+  hoTvOperator.h
+  hoTvPicsOperator.h
+  DESTINATION include)
diff --git a/toolboxes/operators/cpu/hoFFTOperator.h b/toolboxes/operators/cpu/hoFFTOperator.h
new file mode 100644
index 0000000..716979c
--- /dev/null
+++ b/toolboxes/operators/cpu/hoFFTOperator.h
@@ -0,0 +1,29 @@
+/** \file hoFFTOperator.h
+    \brief Instantiation of the Cartesian FFT operator on the cpu.
+    
+    The file hoFFTOperator.h is a convenience wrapper for the device independent FFTOperator class.
+    The class hoFFTOperator instantiates the FFTOperator for the hoNDArray< std::complex<T> >
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "FFTOperator.h"
+#include "hoFFT.h"
+
+namespace Gadgetron{
+  
+  /** \class hoFFTOperator
+      \brief Instantiation of the Cartesian FFT operator on the cpu.
+      
+      The class hoFFTOperator is a convenience wrapper for the device independent FFTOperator.
+      It instantiates the FFTOperator for type hoNDArray< std::complex<T> >.
+  */
+  template <class T> class hoFFTOperator : public FFTOperator< hoNDArray< std::complex<T> >, hoFFT<T> >
+  {
+  public:    
+    hoFFTOperator() : FFTOperator< hoNDArray< std::complex<T> >, hoFFT<T> >() {}
+    virtual ~hoFFTOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/cpu/hoIdentityOperator.h b/toolboxes/operators/cpu/hoIdentityOperator.h
new file mode 100644
index 0000000..f9876c9
--- /dev/null
+++ b/toolboxes/operators/cpu/hoIdentityOperator.h
@@ -0,0 +1,28 @@
+/** \file hoIdentityOperator.h
+    \brief Instantiation of the identity operator on the cpu.
+    
+    The file hoIdentityOperator.h is a convenience wrapper for the device independent identityOperator class.
+    The class hoIdentityOperator instantiates the identityOperator for the hoNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "identityOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class hoIdentityOperator
+      \brief Instantiation of the identity operator on the cpu.
+      
+      The class hoIdentityOperator is a convenience wrapper for the device independent identityOperator.
+      hoIdentityOperator instantiates the identityOperator for type hoNDArray<T>.
+  */
+  template <class T> class hoIdentityOperator : public identityOperator< hoNDArray<T> >
+  {
+  public:    
+    hoIdentityOperator() : identityOperator< hoNDArray<T> >() {}
+    virtual ~hoIdentityOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/cpu/hoImageOperator.h b/toolboxes/operators/cpu/hoImageOperator.h
new file mode 100644
index 0000000..0a74746
--- /dev/null
+++ b/toolboxes/operators/cpu/hoImageOperator.h
@@ -0,0 +1,58 @@
+/** \file hoImageOperator.h
+    \brief Image regularization operator, CPU based.
+*/
+
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "complext.h"
+#include "imageOperator.h"
+
+#include <cmath>
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template <class T> class hoImageOperator : public imageOperator< hoNDArray<typename realType<T>::Type >, hoNDArray<T> >
+  {
+  public:
+
+    hoImageOperator() : imageOperator< hoNDArray<typename realType<T>::Type >, hoNDArray<T> >() {}
+    virtual ~hoImageOperator() {}    
+
+    typedef typename imageOperator< hoNDArray<typename realType<T>::Type>, hoNDArray<T> >::REAL REAL;
+
+    virtual boost::shared_ptr< linearOperator< hoNDArray<T> > > clone() {
+      return linearOperator< hoNDArray<T> >::clone(this);
+    }
+
+  protected:
+
+    // Estimate offset to the regularization image
+    virtual REAL estimate_offset()
+    {
+      // Estimation based on simple histogram analysis:
+      // bins all intensities relative to max_value, then walks the histogram from the lowest bin until
+      // 1% of the elements are covered and returns the intensity just above the last bin visited.
+      
+      const unsigned int granularity = 50000; 
+      std::vector<unsigned int> histogram(granularity,0);
+      REAL max_value = this->image_->at(amax(this->image_.get()));
+      REAL *d = this->image_->get_data_ptr();
+      const size_t num_elements = this->image_->get_number_of_elements();
+      if( max_value == REAL(0) ) return REAL(0); // avoid dividing by zero below; the result scales with max_value anyway
+      for( size_t i=0; i<num_elements; i++) {
+        unsigned int bin = std::min(static_cast<unsigned int>(std::floor((d[i]/max_value)*granularity)), granularity-1);
+        histogram[bin]++;
+      }
+      
+      //Find 1st percentile
+      //
+      unsigned int cumsum = 0, counter = 0;
+      while (counter < granularity && cumsum < (unsigned int)(REAL(0.01)*num_elements)) { // never read past the last bin
+        cumsum += histogram[counter++];
+      }      
+      return REAL(counter+1)*max_value/granularity;
+    }
+  };
+}
diff --git a/toolboxes/operators/cpu/hoPartialDerivativeOperator.h b/toolboxes/operators/cpu/hoPartialDerivativeOperator.h
new file mode 100644
index 0000000..45bd8db
--- /dev/null
+++ b/toolboxes/operators/cpu/hoPartialDerivativeOperator.h
@@ -0,0 +1,110 @@
+/** \file hoPartialDerivativeOperator.h
+\brief Partial derivative regularization operator, CPU based.
+*/
+
+#pragma once
+
+#include "partialDerivativeOperator.h"
+#include "hoNDArray_math.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+    /** \class hoPartialDerivativeOperator
+    \brief CPU implementation of device dependent portions of the partialDerivative operator.
+    */
+    template <class T, unsigned int D> class hoPartialDerivativeOperator
+        : public partialDerivativeOperator<D, hoNDArray<T> >
+    {
+    public:
+
+        hoPartialDerivativeOperator() : 
+          partialDerivativeOperator< D, hoNDArray<T> >(0) {}
+
+          hoPartialDerivativeOperator( size_t dimension ) : 
+          partialDerivativeOperator<D, hoNDArray<T> >( dimension ) {}
+
+          virtual ~hoPartialDerivativeOperator() {}
+
+          virtual void compute_partial_derivative( typename int64d<D>::Type stride, hoNDArray<T> *in,
+              hoNDArray<T> *out, bool accumulate )
+          {
+              if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+                  throw std::runtime_error( "hoPartialDerivativeOperator::compute_partial_derivative : array dimensions mismatch.");
+              }
+
+              if( in->get_number_of_dimensions() != D || out->get_number_of_dimensions() != D ){
+                  throw std::runtime_error("hoPartialDerivativeOperator::compute_partial_derivative : dimensionality mismatch");
+              }
+
+              typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+              // Hoisted out of the loop: data pointers and the element count as a signed value (matches the loop index)
+              T *in_data = in->get_data_ptr();
+              T *out_data = out->get_data_ptr();
+              const long long num_elements = static_cast<long long>( in->get_number_of_elements() );
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+              for( long long idx=0; idx<num_elements; idx++ ) {
+                  // Coordinate of this element and of its neighbour at offset 'stride' (periodic wrap-around)
+                  typename int64d<D>::Type co = idx_to_co<D>(idx, dims);
+                  typename int64d<D>::Type coN = (co+dims+stride)%dims;
+
+                  const T valN = in_data[co_to_idx<D>(coN, dims)];
+                  const T valC = in_data[co_to_idx<D>(co, dims)];
+                  const T val = valN-valC;
+
+                  if( accumulate )
+                      out_data[idx] += val;
+                  else
+                      out_data[idx] = val;
+              }
+          }
+
+          virtual void compute_second_order_partial_derivative( typename int64d<D>::Type forwards_stride,
+              typename int64d<D>::Type adjoint_stride, 
+              hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+          {
+              if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+                  throw std::runtime_error( "hoPartialDerivativeOperator::compute_second_order_partial_derivative : array dimensions mismatch.");
+              }
+
+              if( in->get_number_of_dimensions() != D || out->get_number_of_dimensions() != D ){
+                  throw std::runtime_error( "hoPartialDerivativeOperator::compute_second_order_partial_derivative : dimensionality mismatch");
+              }
+
+              typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+              // Hoisted out of the loop: data pointers and the element count as a signed value (matches the loop index)
+              T *in_data = in->get_data_ptr();
+              T *out_data = out->get_data_ptr();
+              const long long num_elements = static_cast<long long>( in->get_number_of_elements() );
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+              for( long long idx=0; idx<num_elements; idx++ ) {
+                  // Coordinates of this element and of its two neighbours (periodic wrap-around)
+                  typename int64d<D>::Type co = idx_to_co<D>(idx, dims);
+                  typename int64d<D>::Type coN1 = (co+dims+forwards_stride)%dims;
+                  typename int64d<D>::Type coN2 = (co+dims+adjoint_stride)%dims;
+
+                  const T valN1 = in_data[co_to_idx<D>(coN1, dims)];
+                  const T valN2 = in_data[co_to_idx<D>(coN2, dims)];
+                  const T valC = in_data[co_to_idx<D>(co, dims)];
+                  const T val = valC+valC-valN1-valN2;
+
+                  if( accumulate )
+                      out_data[idx] += val;
+                  else
+                      out_data[idx] = val;
+              }
+          }
+
+          virtual boost::shared_ptr< linearOperator< hoNDArray<T> > > clone() {
+              return linearOperator< hoNDArray<T> >::clone(this);
+          }    
+    };
+}
diff --git a/toolboxes/operators/cpu/hoTvOperator.h b/toolboxes/operators/cpu/hoTvOperator.h
new file mode 100644
index 0000000..6b2d609
--- /dev/null
+++ b/toolboxes/operators/cpu/hoTvOperator.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "generalOperator.h"
+
+#include "vector_td_operators.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+template<class T, unsigned int D> class hoTvOperator
+		: public generalOperator< hoNDArray<T> >
+{
+protected:
+	typedef typename realType<T>::Type REAL;
+
+public:
+	hoTvOperator() : generalOperator< hoNDArray<T> >(){
+		limit_ = REAL(1e-8);
+	}
+
+	virtual ~hoTvOperator() {}
+	// Terms whose local gradient magnitude does not exceed this limit are skipped in gradient()
+	void set_limit(REAL limit){
+		limit_ = limit;
+	}
+	// Adds weight_ times the per-element result to out_array; out_array is cleared first unless 'accumulate' is set
+	virtual void gradient( hoNDArray<T> *in_array, hoNDArray<T> *out_array, bool accumulate=false )
+	{
+		if (!in_array || !out_array || in_array->get_number_of_elements() != out_array->get_number_of_elements()){
+			throw std::runtime_error("hoTvOperator: input/output array dimensions mismatch");
+		}
+
+		T* in = in_array->get_data_ptr();
+		T* out = out_array->get_data_ptr();
+
+		vector_td<unsigned int,D> dims = from_std_vector<unsigned int, D>(*(in_array->get_dimensions()));
+
+		if (!accumulate)
+			clear(out_array);
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+		for (int idx=0; idx < in_array->get_number_of_elements(); idx++){
+
+			T xi = in[idx];
+			T result = T(0);
+
+			vector_td<unsigned int,D> co = idx_to_co<D>(idx, dims);
+
+			REAL grad = gradient_(in,dims,co);
+			// Term of this element's own gradient magnitude: D*xi minus the D forward neighbours, divided by grad
+			if (grad > limit_) {
+				result += REAL(D)*xi/grad;
+				for (int i = 0; i < D; i++){
+					co[i]+=1;
+					result -= in[co_to_idx<D>((co+dims)%dims,dims)]/grad;
+					co[i]-=1;
+				}
+			}
+			// Terms of the D backward neighbours, each divided by that neighbour's gradient magnitude
+			for (int i = 0; i < D; i++){
+				co[i]-=1; // may wrap below zero (unsigned); (co+dims)%dims maps it back to the periodic index
+				grad = gradient_(in,dims,co);
+				if (grad > limit_) {
+					result +=(xi-in[co_to_idx<D>((co+dims)%dims,dims)])/grad;
+				}
+				co[i]+=1;
+			}
+			out[idx] += this->weight_*result;
+		}
+	}
+
+	// Returns the sum over all elements of weight_ times the local gradient magnitude (see gradient_)
+	virtual REAL magnitude( hoNDArray<T> *in_array )
+	{
+		if (!in_array) throw std::runtime_error("hoTvOperator: null input array provided");
+		T* in = in_array->get_data_ptr();
+
+		vector_td<unsigned int,D> dims = from_std_vector<unsigned int, D>(*(in_array->get_dimensions()));
+
+		REAL result =0;
+#ifdef USE_OMP
+#pragma omp parallel for reduction(+:result)
+#endif
+		for (int idx=0; idx < in_array->get_number_of_elements(); idx++){
+			vector_td<unsigned int,D> co = idx_to_co<D>(idx, dims);
+			REAL grad = gradient_(in,dims,co);
+			result += this->weight_*grad;
+		}
+
+		return result;
+	}
+
+private:
+	// sqrt of the summed norm() of the differences to the D forward neighbours, with periodic indexing
+	REAL inline gradient_(T* in, const vector_td<unsigned int,D> dims, vector_td<unsigned int,D> co)
+	{
+		REAL grad = REAL(0);
+		T xi = in[co_to_idx<D>((co+dims)%dims,dims)];
+		for (int i = 0; i < D; i++){
+			co[i]+=1;
+			T dt = in[co_to_idx<D>((co+dims)%dims,dims)];
+			grad += norm(xi-dt);
+			co[i]-=1;
+		}
+		return std::sqrt(grad);
+	}
+
+protected:
+	REAL limit_;
+};
+}
diff --git a/toolboxes/operators/cpu/hoTvPicsOperator.h b/toolboxes/operators/cpu/hoTvPicsOperator.h
new file mode 100644
index 0000000..fd5fb6e
--- /dev/null
+++ b/toolboxes/operators/cpu/hoTvPicsOperator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "hoTvOperator.h"
+#include "tvPicsOperator.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class hoTvPicsOperator 
+    : public tvPicsOperator< hoNDArray<T>, hoTvOperator<T,D>, typename realType<T>::Type >
+  {
+  public:
+    hoTvPicsOperator() : tvPicsOperator< hoNDArray<T>, hoTvOperator<T,D>, typename realType<T>::Type >() {}
+    virtual ~hoTvPicsOperator() {}
+  };    
+}
diff --git a/toolboxes/operators/diagonalOperator.h b/toolboxes/operators/diagonalOperator.h
new file mode 100644
index 0000000..ffe6361
--- /dev/null
+++ b/toolboxes/operators/diagonalOperator.h
@@ -0,0 +1,78 @@
+/** \file diagonalOperator.h
+    \brief Base class for the diagonal matrix operators.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron {
+
+  template <class ARRAY_TYPE> class diagonalOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+    diagonalOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~diagonalOperator() {}
+  
+    // Set/get diagonal; set_diagonal() also stores conj(diagonal) for use by mult_MH and mult_MH_M
+    virtual void set_diagonal( boost::shared_ptr<ARRAY_TYPE> diagonal ) { 
+      diagonal_ = diagonal;
+      diagonal_conj_ = conj(diagonal.get());
+    }
+
+    virtual boost::shared_ptr<ARRAY_TYPE> get_diagonal() { return diagonal_; }
+  
+    // Apply diagonal operator (twice): 'in' multiplied by diagonal_ and by diagonal_conj_; added to 'out' when accumulating
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {    
+      if( !diagonal_.get() ) throw std::runtime_error("diagonalOperator::mult_MH_M failed : diagonal is not set");
+      if( accumulate ) {
+        ARRAY_TYPE tmp(*in);
+        tmp *= *diagonal_;
+        tmp *= *diagonal_conj_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        *out *= *diagonal_;
+        *out *= *diagonal_conj_;
+      }
+    }
+  
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !diagonal_.get() ) throw std::runtime_error("diagonalOperator::mult_M failed : diagonal is not set");
+      if( accumulate ) {
+        ARRAY_TYPE tmp(*in);
+        tmp *= *diagonal_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        *out *= *diagonal_;
+      }
+    }
+  
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !diagonal_conj_.get() ) throw std::runtime_error("diagonalOperator::mult_MH failed : diagonal is not set");
+      if( accumulate ) {
+        ARRAY_TYPE tmp(*in);
+        tmp *= *diagonal_conj_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        *out *= *diagonal_conj_;
+      }
+    }
+    
+    virtual boost::shared_ptr< linearOperator<ARRAY_TYPE> > clone() {
+      return linearOperator<ARRAY_TYPE>::clone(this);
+    }
+  
+  protected:
+    boost::shared_ptr<ARRAY_TYPE> diagonal_;
+    boost::shared_ptr<ARRAY_TYPE> diagonal_conj_;
+  };
+}
diff --git a/toolboxes/operators/downsampleOperator.h b/toolboxes/operators/downsampleOperator.h
new file mode 100644
index 0000000..913864e
--- /dev/null
+++ b/toolboxes/operators/downsampleOperator.h
@@ -0,0 +1,51 @@
+/** \file downsampleOperator.h
+    \brief Base class for the downsampling operators.
+
+    For instantiation we refer to
+    - the class(/file) cuDownsamplingOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE, unsigned int D> class downsampleOperator
+    : public linearOperator<ARRAY_TYPE>
+  {
+    
+  public:
+
+    typedef typename ARRAY_TYPE::element_type T;
+
+    downsampleOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~downsampleOperator() {}
+    
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ){
+        boost::shared_ptr<ARRAY_TYPE> tmp = downsample<T,D>(in);
+        *out += *tmp;
+      }
+      else
+        downsample<T,D>(in,out);
+    }
+    
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( accumulate ){
+        boost::shared_ptr<ARRAY_TYPE> tmp = upsample<T,D>(in);
+        *out += *tmp;
+      }
+      else
+        upsample<T,D>(in,out);
+    }
+
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+    {
+      return linearOperator<ARRAY_TYPE>::clone(this);
+    }    
+  };
+}
diff --git a/toolboxes/operators/encodedImageOperator.h b/toolboxes/operators/encodedImageOperator.h
new file mode 100644
index 0000000..bcb48bb
--- /dev/null
+++ b/toolboxes/operators/encodedImageOperator.h
@@ -0,0 +1,48 @@
+/** \file encodedImageOperator.h
+    \brief Regularization operator for encoded images. Careful, only implements mult_MH_M and not (yet) mult_M and mult_MH.
+*/
+
+#pragma once
+
+#include "imageOperator.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE_REAL, class ARRAY_TYPE_OPERATOR> class encodedImageOperator
+    : public imageOperator<ARRAY_TYPE_REAL, ARRAY_TYPE_OPERATOR>
+  {
+  
+  public:
+  
+    encodedImageOperator() : imageOperator<ARRAY_TYPE_REAL, ARRAY_TYPE_OPERATOR>() {}
+    virtual ~encodedImageOperator() {}
+ 
+    // Set encoding operator for the regularization image
+    virtual void set_encoding_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE_OPERATOR> > encoding_operator )
+
+    {
+      encoding_operator_ = encoding_operator;
+    }
+  
+    // Apply regularization image operator
+    virtual void mult_MH_M( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {    
+      if( !encoding_operator_.get() ){
+	throw std::runtime_error("encodedImageOperator::mult_MH_M failed : encoding operator not set");
+      }
+    
+      ARRAY_TYPE_OPERATOR tmp(in->get_dimensions());
+
+      encoding_operator_->mult_M( in, &tmp );
+ 
+      ARRAY_TYPE_OPERATOR tmp2(in->get_dimensions());
+
+      imageOperator<ARRAY_TYPE_REAL, ARRAY_TYPE_OPERATOR>::mult_MH_M( &tmp, &tmp2 );
+    
+      encoding_operator_->mult_MH( &tmp2, out, accumulate );
+    }  
+  
+  private:
+    boost::shared_ptr< linearOperator<ARRAY_TYPE_OPERATOR> > encoding_operator_;
+  };
+}
diff --git a/toolboxes/operators/encodingOperatorContainer.h b/toolboxes/operators/encodingOperatorContainer.h
new file mode 100644
index 0000000..1944eaf
--- /dev/null
+++ b/toolboxes/operators/encodingOperatorContainer.h
@@ -0,0 +1,235 @@
+/** \file encodingOperatorContainer.h
+    \brief An encoding operator that can contain multiple other encoding operators. Use it when more than one encoding operator is required in a solver.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+#include <iostream>
+#include <vector>
+#include <boost/smart_ptr.hpp>
+#include <sstream>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class encodingOperatorContainer : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+
+    encodingOperatorContainer() : linearOperator<ARRAY_TYPE>(), num_elements_(0) {}
+    virtual ~encodingOperatorContainer(){}
+
+    // The domain and codomain dimensions of this container cannot be set directly;
+    // they follow from the contained operators. Both setters below always throw
+    // std::runtime_error.
+    virtual void set_domain_dimensions( std::vector<size_t>* ){
+      throw std::runtime_error( "Error: encodingOperatorContainer::set_domain_dimensions() : operation not supported." );
+    }
+
+    virtual void set_codomain_dimensions( std::vector<size_t>* ){
+      throw std::runtime_error( "Error: encodingOperatorContainer::set_codomain_dimensions() : operation not supported." );
+    }
+
+    // Get domain and codomain dimensions:
+    // The domain must match between the individual operators (std::runtime_error otherwise).
+    // The codomain is a concatenation of the individual operators' codomains and is reported
+    // as one dimension holding the total element count. Both throw if the container is empty.
+    virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions()
+    {
+      if( operators_.size() == 0 ){
+        throw std::runtime_error( "Error: encodingOperatorContainer::get_domain_dimensions() : no operators present." );
+      }
+
+      boost::shared_ptr< std::vector<size_t> > dims = (operators_[0])->get_domain_dimensions();
+      for( size_t i=1; i<operators_.size(); i++ )
+        if( *dims != *((operators_[i])->get_domain_dimensions()) ){
+          throw std::runtime_error( "Error: encodingOperatorContainer::get_domain_dimensions() : inconsistent operator dimensions." );
+        }
+      return dims;
+    }
+
+    virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions()
+    {
+      if( num_elements_ == 0 ){
+        throw std::runtime_error( "Error: encodingOperatorContainer::get_codomain_dimensions() : no operators present." );
+      }
+
+      boost::shared_ptr< std::vector<size_t> > dims( new std::vector<size_t>() );
+      dims->push_back(num_elements_);
+      return dims;
+    }
+
+    // Get domain and codomain for the individual operators.
+    // Both throw std::runtime_error when i is not a valid operator index.
+    virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions(size_t i)
+    {
+      if( i>=operators_.size() )
+        throw std::runtime_error("encodingOperatorContainer::get_domain_dimensions : illegal index provided");
+      return operators_[i]->get_domain_dimensions();
+    }
+
+    virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(size_t i)
+    {
+      if( i>=operators_.size() )
+        throw std::runtime_error("encodingOperatorContainer::get_codomain_dimensions : illegal index provided");
+      return operators_[i]->get_codomain_dimensions();
+    }
+
+    // Allocate an array of the codomain dimensions
+    // (throws if no operators have been added, see get_codomain_dimensions()).
+    boost::shared_ptr< ARRAY_TYPE> create_codomain()
+    {
+      return boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(get_codomain_dimensions()));
+    }
+
+    // Concatenate a vector of codomains into a single array. codoms[i] must have the codomain
+    // dimensions of operator i; the inputs are copied back to back in operator order.
+    boost::shared_ptr< ARRAY_TYPE> create_codomain( std::vector<ARRAY_TYPE*> codoms )
+    {
+      if (codoms.size() != operators_.size())
+        throw std::runtime_error("encodingOperatorContainer::create_codomain: number of operators and number of codomains do not match");
+
+      boost::shared_ptr<ARRAY_TYPE> codomain(new ARRAY_TYPE(get_codomain_dimensions()));
+      size_t offset = 0;
+
+      for (size_t i = 0; i < operators_.size(); i++){
+
+        if (!codoms[i]->dimensions_equal(get_codomain_dimensions(i).get())){
+          std::stringstream ss;
+          ss << "encodingOperatorContainer::create_codomain: input codomain " << i << " does not match corresponding operator codomain" << std::endl;
+          ss << "Input codomain: ";
+          std::vector<size_t> ico = *codoms[i]->get_dimensions();
+          for (size_t k = 0; k < ico.size(); k++) ss << ico[k] << " ";
+          ss << std::endl;
+          ss << "Operator codomain: ";
+          ico = *get_codomain_dimensions(i);
+          for (size_t k = 0; k < ico.size(); k++) ss << ico[k] << " ";
+          ss << std::endl;
+          throw std::runtime_error(ss.str());
+        }
+
+        // 'slice' is created on top of the concatenated buffer at 'offset'; assigning copies codoms[i] into it
+        ARRAY_TYPE slice;
+        slice.create(codoms[i]->get_dimensions().get(),codomain->get_data_ptr()+offset);
+        slice = *codoms[i];
+        offset += slice.get_number_of_elements();
+      }
+
+      return codomain;
+    }
+
+    // Get individual operators
+    // (throws std::runtime_error when i is not a valid operator index).
+    boost::shared_ptr< linearOperator<ARRAY_TYPE> > get_operator(size_t i)
+    {
+      if( i>=operators_.size() )
+        throw std::runtime_error("encodingOperatorContainer::get_operator : illegal index provided");
+      return operators_[i];
+    }
+
+    // Get pointer offset (in elements) into codomain for individual operators "sub-codomains"
+    // (throws std::runtime_error when i is not a valid operator index).
+    size_t get_offset(size_t i)
+    {
+      if( i>=operators_.size() )
+        throw std::runtime_error("encodingOperatorContainer::get_offset : illegal index provided");
+      return offsets_[i];
+    }
+
+    // Add operator to the container. The operator must be non-null and must already have
+    // non-empty codomain dimensions with a non-zero element count; otherwise std::runtime_error.
+    void add_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op )
+    {
+      if( !op.get() ){
+        throw std::runtime_error("encodingOperatorContainer::add_operator : null operator provided");
+      }
+
+      boost::shared_ptr< std::vector<size_t> > codomain = op->get_codomain_dimensions();
+      if( codomain->size() == 0 ){
+        throw std::runtime_error("encodingOperatorContainer::add_operator : codomain dimensions not set on operator");
+      }
+
+      size_t elements = 1;
+      for (size_t i=0; i<codomain->size(); i++){
+        elements *= codomain->at(i);
+      }
+
+      if( elements == 0 ){
+        throw std::runtime_error("encodingOperatorContainer::add_operator : illegal codomain dimensions on operator");
+      }
+
+      offsets_.push_back(num_elements_); // the new sub-codomain starts where the previous ones end (0 for the first)
+      num_elements_ += elements;
+      operators_.push_back(op);
+    }
+
+    // Apply every operator to 'in'; operator i writes into 'out' at element offset offsets_[i].
+    // 'accumulate' is forwarded unchanged and the operator weights are not applied here.
+    virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      for (size_t i=0; i<operators_.size(); i++){
+        ARRAY_TYPE tmp_data(operators_[i]->get_codomain_dimensions(),out->get_data_ptr()+offsets_[i]);
+        operators_[i]->mult_M( in, &tmp_data, accumulate );
+      }
+    }
+
+    // out (+)= sum_i weight_i * op_i^H( sub-codomain i of 'in' ); the first term overwrites 'out' unless 'accumulate' is set.
+    virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      ARRAY_TYPE tmp_image(get_domain_dimensions());
+
+      for (size_t i=0; i<operators_.size(); i++){
+
+        boost::shared_ptr< linearOperator<ARRAY_TYPE> > op = operators_[i];
+        ARRAY_TYPE tmp_data(op->get_codomain_dimensions(),in->get_data_ptr()+offsets_[i]);
+
+        // This operator is special in that it needs to apply the "internal" operator weights:
+        // each partial result is computed unweighted and scaled by get_weight() when it is merged.
+        op->mult_MH( &tmp_data, &tmp_image );
+
+        if( i == 0 && !accumulate ){
+          *out = tmp_image;
+          *out *= op->get_weight();
+        }
+        else {
+          axpy( op->get_weight(), &tmp_image, out );
+        }
+      }
+    }
+
+    // out (+)= sum_i weight_i * op_i^H op_i( in ); the first term overwrites 'out' unless 'accumulate' is set.
+    virtual void mult_MH_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      ARRAY_TYPE tmp_image(get_domain_dimensions());
+
+      for (size_t i=0; i<operators_.size(); i++){
+
+        boost::shared_ptr< linearOperator<ARRAY_TYPE> > op = operators_[i];
+
+        // This operator is special in that it needs to apply the "internal" operator weights:
+        // each partial result is computed unweighted and scaled by get_weight() when it is merged.
+        op->mult_MH_M( in, &tmp_image );
+
+        if( i == 0 && !accumulate ){
+          *out = tmp_image;
+          *out *= op->get_weight();
+        }
+        else {
+          axpy( op->get_weight(), &tmp_image, out ) ;
+        }
+      }
+    }
+
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE> > clone()
+    {
+      return linearOperator< ARRAY_TYPE >::clone(this); // delegates to the base class helper
+    }
+
+  protected:
+    std::vector< boost::shared_ptr< linearOperator<ARRAY_TYPE> > > operators_;
+    std::vector<size_t> offsets_;
+    size_t num_elements_;
+  };
+}
diff --git a/toolboxes/operators/generalOperator.h b/toolboxes/operators/generalOperator.h
new file mode 100644
index 0000000..4103771
--- /dev/null
+++ b/toolboxes/operators/generalOperator.h
@@ -0,0 +1,89 @@
+/** \file generalOperator.h
+    \brief Base class for all operators on which we can compute a gradient.
+*/
+
+#pragma once
+
+
+#include "complext.h"
+
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <class ARRAY> class generalOperator
+  {
+  public:
+
+    typedef typename ARRAY::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    /** Creates an operator with unit weight and empty domain dimensions. */
+    generalOperator() : weight_(REAL(1)) {}
+
+    /** Creates an operator with unit weight and the given domain dimensions (a null pointer throws). */
+    generalOperator(std::vector<size_t> *dims) : weight_(REAL(1))
+    {
+      this->set_domain_dimensions(dims);
+    }
+
+    virtual ~generalOperator() {}
+
+    /**
+     * @brief Evaluates the gradient of the operator at the point "in".
+     * @param[in] in Point at which the gradient is evaluated
+     * @param[in,out] out Receives the gradient
+     * @param[in] accumulate When false the output array is overwritten, otherwise the result is added to it.
+     */
+    virtual void gradient( ARRAY *in, ARRAY *out, bool accumulate = false ) = 0;
+
+    /**
+     * @brief Evaluates the function value of the operator.
+     * @param[in] in Point at which the value is evaluated
+     * @return Function value at the point "in"
+     */
+    virtual REAL magnitude( ARRAY *in ) = 0;
+
+    /**
+     * @brief Stores a copy of the domain dimensions (image size) of the operator.
+     * @param[in] dims Domain dimensions; a null pointer raises std::runtime_error
+     */
+    virtual void set_domain_dimensions( std::vector<size_t> *dims )
+    {
+      if( !dims ){
+        throw std::runtime_error("Null pointer provided");
+      }
+      domain_dims_ = *dims;
+    }
+
+    /**
+     * @return A newly allocated copy of the domain dimensions (image size) of the operator
+     */
+    virtual boost::shared_ptr< std::vector<size_t> > get_domain_dimensions()
+    {
+      return boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t>(domain_dims_) );
+    }
+
+    /**
+     * @brief Sets the weight of the operator.
+     * @param[in] weight New weight
+     */
+    virtual void set_weight( REAL weight ){ weight_ = weight; }
+
+    /** @return Current weight of the operator */
+    virtual REAL get_weight(){ return weight_; }
+
+    // Class-specific allocation functions: storage is requested as a char array through the
+    // global operator new[] and released through the matching delete[]. The placement form
+    // simply hands back the address it was given.
+    void* operator new (size_t bytes) { return ::new char[bytes]; }
+    void operator delete (void *ptr) { delete [] static_cast <char *> (ptr); }
+    void * operator new(size_t s, void * p) { return p; }
+
+  protected:
+    REAL weight_;
+    std::vector<size_t> domain_dims_;
+  };
+}
diff --git a/toolboxes/operators/gpu/CMakeLists.txt b/toolboxes/operators/gpu/CMakeLists.txt
new file mode 100644
index 0000000..fa7691d
--- /dev/null
+++ b/toolboxes/operators/gpu/CMakeLists.txt
@@ -0,0 +1,51 @@
+# gpuoperators: shared CUDA library with the GPU operator implementations, plus its public headers.
+if (WIN32)
+  # NOTE(review): presumably selects the export side of gpuoperators_export.h -- confirm
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUOPERATORS__)
+  link_directories(${Boost_LIBRARY_DIRS})
+endif (WIN32)
+
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/nfft/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+cuda_add_library(gpuoperators SHARED
+  cuPartialDerivativeOperator.cu
+  cuLaplaceOperator.cu
+  cuTvOperator.cu
+  cuTv1dOperator.cu
+  cuConvolutionOperator.cu
+  )
+
+target_link_libraries(gpuoperators
+  gpucore
+  gpunfft
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  ${CUDA_CUBLAS_LIBRARIES}
+  )
+
+install(TARGETS gpuoperators DESTINATION lib)
+
+install(FILES
+  cuImageOperator.h
+  cuDiagonalOperator.h
+  cuPartialDerivativeOperator.h
+  cuConvolutionOperator.h
+  cuLaplaceOperator.h
+  cuTvOperator.h
+  cuTvPicsOperator.h
+  cuTv1dOperator.h
+  cuDownsampleOperator.h
+  cuFFTOperator.h
+  cuIdentityOperator.h
+  cuUpsampleOperator.h
+  hoCuTvOperator.h
+  hoCuTvPicsOperator.h
+  hoCuEncodingOperatorContainer.h
+  gpuoperators_export.h
+  hoCuOperator.h
+  DESTINATION include)
diff --git a/toolboxes/operators/gpu/cuConvolutionOperator.cu b/toolboxes/operators/gpu/cuConvolutionOperator.cu
new file mode 100644
index 0000000..967243d
--- /dev/null
+++ b/toolboxes/operators/gpu/cuConvolutionOperator.cu
@@ -0,0 +1,89 @@
+#include "cuConvolutionOperator.h"
+#include "vector_td_utilities.h"
+#include "cudaDeviceManager.h"
+#include "setup_grid.h"
+
+namespace Gadgetron {
+
+  // Mirror kernel: the element at coordinate c is written to coordinate (matrix_size - c),
+  // except that a mirrored coordinate equal to matrix_size is folded back to 0 (origin stays put).
+  template<class T, unsigned int D> __global__ void
+  origin_mirror_kernel( vector_td<unsigned int,D> matrix_size, vector_td<unsigned int,D> origin, T *in, T *out, bool zero_fill )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    if( idx >= prod(matrix_size) )
+      return;
+
+    vector_td<unsigned int,D> src_co = idx_to_co<D>( idx, matrix_size );
+    vector_td<unsigned int,D> dst_co = matrix_size-src_co;
+
+    // A source coordinate of 0 mirrors to matrix_size, one past the end: fold it back to 0
+    bool wrapped = false;
+    for( unsigned int d=0; d<D; d++ ){
+      if( dst_co.vec[d] == matrix_size.vec[d] ){
+        dst_co.vec[d] = 0;
+        wrapped = true;
+      }
+    }
+
+    const unsigned int src_idx = co_to_idx<D>(src_co, matrix_size);
+    const unsigned int dst_idx = co_to_idx<D>(dst_co, matrix_size);
+
+    // Folded destinations are cleared when zero_fill is set; everything else is copied.
+    // NOTE(review): the 'origin' argument is not read anywhere in this kernel.
+    out[dst_idx] = ( wrapped && zero_fill ) ? T(0) : in[src_idx];
+  }
+  
+  // Host side of origin_mirror_kernel: mirrors 'in' around the origin into 'out' -- !! the origin itself stays !!
+  // The kernel is launched with zero_fill enabled, so the space created "on the left" is set to zero.
+  template<class REAL, unsigned int D> void
+  cuConvolutionOperator<REAL,D>::origin_mirror( cuNDArray< complext<REAL> > *in, cuNDArray< complext<REAL> > *out )
+  {
+    // Argument checks: both arrays present, identical dimensions, exactly D dimensions
+    if( !in || !out ){
+      throw std::runtime_error( "origin_mirror: 0x0 ndarray provided");
+    }
+    if( !in->dimensions_equal(out) ){
+      throw std::runtime_error("origin_mirror: image dimensions mismatch");
+    }
+    if( in->get_number_of_dimensions() != D ){
+      std::stringstream msg;
+      msg << "origin_mirror: number of image dimensions is not " << D;
+      throw std::runtime_error(msg.str());
+    }
+
+    typename uint64d<D>::Type extent = from_std_vector<size_t,D>( *in->get_dimensions() );
+
+    // Launch configuration is filled in by setup_grid from the element count
+    dim3 threads_per_block;
+    dim3 blocks_per_grid;
+    setup_grid( prod(extent), &threads_per_block, &blocks_per_grid );
+
+    // Launch: arguments are the extent, extent>>1 as origin, the two buffers and zero_fill=true
+    origin_mirror_kernel<complext<REAL>,D><<< blocks_per_grid, threads_per_block >>>
+      ( vector_td<unsigned int,D>(extent), vector_td<unsigned int,D>(extent>>1), in->get_data_ptr(), out->get_data_ptr(), true );
+
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+
+  template <class REAL, unsigned int D> void
+  cuConvolutionOperator<REAL,D>::operator_fft( bool forwards_transform, cuNDArray< complext<REAL> > *image )
+  { // Hands 'image' to cuNDFFT: ifft for an inverse transform, fft for a forward one
+    if( !forwards_transform )
+      cuNDFFT<REAL>::instance()->ifft(image);
+    else
+      cuNDFFT<REAL>::instance()->fft(image);
+  }
+  
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,1>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,2>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,3>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<float,4>;
+
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,1>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,2>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,3>;
+  template EXPORTGPUOPERATORS class cuConvolutionOperator<double,4>;
+  
+}
diff --git a/toolboxes/operators/gpu/cuConvolutionOperator.h b/toolboxes/operators/gpu/cuConvolutionOperator.h
new file mode 100644
index 0000000..ac23cd4
--- /dev/null
+++ b/toolboxes/operators/gpu/cuConvolutionOperator.h
@@ -0,0 +1,33 @@
+/** \file cuConvolutionOperator.h
+    \brief Convolution operator, GPU based.
+*/
+
+#pragma once
+
+
+#include "cuNDArray_math.h"
+#include "cuNDFFT.h"
+#include "vector_td_utilities.h"
+#include "convolutionOperator.h"
+#include "gpuoperators_export.h"
+
+namespace Gadgetron{
+
+  template <class REAL, unsigned int D> class EXPORTGPUOPERATORS cuConvolutionOperator
+    : public convolutionOperator<cuNDArray<complext<REAL> >, D >
+  {
+    typedef cuNDArray< complext<REAL> > array_type;
+  public:
+    cuConvolutionOperator() : convolutionOperator<array_type, D>() {}
+    virtual ~cuConvolutionOperator() {}
+
+    // Forward (true) or inverse (false) FFT of 'image'; defined in cuConvolutionOperator.cu
+    virtual void operator_fft( bool forwards_transform, array_type *image );
+    // Mirror 'in' around the origin into 'out'; defined in cuConvolutionOperator.cu
+    virtual void origin_mirror( array_type *in, array_type *out );
+
+    // Delegates to linearOperator::clone(this)
+    virtual boost::shared_ptr< linearOperator<array_type> > clone()
+    { return linearOperator<array_type>::clone(this); }
+  };
+}
diff --git a/toolboxes/operators/gpu/cuDiagonalOperator.h b/toolboxes/operators/gpu/cuDiagonalOperator.h
new file mode 100644
index 0000000..5f3c038
--- /dev/null
+++ b/toolboxes/operators/gpu/cuDiagonalOperator.h
@@ -0,0 +1,20 @@
+/** \file cuDiagonalOperator.h
+    \brief Diagonal matrix regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "diagonalOperator.h"
+
+namespace Gadgetron{
+
+  template <class T> class cuDiagonalOperator : public diagonalOperator< cuNDArray<T> > {
+    typedef diagonalOperator< cuNDArray<T> > base_type; // diagonalOperator bound to cuNDArray<T>
+  public:
+    cuDiagonalOperator() : base_type() {}
+    virtual ~cuDiagonalOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/cuDownsampleOperator.h b/toolboxes/operators/gpu/cuDownsampleOperator.h
new file mode 100644
index 0000000..57d3912
--- /dev/null
+++ b/toolboxes/operators/gpu/cuDownsampleOperator.h
@@ -0,0 +1,28 @@
+/** \file cuDownsampleOperator.h
+    \brief Instantiation of the downsampling operator on the gpu.
+    
+    The file cuDownsampleOperator.h is a convenience wrapper for the device independent downsampleOperator class.
+    The class cuDownsampleOperator instantiates the downsampleOperator for the cuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_utils.h"
+#include "downsampleOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class cuDownsampleOperator
+      \brief GPU flavour of the downsample operator.
+      Convenience wrapper that binds the device independent downsampleOperator to cuNDArray<T>.
+      It declares nothing besides a default constructor and a virtual destructor.
+  */
+  template <class T, unsigned int D> class cuDownsampleOperator : public downsampleOperator<cuNDArray<T>,D>
+  {
+    typedef downsampleOperator<cuNDArray<T>,D> base_type;
+  public:
+    cuDownsampleOperator() : base_type() {}
+    virtual ~cuDownsampleOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/cuFFTOperator.h b/toolboxes/operators/gpu/cuFFTOperator.h
new file mode 100644
index 0000000..5316019
--- /dev/null
+++ b/toolboxes/operators/gpu/cuFFTOperator.h
@@ -0,0 +1,29 @@
+/** \file cuFFTOperator.h
+    \brief Instantiation of the Cartesian FFT operator on the gpu.
+    
+    The file cuFFTOperator.h is a convenience wrapper for the device independent FFTOperator class.
+    The class cuFFTOperator instantiates the FFTOperator for cuNDArray< complext<T> >
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "FFTOperator.h"
+#include "cuNDFFT.h"
+
+namespace Gadgetron{
+  
+  /** \class cuFFTOperator
+      \brief GPU flavour of the Cartesian FFT operator.
+      Convenience wrapper that binds the device independent FFTOperator to
+      cuNDArray< complext<T> > together with cuNDFFT<T>.
+  */
+  template <class T> class cuFFTOperator : public FFTOperator< cuNDArray< complext<T> >, cuNDFFT<T> >
+  {
+    typedef FFTOperator< cuNDArray< complext<T> >, cuNDFFT<T> > base_type;
+  public:
+    cuFFTOperator() : base_type() {}
+    virtual ~cuFFTOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/cuIdentityOperator.h b/toolboxes/operators/gpu/cuIdentityOperator.h
new file mode 100644
index 0000000..57158e2
--- /dev/null
+++ b/toolboxes/operators/gpu/cuIdentityOperator.h
@@ -0,0 +1,28 @@
+/** \file cuIdentityOperator.h
+    \brief Instantiation of the identity operator on the gpu.
+    
+    The file cuIdentityOperator.h is a convenience wrapper for the device independent identityOperator class.
+    The class cuIdentityOperator instantiates the identityOperator for the cuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "identityOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class cuIdentityOperator
+      \brief GPU flavour of the identity operator.
+      Convenience wrapper that binds the device independent identityOperator to cuNDArray<T>.
+      It declares nothing besides a default constructor and a virtual destructor.
+  */
+  template <class T> class cuIdentityOperator : public identityOperator< cuNDArray<T> >
+  {
+    typedef identityOperator< cuNDArray<T> > base_type;
+  public:
+    cuIdentityOperator() : base_type() {}
+    virtual ~cuIdentityOperator() {}
+  };
+}
diff --git a/toolboxes/operators/gpu/cuImageOperator.h b/toolboxes/operators/gpu/cuImageOperator.h
new file mode 100644
index 0000000..8b38c54
--- /dev/null
+++ b/toolboxes/operators/gpu/cuImageOperator.h
@@ -0,0 +1,69 @@
+/** \file cuImageOperator.h
+    \brief Image regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "complext.h"
+#include "imageOperator.h"
+
+#include <cmath>
+#include <algorithm>
+
+namespace Gadgetron{
+
+  template <class T> class cuImageOperator : public imageOperator< cuNDArray<typename realType<T>::Type >, cuNDArray<T> >
+  {
+  public:
+
+    typedef typename imageOperator< cuNDArray<typename realType<T>::Type>, cuNDArray<T> >::REAL REAL;
+
+    cuImageOperator() : imageOperator< cuNDArray<typename realType<T>::Type >, cuNDArray<T> >() {}
+    virtual ~cuImageOperator() {}
+
+    // Delegates to linearOperator::clone(this)
+    virtual boost::shared_ptr< linearOperator< cuNDArray<T> > > clone()
+    {
+      return linearOperator< cuNDArray<T> >::clone(this);
+    }
+
+  protected:
+
+    // Local replacement for min(): defining min under Windows/Cuda has caused
+    // real problems, so for now the class carries its own.
+    const unsigned int& my_min(const unsigned int& a, const unsigned int& b) {
+      return (b<a) ? b : a;
+    }
+
+    // Estimate offset to the regularization image.
+    // Simple histogram analysis on a host copy of the image; the hoImageOperator host code is
+    // reused because it is fast enough (<.5 ms on a 192x192 image). The histogram is walked from
+    // the lowest bin until 1% of the elements are covered; the intensity of the bin reached is returned.
+    // NOTE(review): the bin index divides by the maximum value, so intensities are presumably
+    // non-negative with a positive maximum -- confirm.
+    virtual REAL estimate_offset()
+    {
+      const unsigned int granularity = 50000;
+      std::vector<unsigned int> histogram(granularity,0);
+
+      REAL max_value = this->image_->at(amax(this->image_.get()));
+      boost::shared_ptr<hoNDArray<REAL> > host_image = this->image_->to_host();
+      REAL *intensities = host_image->get_data_ptr();
+
+      // Bin every intensity relative to the maximum; indices past the end land in the last bin
+      for( unsigned int i=0; i<this->image_->get_number_of_elements(); i++) {
+        unsigned int bin = static_cast<unsigned int>(std::floor((intensities[i]/max_value)*granularity));
+        histogram[my_min(bin, granularity-1)]++;
+      }
+
+      // Find the 1st percentile: consume bins until 1% of the elements are covered
+      unsigned int covered = 0, bins_used = 0;
+      while (covered < (unsigned int)(REAL(0.01)*this->image_->get_number_of_elements())) {
+        covered += histogram[bins_used++];
+      }
+
+      return  REAL(bins_used+1)*max_value/granularity;
+    }
+  };
+}
diff --git a/toolboxes/operators/gpu/cuLaplaceOperator.cu b/toolboxes/operators/gpu/cuLaplaceOperator.cu
new file mode 100644
index 0000000..ded5974
--- /dev/null
+++ b/toolboxes/operators/gpu/cuLaplaceOperator.cu
@@ -0,0 +1,95 @@
+#include "cuLaplaceOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+  // Compile-time integer power: Pow<base,exponent>::Value == base^exponent for exponent >= 1.
+  // The recursion ends at the exponent==1 specialization; there is no case for exponent 0.
+  template<unsigned int base, unsigned int exponent> struct Pow
+  {
+    enum { Value = base * Pow<base, exponent-1>::Value };
+  };
+
+  // Recursion anchor: base^1
+  template<unsigned int base> struct Pow<base,1>
+  {
+    enum { Value = base };
+  };
+
+  // Visits every offset in {-1,0,1}^(dim+1) around 'co' (periodic wrap) and subtracts each visited element from val.
+  template<class T, unsigned int D, unsigned int dim> struct inner_laplace_functor{
+    static __device__ __inline__ void apply(T& val,const T* in, const typename intd<D>::Type dims,const typename intd<D>::Type co, typename intd<D>::Type& stride){
+      for (int d = -1; d < 2; d++){ stride[dim]=d; inner_laplace_functor<T,D,dim-1>::apply(val,in,dims,co,stride); }
+    }
+  };
+  template<class T, unsigned int D> struct inner_laplace_functor<T,D,0>{
+    static __device__ __inline__ void apply(T& val,const T* in, const typename intd<D>::Type dims,const typename intd<D>::Type co, typename intd<D>::Type& stride){
+      for (int d = -1; d < 2; d++){
+        stride[0]=d;
+        typename intd<D>::Type coN = (co+dims+stride)%dims;
+        val -= in[co_to_idx<D>(coN,dims)];
+      }
+    }
+  };
+
+  // out[idx] = (value accumulated by inner_laplace_functor, starting from 0) + 3^D * in[co].
+  // out[idx] is overwritten; threads with idx beyond the element count do nothing.
+  template<class REAL, class T, unsigned int D> __global__ void
+  laplace_kernel( typename intd<D>::Type dims, T *in, T *out )
+  {
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+    if( idx >= prod(dims) )
+      return;
+
+    typename intd<D>::Type co = idx_to_co<D>(idx, dims);
+    typename intd<D>::Type stride(0);
+
+    T neighbourhood = T(0);
+    inner_laplace_functor<T,D,D-1>::apply(neighbourhood,in,dims,co,stride);
+
+    const REAL centre_weight = (REAL) Pow<3,D>::Value;
+    out[idx] = neighbourhood+in[co_to_idx<D>(co, dims)]*centre_weight;
+  }
+
+  // Runs laplace_kernel on 'in'; adds the result to 'out' when accumulate is set, overwrites 'out' otherwise.
+  template< class T, unsigned int D> void
+  cuLaplaceOperator<T,D>::compute_laplace( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate )
+  {
+    if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){
+      throw std::runtime_error("laplaceOperator::compute_laplace : array dimensions mismatch.");
+    }
+    typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+    dim3 dimBlock( dims[0] ); // NOTE(review): dims[0] threads per block -- confirm it stays within the device limit
+    dim3 dimGrid( prod(dims)/dims[0] );
+
+    // The kernel overwrites its destination, so an accumulating call renders into a scratch array first
+    boost::shared_ptr< cuNDArray<T> > scratch;
+    if( accumulate ) scratch.reset( new cuNDArray<T>(in->get_dimensions()) );
+    T *dst = accumulate ? scratch->get_data_ptr() : out->get_data_ptr();
+    laplace_kernel<typename realType<T>::Type ,T,D><<< dimGrid, dimBlock >>> (dims, in->get_data_ptr(), dst );
+    CHECK_FOR_CUDA_ERROR();
+    if( accumulate ) *out += *scratch;
+  }
+  
+  // Instantiations
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float, 3>;
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float_complext, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float_complext, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<float_complext, 3>;
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double, 3>;
+
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double_complext, 1>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double_complext, 2>;
+  template class EXPORTGPUOPERATORS cuLaplaceOperator<double_complext, 3>;
+}
diff --git a/toolboxes/operators/gpu/cuLaplaceOperator.h b/toolboxes/operators/gpu/cuLaplaceOperator.h
new file mode 100644
index 0000000..4419007
--- /dev/null
+++ b/toolboxes/operators/gpu/cuLaplaceOperator.h
@@ -0,0 +1,28 @@
+/** \file cuLaplaceOperator.h
+    \brief Laplace regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "laplaceOperator.h"
+
+#include "gpuoperators_export.h"
+
+namespace Gadgetron{
+
+  template < class T, unsigned int D> class EXPORTGPUOPERATORS cuLaplaceOperator : public laplaceOperator<D, cuNDArray<T> >
+  {
+    typedef cuNDArray<T> array_type;
+  public:
+    cuLaplaceOperator() : laplaceOperator<D, array_type>() {}
+    virtual ~cuLaplaceOperator() {}
+
+    // Delegates to linearOperator::clone(this)
+    virtual boost::shared_ptr< linearOperator<array_type> > clone()
+    { return linearOperator<array_type>::clone(this); }
+  protected:
+    // GPU implementation, defined in cuLaplaceOperator.cu
+    virtual void compute_laplace( array_type *in, array_type *out, bool accumulate );
+  };
+}
diff --git a/toolboxes/operators/gpu/cuMultiplicationOperatorContainer.h b/toolboxes/operators/gpu/cuMultiplicationOperatorContainer.h
new file mode 100644
index 0000000..c2c00b7
--- /dev/null
+++ b/toolboxes/operators/gpu/cuMultiplicationOperatorContainer.h
@@ -0,0 +1,23 @@
+/** \file cuMultiplicationOperatorContainer.h
+    \brief Operator used to chain together (concatenate) a series of operators by multiplication, GPU version.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "multiplicationOperatorContainer.h"
+
+
+namespace Gadgetron{
+// NOTE(review): linearOperator is given two template arguments here but a single one in the
+// other operators of this commit (e.g. cuLaplaceOperator) -- confirm this header still compiles.
+template <class REAL, class T> class cuMultiplicationOperatorContainer
+  : public multiplicationOperatorContainer< REAL, cuNDArray<T> >
+{
+  typedef linearOperator< REAL, cuNDArray<T> > linear_operator_type;
+public:
+  cuMultiplicationOperatorContainer() : multiplicationOperatorContainer< REAL, cuNDArray<T> >() {}
+  virtual ~cuMultiplicationOperatorContainer() {}
+  virtual boost::shared_ptr< linear_operator_type > clone(){ return linear_operator_type::clone(this); }
+};
+}
diff --git a/toolboxes/operators/gpu/cuPartialDerivativeOperator.cu b/toolboxes/operators/gpu/cuPartialDerivativeOperator.cu
new file mode 100644
index 0000000..0f5e28c
--- /dev/null
+++ b/toolboxes/operators/gpu/cuPartialDerivativeOperator.cu
@@ -0,0 +1,145 @@
+/** \file cuPartialDerivativeOperator.cu
+    \brief Implementation of the partial derivative operator for the gpu.
+*/
+
+#include "cuPartialDerivativeOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> __global__ void
+  first_order_partial_derivative_kernel( typename intd<D>::Type stride, 
+                                         typename intd<D>::Type dims, 
+                                         T *in, T *out )
+  {
+    // One thread per element: linear index built from a 1D block inside a 2D grid.
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+
+    // Threads whose index falls beyond the array have nothing to do.
+    if( idx >= prod(dims) )
+      return;
+
+    // Coordinate of this element and of the element 'stride' away (wrapped with the modulo).
+    typename intd<D>::Type center = idx_to_co<D>(idx, dims);
+    typename intd<D>::Type neighbour = (center+dims+stride)%dims;
+
+    T neighbour_value = in[co_to_idx<D>(neighbour, dims)];
+    T center_value = in[co_to_idx<D>(center, dims)];
+
+    out[idx] += neighbour_value-center_value;
+  }
+
+  template<class T, unsigned int D> __global__ void
+  second_order_partial_derivative_kernel( typename intd<D>::Type forwards_stride, 
+                                          typename intd<D>::Type adjoint_stride, 
+                                          typename intd<D>::Type dims, 
+                                          T *in, T *out )
+  {
+    // One thread per element: linear index built from a 1D block inside a 2D grid.
+    const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+
+    // Threads whose index falls beyond the array have nothing to do.
+    if( idx >= prod(dims) )
+      return;
+
+    // This element's coordinate plus the two neighbours selected by the strides (wrapped with the modulo).
+    typename intd<D>::Type center = idx_to_co<D>(idx, dims);
+    typename intd<D>::Type forwards = (center+dims+forwards_stride)%dims;
+    typename intd<D>::Type adjoint = (center+dims+adjoint_stride)%dims;
+
+    T forwards_value = in[co_to_idx<D>(forwards, dims)];
+    T adjoint_value = in[co_to_idx<D>(adjoint, dims)];
+    T center_value = in[co_to_idx<D>(center, dims)];
+
+    out[idx] += center_value+center_value-forwards_value-adjoint_value;
+  }
+
+  template< class T, unsigned int D> void
+  cuPartialDerivativeOperator<T,D>::compute_partial_derivative( typename int64d<D>::Type stride,
+                                                                cuNDArray<T> *in, 
+                                                                cuNDArray<T> *out, 
+                                                                bool accumulate )
+  { // Runs first_order_partial_derivative_kernel over 'in', adding (neighbour at 'stride') - (element) into 'out'.
+    if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){ // null pointers are rejected with the same message
+      throw std::runtime_error( "partialDerivativeOperator::compute_partial_derivative : array dimensions mismatch.");
+      // Nothing has been launched on the device when this is thrown.
+    }
+    // The kernel only ever accumulates (out[idx] += ...), so a non-accumulating call resets 'out' first.
+    // NOTE(review): clear() is defined outside this file; presumably it zero-fills the array - confirm.
+    if (!accumulate) clear(out);
+    // Only the first D dimensions of 'in' are used; anything beyond them is handled as a batch by the loop below.
+    typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+    dim3 dimBlock( dims.vec[0] ); // block size = first dimension. NOTE(review): exceeds the CUDA per-block thread limit for large dims[0] - confirm callers
+    dim3 dimGrid( 1, dims.vec[D-1] ); // grid.y = last dimension (for D==1 this repeats dims[0]; extra threads are masked in the kernel)
+    // grid.x = product of the dimensions strictly between the first and the last.
+    for( unsigned int d=1; d<D-1; d++ )
+      dimGrid.x *= dims.vec[d];
+  
+    size_t elements = in->get_number_of_elements();
+
+    // One launch per prod(dims)-sized sub-array; strides and dims are narrowed from 64-bit to int for the kernel.
+    for (size_t i = 0; i < elements/prod(dims); i++)
+    	first_order_partial_derivative_kernel<T,D><<< dimGrid, dimBlock >>> 
+        ( vector_td<int,D>(stride), vector_td<int,D>(dims),
+          in->get_data_ptr()+i*prod(dims), out->get_data_ptr()+i*prod(dims));
+    // Checked once, after all launches have been queued.
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  template<class T, unsigned int D> void
+  cuPartialDerivativeOperator<T,D>::compute_second_order_partial_derivative( typename int64d<D>::Type forwards_stride,
+                                                                             typename int64d<D>::Type adjoint_stride, 
+                                                                             cuNDArray<T> *in, cuNDArray<T> *out, 
+                                                                             bool accumulate )
+  {  // Runs second_order_partial_derivative_kernel over 'in', adding 2*element - both strided neighbours into 'out'.
+    if( !in || !out || in->get_number_of_elements() != out->get_number_of_elements() ){ // null pointers are rejected with the same message
+      throw std::runtime_error( "partialDerivativeOperator::compute_second_order_partial_derivative : array dimensions mismatch.");
+    }
+    // The kernel only accumulates, so reset 'out' first unless asked to accumulate (clear() is defined elsewhere).
+    if (!accumulate) clear(out);
+    // Only the first D dimensions of 'in' are used; anything beyond them is handled as a batch by the loop below.
+    typename int64d<D>::Type dims = vector_td<long long,D>( from_std_vector<size_t,D>( *(in->get_dimensions().get()) ));
+    dim3 dimBlock( dims.vec[0] ); // block size = first dimension. NOTE(review): exceeds the CUDA per-block thread limit for large dims[0] - confirm callers
+    dim3 dimGrid( 1, dims.vec[D-1] ); // grid.y = last dimension (for D==1 this repeats dims[0]; extra threads are masked in the kernel)
+    // grid.x = product of the dimensions strictly between the first and the last.
+    for( unsigned int d=1; d<D-1; d++ )
+      dimGrid.x *= dims.vec[d];
+  
+    size_t elements = in->get_number_of_elements();
+
+    // One launch per prod(dims)-sized sub-array; strides and dims are narrowed from 64-bit to int for the kernel.
+		for (size_t i = 0; i < elements/prod(dims); i++)
+			second_order_partial_derivative_kernel<T,D><<< dimGrid, dimBlock >>> 
+        ( vector_td<int,D>(forwards_stride), vector_td<int,D>(adjoint_stride), vector_td<int,D>(dims),
+          in->get_data_ptr()+i*prod(dims), out->get_data_ptr()+i*prod(dims) );
+    // Checked once, after all launches have been queued.
+    CHECK_FOR_CUDA_ERROR();
+  }
+
+  //
+  // Explicit instantiations: float, double and their complext variants, each for D = 1..4.
+  //
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float, 4>;
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<float_complext, 4>;
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double, 4>;
+
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 1>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 2>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 3>;
+  template class EXPORTGPUOPERATORS cuPartialDerivativeOperator<double_complext, 4>;
+}
diff --git a/toolboxes/operators/gpu/cuPartialDerivativeOperator.h b/toolboxes/operators/gpu/cuPartialDerivativeOperator.h
new file mode 100644
index 0000000..3e47e71
--- /dev/null
+++ b/toolboxes/operators/gpu/cuPartialDerivativeOperator.h
@@ -0,0 +1,38 @@
+/** \file cuPartialDerivativeOperator.h
+    \brief Partial derivative regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "partialDerivativeOperator.h"
+
+#include "gpuoperators_export.h"
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D> class EXPORTGPUOPERATORS cuPartialDerivativeOperator 
+    : public partialDerivativeOperator<D, cuNDArray<T> >
+  { // partialDerivativeOperator bound to cuNDArray<T>; declares the two GPU compute routines below.
+  public:
+    // Default constructor: passes 0 as the base-class constructor argument.
+    cuPartialDerivativeOperator() : 
+      partialDerivativeOperator< D, cuNDArray<T> >(0) {}
+    // Forwards 'dimension' unchanged to the base class (presumably the axis to differentiate along - confirm in partialDerivativeOperator.h).
+    cuPartialDerivativeOperator( size_t dimension ) : 
+      partialDerivativeOperator<D, cuNDArray<T> >( dimension ) {}
+    
+    virtual ~cuPartialDerivativeOperator() {}
+    // First-order difference with the element at offset 'stride'; defined in cuPartialDerivativeOperator.cu.
+    virtual void compute_partial_derivative( typename int64d<D>::Type stride, cuNDArray<T> *in,
+                                             cuNDArray<T> *out, bool accumulate );  
+    // Second-order difference using both strided neighbours; defined in cuPartialDerivativeOperator.cu.
+    virtual void compute_second_order_partial_derivative( typename int64d<D>::Type forwards_stride,
+                                                          typename int64d<D>::Type adjoint_stride, 
+                                                          cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate );  
+    // Delegates to linearOperator<cuNDArray<T> >::clone(this) (helper not visible in this header).
+    virtual boost::shared_ptr< linearOperator< cuNDArray<T> > > clone() {
+      return linearOperator< cuNDArray<T> >::clone(this);
+    }    
+  };
+}
diff --git a/toolboxes/operators/gpu/cuTv1dOperator.cu b/toolboxes/operators/gpu/cuTv1dOperator.cu
new file mode 100644
index 0000000..5a58ef6
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTv1dOperator.cu
@@ -0,0 +1,129 @@
+#include "cuTv1dOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+
+#include <iostream>
+
+using namespace Gadgetron;
+
+template<class REAL, class T, unsigned int D> static inline	__device__ REAL gradient(const T* in, const vector_td<int,D>& dims, vector_td<int,D>& co){
+	// Returns sqrt(norm(.)) of the forward difference along the last dimension (index D-1); coordinates wrap via the modulo.
+	T here = in[co_to_idx<D>((co+dims)%dims,dims)];
+
+	// Step one element forward along dimension D-1, sample, then put 'co' back as it was.
+	co[D-1]+=1;
+	T next = in[co_to_idx<D>((co+dims)%dims,dims)];
+	co[D-1]-=1;
+	REAL squared = norm(here-next);
+	return sqrt(squared);
+}
+
+
+template<class REAL, class T, unsigned int D> static __global__ void tvGradient_kernel(const T* in, T* out, const vector_td<int,D> dims,REAL limit,REAL weight){
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx >= prod(dims) ) return; // threads beyond the array do nothing
+
+	T center = in[idx];
+	T accum=T(0);
+	vector_td<int,D> co = idx_to_co<D>(idx, dims);
+
+	// Term from the gradient magnitude evaluated at this element;
+	// skipped when that magnitude does not exceed 'limit' (avoids dividing by a tiny value).
+	REAL here_grad = gradient<REAL,T,D>(in,dims,co);
+	if (here_grad > limit) {
+		accum += center/here_grad;
+
+		co[D-1]+=1;
+		accum -= in[co_to_idx<D>((co+dims)%dims,dims)]/here_grad;
+		co[D-1]-=1;
+	}
+
+	// Term from the gradient magnitude evaluated one step back along the last dimension;
+	// 'co' is moved there for the evaluation and restored afterwards.
+	co[D-1]-=1;
+	REAL back_grad = gradient<REAL,T,D>(in,dims,co);
+	if (back_grad > limit) {
+		accum +=(center-in[co_to_idx<D>((co+dims)%dims,dims)])/back_grad;
+	}
+	co[D-1]+=1;
+
+	// Always accumulates; the caller decides whether 'out' was cleared beforehand.
+	out[idx] += weight*accum;
+}
+
+
+template<class T, unsigned int D> void cuTv1DOperator<T,D>::gradient (cuNDArray<T> * in,cuNDArray<T> * out, bool accumulate){
+	// Launches tvGradient_kernel on every prod(dims)-sized sub-array of 'in', adding the weight_-scaled result into 'out'.
+	if (!accumulate) clear(out); // the kernel only accumulates, so start from a cleared 'out' unless told otherwise
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	int elements = in->get_number_of_elements();
+	// One thread per element. The block count is rounded UP: the previous truncating division left the last
+	// prod(dims) % max_blockdim elements unprocessed. Surplus threads are masked by the kernel's bounds check.
+	const int maxThreads = cudaDeviceManager::Instance()->max_blockdim();
+	int threadsPerBlock =std::min(prod(dims),maxThreads);
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGrid = std::max(1,(prod(dims)+maxThreads-1)/maxThreads);
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	for (int i =0; i < (elements/prod(dims)); i++){
+		tvGradient_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out->get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+	}
+	cudaDeviceSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+}
+
+template<class REAL, class T, unsigned int D> static __global__ void tvMagnitude_kernel(const T* in,T* out,const vector_td<int,D> dims,REAL limit,REAL weight)
+{ // Stores weight * gradient(...) for each element; 'limit' is accepted but not used by this kernel.
+	const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+	if( idx >= prod(dims) ) return; // threads beyond the array do nothing
+
+	vector_td<int,D> co = idx_to_co<D>(idx, dims);
+	REAL magnitude = gradient<REAL,T,D>(in,dims,co);
+	out[idx] = magnitude*weight;
+}
+
+template<class T, unsigned int D> typename realType<T>::Type cuTv1DOperator<T,D>::magnitude (cuNDArray<T> * in){
+	// Returns asum() over an array that tvMagnitude_kernel fills with weight_ * gradient magnitude per element.
+	cuNDArray<T> out(*in); // scratch array; starts as a copy of 'in' and is overwritten by the kernel
+	const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+	int elements = in->get_number_of_elements();
+	// One thread per element. The block count is rounded UP: the previous truncating division left the last
+	// prod(dims) % max_blockdim elements unprocessed. Surplus threads are masked by the kernel's bounds check.
+	const int maxThreads = cudaDeviceManager::Instance()->max_blockdim();
+	int threadsPerBlock =std::min(prod(dims),maxThreads);
+	dim3 dimBlock( threadsPerBlock);
+	int totalBlocksPerGrid = std::max(1,(prod(dims)+maxThreads-1)/maxThreads);
+	dim3 dimGrid(totalBlocksPerGrid);
+
+	for (int i =0; i < (elements/prod(dims)); i++){
+		tvMagnitude_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out.get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+	}
+	cudaDeviceSynchronize();
+	CHECK_FOR_CUDA_ERROR();
+	return asum(&out);
+}
+
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,1>; // explicit instantiations: four element types, each for D = 1..4
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float,4>;
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double,4>;
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<float_complext,4>;
+
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,1>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,2>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,3>;
+template class EXPORTGPUOPERATORS cuTv1DOperator<double_complext,4>;
diff --git a/toolboxes/operators/gpu/cuTv1dOperator.h b/toolboxes/operators/gpu/cuTv1dOperator.h
new file mode 100644
index 0000000..e0c0d68
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTv1dOperator.h
@@ -0,0 +1,38 @@
+/** \file cuTv1dOperator.h
+    \brief Total variation regularization operator, GPU based. Optimized 1D version.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "generalOperator.h"
+#include "complext.h"
+#include "gpuoperators_export.h"
+
+namespace Gadgetron{
+  
+  template<class T, unsigned int D> class EXPORTGPUOPERATORS cuTv1DOperator : public generalOperator< cuNDArray<T> >
+  {    // generalOperator on cuNDArray<T> whose kernels (cuTv1dOperator.cu) difference along the last dimension, index D-1.
+
+  protected:
+    typedef typename realType<T>::Type REAL;   // real-valued type associated with T via realType
+    
+  public:
+    // Starts with a threshold of 1e-8; change it with set_limit().
+    cuTv1DOperator() : generalOperator< cuNDArray<T> >(){
+      limit_ = REAL(1e-8);      
+    }
+    
+    virtual ~cuTv1DOperator(){};
+    // Sets the threshold: the gradient kernel only adds a term when the local gradient magnitude is greater than it.
+    void set_limit(REAL limit){
+      limit_ = limit;
+    }
+    // Both are defined in cuTv1dOperator.cu; gradient() clears 'out' first unless 'accumulate' is true.
+    virtual void gradient(cuNDArray<T>*,cuNDArray<T>*, bool accumulate=false);
+    virtual REAL magnitude(cuNDArray<T>*);
+
+  protected:
+    REAL limit_;    // threshold passed to the kernels as 'limit'
+  };  
+}
diff --git a/toolboxes/operators/gpu/cuTvOperator.cu b/toolboxes/operators/gpu/cuTvOperator.cu
new file mode 100644
index 0000000..ccc630f
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTvOperator.cu
@@ -0,0 +1,132 @@
+#include "cuTvOperator.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "vector_td_utilities.h"
+#include "complext.h"
+#include <iostream>
+#include "check_CUDA.h"
+#include "cudaDeviceManager.h"
+
+using namespace Gadgetron;
+
+template<class REAL, class T, unsigned int D> static inline  __device__ REAL gradient(const T* in, const vector_td<int,D>& dims, vector_td<int,D>& co)
+{ // Returns sqrt of the summed norm() of the forward differences along every dimension; coordinates wrap via the modulo.
+  T here = in[co_to_idx<D>((co+dims)%dims,dims)];
+  REAL sum = REAL(0);
+  for (int d = 0; d < D; d++){
+    co[d]+=1; // step one element forward along dimension d
+    T next = in[co_to_idx<D>((co+dims)%dims,dims)];
+    co[d]-=1; // put 'co' back before the next dimension
+    sum += norm(here-next);
+  }
+  return sqrt(sum);
+}
+
+
+template<class REAL, class T, unsigned int D> static __global__ void tvGradient_kernel(const T* in, T* out, const vector_td<int,D> dims,REAL limit,REAL weight)
+{
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+  if( idx >= prod(dims) ) return; // threads beyond the array do nothing
+
+  T center = in[idx];
+  T accum=T(0);
+  vector_td<int,D> co = idx_to_co<D>(idx, dims);
+
+  // Term from the gradient magnitude evaluated at this element (skipped unless it exceeds 'limit').
+  REAL here_grad = gradient<REAL,T,D>(in,dims,co);
+  if (here_grad > limit) {
+    accum += REAL(D)*center/here_grad;
+    for (int d = 0; d < D; d++){
+      co[d]+=1;
+      accum -= in[co_to_idx<D>((co+dims)%dims,dims)]/here_grad;
+      co[d]-=1;
+    }
+  }
+
+  // Terms from the gradient magnitudes evaluated one step back along each dimension in turn.
+  for (int d = 0; d < D; d++){
+    co[d]-=1;
+    REAL back_grad = gradient<REAL,T,D>(in,dims,co);
+    if (back_grad > limit) {
+      accum +=(center-in[co_to_idx<D>((co+dims)%dims,dims)])/back_grad;
+    }
+    co[d]+=1;
+  }
+  out[idx] += accum*weight;
+}
+
+
+
+
+
+template<class T, unsigned int D> void cuTvOperator<T,D>::gradient (cuNDArray<T> * in,cuNDArray<T> * out, bool accumulate)
+{ // Launches tvGradient_kernel on every prod(dims)-sized sub-array of 'in', adding the weight_-scaled result into 'out'.
+  if (!accumulate) clear(out); // the kernel only accumulates, so start from a cleared 'out' unless told otherwise
+  const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+  int elements = in->get_number_of_elements();
+
+  // One thread per element. The block count is rounded UP: the previous truncating division left the last
+  // prod(dims) % max_blockdim elements unprocessed. Surplus threads are masked by the kernel's bounds check.
+  const int maxThreads = cudaDeviceManager::Instance()->max_blockdim();
+  int threadsPerBlock =std::min(prod(dims),maxThreads);
+  dim3 dimBlock( threadsPerBlock);
+  int totalBlocksPerGrid = std::max(1,(prod(dims)+maxThreads-1)/maxThreads);
+  dim3 dimGrid(totalBlocksPerGrid);
+
+  for (int i =0; i < (elements/prod(dims)); i++){
+    tvGradient_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out->get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+  }
+  cudaDeviceSynchronize();
+  CHECK_FOR_CUDA_ERROR();
+}
+
+template<class REAL, class T, unsigned int D> static __global__ void tvMagnitude_kernel(const T* in,T* out,const vector_td<int,D> dims,REAL limit,REAL weight)
+{ // Stores weight * gradient(...) for each element; 'limit' is accepted but not used by this kernel.
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;
+  if( idx >= prod(dims) ) return; // threads beyond the array do nothing
+
+  vector_td<int,D> co = idx_to_co<D>(idx, dims);
+  REAL magnitude = gradient<REAL,T,D>(in,dims,co);
+  out[idx] = magnitude*weight;
+}
+
+
+template<class T, unsigned int D> typename realType<T>::Type cuTvOperator<T,D>::magnitude (cuNDArray<T> * in)
+{ // Returns asum() over an array that tvMagnitude_kernel fills with weight_ * gradient magnitude per element.
+  cuNDArray<T> out(in->get_dimensions()); // scratch array; every element must be written by the kernel before asum()
+  const typename intd<D>::Type dims = vector_td<int,D>( from_std_vector<size_t,D>(*(in->get_dimensions())));
+  int elements = in->get_number_of_elements();
+  // One thread per element. The block count is rounded UP: the previous truncating division skipped the last
+  // prod(dims) % max_blockdim elements, so asum() also read scratch values that were never written.
+  const int maxThreads = cudaDeviceManager::Instance()->max_blockdim();
+  int threadsPerBlock =std::min(prod(dims),maxThreads);
+  dim3 dimBlock( threadsPerBlock);
+  int totalBlocksPerGrid = std::max(1,(prod(dims)+maxThreads-1)/maxThreads);
+  dim3 dimGrid(totalBlocksPerGrid);
+  for (int i =0; i < (elements/prod(dims)); i++){
+    tvMagnitude_kernel<<<dimGrid,dimBlock>>>(in->get_data_ptr()+i*prod(dims),out.get_data_ptr()+i*prod(dims),dims,limit_,this->weight_);
+  }
+  cudaDeviceSynchronize();
+  CHECK_FOR_CUDA_ERROR();
+  return asum(&out);
+}
+
+template class EXPORTGPUOPERATORS cuTvOperator<float,1>; // explicit instantiations: four element types, each for D = 1..4
+template class EXPORTGPUOPERATORS cuTvOperator<float,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<float,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<float,4>;
+
+template class EXPORTGPUOPERATORS cuTvOperator<double,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<double,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<double,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<double,4>;
+
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<float_complext,4>;
+
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,1>;
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,2>;
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,3>;
+template class EXPORTGPUOPERATORS cuTvOperator<double_complext,4>;
diff --git a/toolboxes/operators/gpu/cuTvOperator.h b/toolboxes/operators/gpu/cuTvOperator.h
new file mode 100644
index 0000000..5fa70c0
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTvOperator.h
@@ -0,0 +1,42 @@
+/** \file cuTvOperator.h
+    \brief Total variation regularization operator, GPU based.
+*/
+
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "generalOperator.h"
+
+#include "complext.h"
+#include "gpuoperators_export.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUOPERATORS cuTvOperator 
+    : public generalOperator<cuNDArray<T> > 
+  { // generalOperator on cuNDArray<T> whose kernels (cuTvOperator.cu) difference along all D dimensions.
+
+  protected:
+    typedef typename realType<T>::Type REAL; // real-valued type associated with T via realType
+    
+  public:
+    // Starts with a threshold of 1e-8; change it with set_limit().
+    cuTvOperator() : generalOperator<cuNDArray<T> >(){
+      limit_ = REAL(1e-8);
+    }
+
+    virtual ~cuTvOperator(){};
+    // Sets the threshold: the gradient kernel only adds a term when the local gradient magnitude is greater than it.
+    void set_limit(REAL limit){
+      limit_ = limit;
+    }
+    // Both are defined in cuTvOperator.cu; gradient() clears 'out' first unless 'accumulate' is true.
+    virtual void gradient(cuNDArray<T>*,cuNDArray<T>*, bool accumulate=false);
+    virtual REAL magnitude(cuNDArray<T>*);
+
+  protected:
+    // (the access specifier is repeated below; the repetition has no effect)
+  protected:    
+    REAL limit_; // threshold passed to the kernels as 'limit'
+  };
+}
diff --git a/toolboxes/operators/gpu/cuTvPicsOperator.h b/toolboxes/operators/gpu/cuTvPicsOperator.h
new file mode 100644
index 0000000..f321082
--- /dev/null
+++ b/toolboxes/operators/gpu/cuTvPicsOperator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "cuTvOperator.h"
+#include "tvPicsOperator.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class cuTvPicsOperator // tvPicsOperator bound to cuNDArray<T>, cuTvOperator<T,D> and the real type of T
+    : public tvPicsOperator< cuNDArray<T>, cuTvOperator<T,D>, typename realType<T>::Type >
+  {
+  public: // adds no members; only fixes the template arguments of the base class
+    cuTvPicsOperator() : tvPicsOperator< cuNDArray<T>, cuTvOperator<T,D>, typename realType<T>::Type >() {}
+    virtual ~cuTvPicsOperator() {}
+  };    
+}
diff --git a/toolboxes/operators/gpu/cuUpsampleOperator.h b/toolboxes/operators/gpu/cuUpsampleOperator.h
new file mode 100644
index 0000000..06fd812
--- /dev/null
+++ b/toolboxes/operators/gpu/cuUpsampleOperator.h
@@ -0,0 +1,28 @@
+/** \file cuUpsampleOperator.h
+    \brief Instantiation of the upsampling operator on the gpu.
+    
+    The file cuUpsampleOperator.h is a convenience wrapper for the device independent upsampleOperator class.
+    The class cuUpsampleOperator instantiates the upsampleOperator for the cuNDArray
+    and the header furthermore includes additional necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_utils.h"
+#include "upsampleOperator.h"
+
+namespace Gadgetron{
+  
+  /** \class cuUpsampleOperator
+      \brief Instantiation of the upsample operator on the gpu.
+      
+      The class cuUpsampleOperator is a convenience wrapper for the device independent upsampleOperator.
+      cuUpsampleOperator instantiates the upsampleOperator for type cuNDArray<T>.
+  */
+  template <class T, unsigned int D> class cuUpsampleOperator : public upsampleOperator<cuNDArray<T>, D>
+  { // adds no members; only fixes the array type of upsampleOperator to cuNDArray<T>
+  public:    
+    cuUpsampleOperator() : upsampleOperator<cuNDArray<T>,D>() {} // default-constructs the base
+    virtual ~cuUpsampleOperator() {}
+  }; 
+}
diff --git a/toolboxes/operators/gpu/gpuoperators_export.h b/toolboxes/operators/gpu/gpuoperators_export.h
new file mode 100644
index 0000000..3cd3385
--- /dev/null
+++ b/toolboxes/operators/gpu/gpuoperators_export.h
@@ -0,0 +1,18 @@
+/** \file gpuoperators_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUOPERATORS_EXPORT_H_
+#define GPUOPERATORS_EXPORT_H_
+/* EXPORTGPUOPERATORS: dllexport while building the library on WIN32, dllimport for its users, empty elsewhere. */
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUOPERATORS__) || defined (gpusolvers_EXPORTS) /* NOTE(review): 'gpusolvers_EXPORTS' in the gpuoperators header looks copied from the solvers library - confirm the intended symbol */
+#define EXPORTGPUOPERATORS __declspec(dllexport)
+#else
+#define EXPORTGPUOPERATORS __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUOPERATORS
+#endif
+
+#endif /* GPUOPERATORS_EXPORT_H_ */
diff --git a/toolboxes/operators/gpu/hoCuEncodingOperatorContainer.h b/toolboxes/operators/gpu/hoCuEncodingOperatorContainer.h
new file mode 100644
index 0000000..0cf3890
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuEncodingOperatorContainer.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "hoCuNDArray.h"
+#include "hoCuNDArray_operators.h"
+#include "hoCuNDArray_elemwise.h"
+#include "hoCuNDArray_blas.h"
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron{
+  
+  template<class T> class hoCuEncodingOperatorContainer // encodingOperatorContainer bound to hoCuNDArray<T>
+    : public encodingOperatorContainer< hoCuNDArray<T> >
+  {
+  public: // adds no data members; only fixes the array type and supplies clone()
+    hoCuEncodingOperatorContainer() : encodingOperatorContainer< hoCuNDArray<T> >() {}
+    virtual ~hoCuEncodingOperatorContainer() {}
+    // Delegates to linearOperator<hoCuNDArray<T> >::clone(this) (helper not visible in this header).
+    virtual boost::shared_ptr< linearOperator< hoCuNDArray<T> > > clone(){
+      return linearOperator< hoCuNDArray<T> >::clone(this);
+    }  
+  }; 
+}
diff --git a/toolboxes/operators/gpu/hoCuOperator.h b/toolboxes/operators/gpu/hoCuOperator.h
new file mode 100644
index 0000000..d22025c
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuOperator.h
@@ -0,0 +1,55 @@
+#pragma once
+#include "hoCuNDArray_math.h"
+#include "linearOperator.h"
+
+#include <boost/shared_ptr.hpp>
+namespace Gadgetron{
+
+
+template<class T > class hoCuOperator : public linearOperator<hoCuNDArray<T> > {
+	// Adapter: exposes a linearOperator on hoNDArray<T> as a linearOperator on hoCuNDArray<T> by forwarding every call to 'op'.
+	public:
+		hoCuOperator(){}; // leaves 'op' empty; every forwarding member below dereferences 'op', so use the other constructor
+		hoCuOperator(boost::shared_ptr<linearOperator<hoNDArray<T> > > _op): op(_op) {};
+		virtual ~hoCuOperator(){};
+		// The four operator applications pass their arguments straight through to the wrapped operator.
+		virtual void mult_M(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->mult_M(in,out,accumulate);
+		}
+		virtual void mult_MH(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->mult_MH(in,out,accumulate);
+		}
+
+		virtual void gradient(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->gradient(in,out,accumulate);
+		}
+		virtual void mult_MH_M(hoCuNDArray<T>* in, hoCuNDArray<T>* out, bool accumulate=false){
+			op->mult_MH_M(in,out,accumulate);
+		}
+		// Delegates to linearOperator<hoCuNDArray<T> >::clone(this) (helper not visible in this header).
+		virtual boost::shared_ptr< linearOperator< hoCuNDArray<T> > > clone() {
+			return linearOperator< hoCuNDArray<T> >::clone(this);
+		}
+		virtual boost::shared_ptr< std::vector<unsigned int> > get_codomain_dimensions(){ // dimensions and weight are read from / written to the wrapped operator, not this object
+			return op->get_codomain_dimensions();
+		}
+		virtual boost::shared_ptr< std::vector<unsigned int> > get_domain_dimensions(){
+			return op->get_domain_dimensions();
+		}
+		 virtual void set_weight( typename realType<T>::Type weight ){ op->set_weight(weight); };
+		virtual typename realType<T>::Type get_weight(){ return op->get_weight(); };
+		virtual void set_codomain_dimensions( std::vector<unsigned int> *dims ){
+			op->set_codomain_dimensions(dims);
+		}
+		virtual void set_domain_dimensions( std::vector<unsigned int> *dims ){
+			op->set_domain_dimensions(dims);
+		}
+	private:
+	 boost::shared_ptr<linearOperator<hoNDArray<T> > > op; // the wrapped operator; shared ownership with the caller
+};
+
+template<class T> boost::shared_ptr<linearOperator<hoCuNDArray<T> > > to_hoCu(boost::shared_ptr<linearOperator<hoNDArray<T> > > _op){
+	return boost::shared_ptr<hoCuOperator<T> > (new hoCuOperator<T>(_op)); // wraps _op in a new hoCuOperator and returns it through the linearOperator<hoCuNDArray<T> > interface
+}
+
+}
diff --git a/toolboxes/operators/gpu/hoCuTvOperator.h b/toolboxes/operators/gpu/hoCuTvOperator.h
new file mode 100644
index 0000000..600ae09
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuTvOperator.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include "hoCuNDArray_math.h"
+#include "generalOperator.h"
+#include "hoCuNDArray.h"
+#include "cuTvOperator.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+template<class T, size_t D> class EXPORTGPUOPERATORS hoCuTvOperator :
+public generalOperator< hoCuNDArray<T> >
+{
+// Applies a cuTvOperator<T,D> to a hoCuNDArray one prod(dims)-sized sub-array at a time, via temporary cuNDArray copies.
+protected:
+	typedef typename realType<T>::Type REAL;
+
+public:
+	// Starts with a threshold of 1e-8 and hands the same value to the embedded cuTvOperator.
+	hoCuTvOperator() : generalOperator< hoCuNDArray<T> >(){
+		limit_ = REAL(1e-8);
+		cuTV.set_limit(limit_);
+	}
+
+	virtual ~hoCuTvOperator(){}
+	// Updates the threshold here and in the embedded cuTvOperator so the two stay in step.
+	void set_limit( REAL limit ){
+		limit_ = limit;
+		cuTV.set_limit(limit);
+	}
+	// Runs cuTV.gradient on each sub-array; throws std::runtime_error when the element counts of in/out differ.
+	virtual void gradient( hoCuNDArray<T> *in, hoCuNDArray<T> *out, bool accumulate=false )
+	{
+		if (in->get_number_of_elements() != out->get_number_of_elements()){ // pointers themselves are not checked for null
+			throw std::runtime_error("hoCuTvOperator: input/output array dimensions mismatch");
+		}
+		// Only the first D dimensions are used; anything beyond them is treated as a batch of sub-arrays.
+		const vector_td<size_t,D> dims = from_std_vector<size_t, D>(*(in->get_dimensions()));
+		int elements = in->get_number_of_elements();
+
+		for (int i=0; i < (elements/prod(dims)); i++){
+
+			std::vector<size_t> dimensions = to_std_vector(dims);
+			// Array objects created over the i-th slice of the caller's buffers (create() is given the existing pointer).
+			hoNDArray<T> tmp_in;
+			tmp_in.create(&dimensions,in->get_data_ptr()+i*prod(dims));
+
+			hoNDArray<T> tmp_out;
+			tmp_out.create(&dimensions,out->get_data_ptr()+i*prod(dims));
+			// cuNDArrays built from the host slices; cuOut starts from the current 'out' values, which matters when accumulating.
+			cuNDArray<T> cuIn(&tmp_in);
+			cuNDArray<T> cuOut(&tmp_out);
+
+			cuTV.gradient(&cuIn,&cuOut,accumulate);
+			boost::shared_ptr< hoNDArray<T> > tmp = cuOut.to_host();
+			tmp_out = *tmp; // presumably writes the result back into the caller's 'out' storage - confirm hoNDArray::operator=
+		}
+	}
+	// Returns the sum of cuTV.magnitude over all sub-arrays of 'in'.
+	virtual REAL magnitude( hoCuNDArray<T> *in)
+	{
+		const vector_td<size_t,D> dims = from_std_vector<size_t, D>(*(in->get_dimensions()));
+		int elements = in->get_number_of_elements();
+		REAL result = 0;
+		for (int i=0; i < (elements/prod(dims)); i++){
+			std::vector<size_t> dimensions = to_std_vector(dims);
+			hoNDArray<T> tmp_in;
+			tmp_in.create(&dimensions,in->get_data_ptr()+i*prod(dims));
+			cuNDArray<T> cuIn(&tmp_in);
+			result += cuTV.magnitude(&cuIn);
+		}
+		return result;
+	}
+	// Stores the weight in this object and in the embedded cuTvOperator, which is the one that applies it.
+	virtual void set_weight(REAL weight){
+		this->weight_ = weight;
+		cuTV.set_weight(weight);
+	}
+
+protected:
+	REAL limit_; // local copy of the threshold given to cuTV
+	cuTvOperator<T,D> cuTV; // GPU operator that does the actual work
+};
+}
diff --git a/toolboxes/operators/gpu/hoCuTvPicsOperator.h b/toolboxes/operators/gpu/hoCuTvPicsOperator.h
new file mode 100644
index 0000000..35ad216
--- /dev/null
+++ b/toolboxes/operators/gpu/hoCuTvPicsOperator.h
@@ -0,0 +1,16 @@
+#pragma once
+
+
+#include "hoCuTvOperator.h"
+#include "tvPicsOperator.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class hoCuTvPicsOperator // tvPicsOperator bound to hoCuNDArray<T>, hoCuTvOperator<T,D> and the real type of T
+    : public tvPicsOperator< hoCuNDArray<T>, hoCuTvOperator<T,D>, typename realType<T>::Type >
+  {
+  public: // adds no members; only fixes the template arguments of the base class
+    hoCuTvPicsOperator() : tvPicsOperator< hoCuNDArray<T>, hoCuTvOperator<T,D>, typename realType<T>::Type >() {}
+    virtual ~hoCuTvPicsOperator() {}
+  };    
+}
diff --git a/toolboxes/operators/identityOperator.h b/toolboxes/operators/identityOperator.h
new file mode 100644
index 0000000..75cb546
--- /dev/null
+++ b/toolboxes/operators/identityOperator.h
@@ -0,0 +1,55 @@
+/** \file identityOperator.h
+    \brief Device independent implementation of the identity operator.
+
+    The file identityOperator.h is a device independent implementation of the identity operator.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoIdentityOperator(/.h) for a cpu instantiated operator using the hoNDArray class
+    - the class(/file) cuIdentityOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class identityOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+    identityOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~identityOperator() {}
+    // Copies 'in' into 'out', or adds it to 'out' when 'accumulate' is true.
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !in || !out )
+	throw std::runtime_error("Error: identityOperator::mult_{M,MH,MHM}: illegal array pointer provided");
+
+      // Only the total element counts are compared, not the individual dimensions.
+      if( out->get_number_of_elements() != in->get_number_of_elements() )
+	throw std::runtime_error("Error: identityOperator: in/out dimensions mismatch");
+
+      if( !accumulate )
+	*out = *in;
+      else
+	*out += *in;
+    }
+
+    // Same operation as mult_M.
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      mult_M(in, out, accumulate);
+    }
+
+    // Same operation as mult_M.
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      mult_M(in, out, accumulate);
+    }
+
+    // Hands this object to the clone helper of linearOperator.
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+    {
+      return linearOperator<ARRAY_TYPE>::clone(this);
+    }
+  };
+}
diff --git a/toolboxes/operators/imageOperator.h b/toolboxes/operators/imageOperator.h
new file mode 100644
index 0000000..bb3accc
--- /dev/null
+++ b/toolboxes/operators/imageOperator.h
@@ -0,0 +1,99 @@
+/** \file imageOperator.h
+    \brief Base class for the image regularization operators.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "GadgetronTimer.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE_REAL, class ARRAY_TYPE_OPERATOR> class imageOperator : public linearOperator<ARRAY_TYPE_OPERATOR>
+  {
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+    typedef typename ARRAY_TYPE_OPERATOR::element_type ELEMENT_TYPE;
+
+  public:
+
+    imageOperator() : linearOperator<ARRAY_TYPE_OPERATOR>(), offset_(REAL(0)) {}
+    virtual ~imageOperator() {}
+
+    // Get regularization image (an empty pointer until image_ has been set, e.g. by compute())
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> get() { return image_; }
+
+    // Compute the regularization image from 'image'; the input array itself is not modified.
+    // When 'offset_estimation' is false the offset currently stored in offset_ is reused.
+    // Throws std::runtime_error if 'image' is a null pointer.
+    virtual void compute( ARRAY_TYPE_OPERATOR *image, bool offset_estimation = true )
+    {
+      if( image == 0x0 )
+        throw std::runtime_error("Error: imageOperator::compute: illegal array pointer provided");
+
+      // Make temporary copy of input
+      ARRAY_TYPE_OPERATOR tmp(*image);
+
+      // Normalize to an average energy of "one intensity unit per image element"
+      REAL sum = asum( &tmp );
+      REAL scale = ( (REAL) tmp.get_number_of_elements()/sum );
+      tmp *= scale;
+
+      image_ =  abs(&tmp);
+
+      if( offset_estimation )
+        offset_ = estimate_offset();
+
+      // Reciprocalize image (the offset is only added when it is positive)
+      if(offset_ > REAL(0)) *image_ += offset_;
+      reciprocal_inplace(image_.get());
+    }
+
+    // Apply the regularization image twice: out = image_ .* image_ .* in  (or out += ...)
+    virtual void mult_MH_M( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {
+      apply_image( in, out, accumulate, 2 );
+    }
+
+    // Apply the regularization image once: out = image_ .* in  (or out += ...)
+    virtual void mult_M( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {
+      apply_image( in, out, accumulate, 1 );
+    }
+
+    // The adjoint is applied exactly like the forward operator
+    virtual void mult_MH( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate = false )
+    {
+      mult_M(in,out,accumulate);
+    }
+
+  protected:
+    // Estimate offset to the regularization image
+    virtual REAL estimate_offset()=0;
+
+  private:
+    // Multiply 'in' elementwise by the regularization image 'times' times and store the
+    // result in (or add it to) 'out'. The accumulate path works on a stack temporary rather
+    // than new/delete, so the temporary is released even if an array operation throws.
+    void apply_image( ARRAY_TYPE_OPERATOR *in, ARRAY_TYPE_OPERATOR *out, bool accumulate, unsigned int times )
+    {
+      if( in == 0x0 || out == 0x0 )
+        throw std::runtime_error("Error: imageOperator: illegal array pointer provided");
+      if( image_.get() == 0x0 )
+        throw std::runtime_error("Error: imageOperator: regularization image not computed");
+
+      if( accumulate ){
+        ARRAY_TYPE_OPERATOR tmp(*in);
+        for( unsigned int i=0; i<times; i++ ) tmp *= *image_;
+        *out += tmp;
+      }
+      else{
+        *out = *in;
+        for( unsigned int i=0; i<times; i++ ) *out *= *image_;
+      }
+    }
+
+  protected:
+    boost::shared_ptr< ARRAY_TYPE_REAL > image_;
+    REAL offset_;
+  };
+}
diff --git a/toolboxes/operators/laplaceOperator.h b/toolboxes/operators/laplaceOperator.h
new file mode 100644
index 0000000..92f8b0c
--- /dev/null
+++ b/toolboxes/operators/laplaceOperator.h
@@ -0,0 +1,31 @@
+/** \file laplaceOperator.h
+    \brief Base class for the Laplacian operator implementations.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+  
+  template <unsigned int D, class ARRAY_TYPE> class laplaceOperator : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+
+    laplaceOperator( ) : linearOperator<ARRAY_TYPE>() { }
+    virtual ~laplaceOperator() {}
+
+    // Forward application: handed straight to the subclass supplied compute_laplace()
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    { compute_laplace( in, out, accumulate ); }
+
+    // Adjoint application: uses the very same computation as mult_M
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    { compute_laplace( in, out, accumulate ); }
+
+  protected:
+    // The actual Laplacian computation; to be supplied by the implementing subclasses.
+    // 'accumulate' is forwarded unchanged from mult_M / mult_MH.
+    virtual void compute_laplace( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate ) = 0;
+  };
+}
diff --git a/toolboxes/operators/linearOperator.h b/toolboxes/operators/linearOperator.h
new file mode 100644
index 0000000..64195cc
--- /dev/null
+++ b/toolboxes/operators/linearOperator.h
@@ -0,0 +1,108 @@
+/** \file linearOperator.h
+    \brief Base class for all linear operators.
+*/
+
+#pragma once
+
+#include "generalOperator.h"
+
+namespace Gadgetron{
+
+  /** \class linearOperator
+      \brief Base class for all linear Operators
+  */
+  template <class ARRAY_TYPE> class linearOperator : public generalOperator<ARRAY_TYPE>
+  {
+  public:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    linearOperator() : generalOperator<ARRAY_TYPE>() {}
+
+    linearOperator(std::vector<size_t> *dims) : generalOperator<ARRAY_TYPE>(dims) {
+      set_codomain_dimensions(dims);
+    }
+
+    linearOperator(std::vector<size_t> *dims, std::vector<size_t> *codims)
+      : generalOperator<ARRAY_TYPE>(dims) {
+      set_codomain_dimensions(codims);
+    }
+
+    virtual ~linearOperator() {}
+
+    /**
+     * The gradient of a linear operator corresponds to mult_MH_M, times the weight of the operator.
+     * @param[in] in Input array.
+     * @param[in,out] out Output Array.
+     * @param accumulate If true, adds result to out. If false, overwrites out.
+     * @throws std::runtime_error if 'in' or 'out' is a null pointer.
+     */
+    virtual void gradient(ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false)
+    {
+      if( in == 0x0 || out == 0x0 )
+        throw std::runtime_error("linearOperator::gradient(): Invalid input and/or output array");
+
+      if( !accumulate ){
+        mult_MH_M(in,out,false);
+        *out *= this->weight_;
+        return;
+      }
+      // Accumulating: compute into a stack temporary (released even if an operation throws)
+      ARRAY_TYPE tmp(out->get_dimensions());
+      mult_MH_M(in,&tmp,false);
+      tmp *= this->weight_;
+      *out += tmp;
+    }
+
+    // Applies mult_M to 'in' and returns sqrt(weight) * real(dot(Mx,Mx))
+    virtual REAL magnitude(ARRAY_TYPE* in){
+      ARRAY_TYPE tmp(&this->codomain_dims_);
+      this->mult_M(in,&tmp);
+      return std::sqrt(this->get_weight())*real(dot(&tmp,&tmp));
+    }
+
+    // Stores a copy of '*dims' as the codomain dimensions; throws std::runtime_error on null
+    virtual void set_codomain_dimensions( std::vector<size_t> *dims )
+    {
+      if( dims == 0x0 )
+        throw std::runtime_error("linearOperator::set_codomain_dimensions: illegal dimensions array provided");
+      codomain_dims_ = *dims;
+    }
+
+    // Returns a newly allocated copy of the codomain dimensions
+    virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions()
+    {
+      return boost::shared_ptr< std::vector<size_t> >( new std::vector<size_t>(codomain_dims_) );
+    }
+
+    // Forward (M) and adjoint (M^H) application; to be supplied by subclasses
+    virtual void mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false) = 0;
+    virtual void mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false) = 0;
+
+    // Default M^H M: mult_M into a temporary of codomain size, followed by mult_MH
+    virtual void mult_MH_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+    {
+      if( codomain_dims_.size() == 0 )
+        throw std::runtime_error("Error: linearOperator::mult_MH_M : codomain dimensions not set");
+
+      ARRAY_TYPE tmp;
+      tmp.create(&codomain_dims_);
+      mult_M( in, &tmp, false );
+      mult_MH( &tmp, out, accumulate );
+    }
+
+    virtual boost::shared_ptr< linearOperator<ARRAY_TYPE > > clone() = 0;
+
+  protected:
+    // The template below is useful for implementing the pure virtual 'clone' method
+    template <class T> static
+      boost::shared_ptr<T> clone( T *orig )
+      {
+        boost::shared_ptr<T> copy( new T() );
+        *copy = *orig;
+        return copy;
+      }
+
+    std::vector<size_t> codomain_dims_;
+  };
+}
diff --git a/toolboxes/operators/multiplicationOperatorContainer.h b/toolboxes/operators/multiplicationOperatorContainer.h
new file mode 100644
index 0000000..1e9951b
--- /dev/null
+++ b/toolboxes/operators/multiplicationOperatorContainer.h
@@ -0,0 +1,200 @@
+/** \file multiplicationOperatorContainer.h
+    \brief Operator used to chain together (concatenate) a series of operators by multiplication.
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include <iostream>
+#include <vector>
+
+template <class REAL, class ARRAY_TYPE> class multiplicationOperatorContainer
+  : public linearOperator<REAL, ARRAY_TYPE>
+{
+public:
+  multiplicationOperatorContainer() : linearOperator<REAL,ARRAY_TYPE>() {}
+  virtual ~multiplicationOperatorContainer(){}
+  // NOTE(review): this container is written against linearOperator<REAL, ARRAY_TYPE> with
+  // bool/int status returns and sits outside namespace Gadgetron, whereas linearOperator.h
+  // declares Gadgetron::linearOperator<ARRAY_TYPE> with void mult_M/mult_MH - confirm it is still used.
+  // Set/get domain and codomain dimensions. The public setters ignore their argument and return false.
+  virtual bool set_domain_dimensions( std::vector<unsigned int>* )
+  { 
+    std::cerr << "Warning: multiplicationOperatorContainer::set_domain_dimensions : dimensions ignored, using dimensions of the individual operators instead" << std::endl;
+    return false;
+  }  
+
+  virtual bool set_codomain_dimensions( std::vector<unsigned int> *dims ) 
+  { 
+    std::cerr << "Warning: multiplicationOperatorContainer::set_codomain_dimensions : dimensions ignored, using dimensions of the individual operators instead" << std::endl;
+    return false;
+  }
+  // The container's domain is the domain of the first operator added (empty pointer if none)
+  virtual boost::shared_ptr< std::vector<unsigned int> > get_domain_dimensions() 
+  { 
+    if( operators_.size() == 0 )
+      return boost::shared_ptr< std::vector<unsigned int> >();
+    else
+      return operators_[0]->get_domain_dimensions();
+  }
+  // The container's codomain is the codomain of the last operator added (empty pointer if none)
+  virtual boost::shared_ptr< std::vector<unsigned int> > get_codomain_dimensions() 
+  { 
+    if( operators_.size() == 0 )
+      return boost::shared_ptr< std::vector<unsigned int> >();
+    else
+      return operators_[operators_.size()-1]->get_codomain_dimensions();
+  }
+  // Stores 'weight' multiplied by the weights of all operators currently in the container
+  virtual void set_weight( REAL weight ){ 
+    REAL op_weight = REAL(1);
+    for( int i=0; i<operators_.size(); i++ )
+      op_weight *= operators_[i]->get_weight();
+    this->weight_ = weight*op_weight;
+  }
+
+  // Add operator to the container
+  // Returns true on success; on any failure prints to std::cerr and returns false.
+  bool add_operator( boost::shared_ptr< linearOperator<REAL, ARRAY_TYPE> > op )
+  {
+    if( op.get() == 0x0 ){
+      std::cerr << "Error: multiplicationOperatorContainer::add_operator : illegal operator" << std::endl;
+      return false;
+    } 
+
+    // All operators need the domain and codomain dimensions set
+    //
+    if( op->get_domain_dimensions()->size() == 0 ){
+      std::cerr << "Error: multiplicationOperatorContainer::add_operator : domain dimensions not set on operator" << std::endl;
+      return false;
+    }
+    if( op->get_codomain_dimensions()->size() == 0 ){
+      std::cerr << "Error: multiplicationOperatorContainer::add_operator : codomain dimensions not set on operator" << std::endl;
+      return false;
+    }
+    // The first operator added defines the domain dimensions stored on the container
+    if( operators_.size() == 0 && !_set_domain_dimensions( op->get_domain_dimensions().get() ) ){
+      std::cerr << "Error: multiplicationOperatorContainer::add_operator : failed to set domain dimensions on container" << std::endl;
+      return false;
+    }
+    // Every operator added redefines the codomain dimensions stored on the container
+    if( !_set_codomain_dimensions( op->get_codomain_dimensions().get() ) ){
+      std::cerr << "Error: multiplicationOperatorContainer::add_operator : failed to set codomain dimensions on container" << std::endl;
+      return false;
+    }
+    
+    operators_.push_back( op );
+    this->weight_ *= op->get_weight();
+
+    return true;
+  }
+  // Applies operators_[0], operators_[1], ... in that order; returns 0 on success and -1 on failure
+  virtual int mult_M( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+  {
+    if( operators_.size() == 0 ){
+      std::cerr << "Error: multiplicationOperatorContainer::mult_M : no operators added" << std::endl;
+      return -1;
+    }
+    // 'ping' and 'pong' hold the intermediate results handed from one operator to the next
+    ARRAY_TYPE *tmp_in = in, *tmp_out = 0x0;
+    ARRAY_TYPE ping, pong;
+
+    if( operators_.size() > 1 ){
+      if( !ping.create( operators_[0]->get_codomain_dimensions().get() )){
+	std::cerr << "Error: multiplicationOperatorContainer::mult_M : failed to create intermediate array (1)" << std::endl;
+	return -1;
+      }
+      tmp_out = &ping;
+    }
+    else{
+      tmp_out = out;
+    }
+    
+    // Loop over operators
+    // (only the last operator writes to 'out' and is passed 'accumulate')
+    for( int i=0; i < operators_.size(); i++ ){
+      
+      if( operators_[i]->mult_M( tmp_in, tmp_out, (i==operators_.size()-1) ? accumulate : false ) < 0 ){
+	std::cerr << "Error: multiplicationOperatorContainer : mult_M failed on sub-operator" << std::endl;
+	return -1;
+      }
+      // The buffer just consumed as input is the next output candidate ('pong' after the first operator)
+      ARRAY_TYPE *tmp_tmp_out = (i==0) ? &pong : tmp_in;
+      tmp_in = tmp_out;
+
+      if( operators_.size() > 2 && i < operators_.size()-2 ){
+	if( !tmp_tmp_out->create( operators_[i+1]->get_codomain_dimensions().get() )){
+	  std::cerr << "Error: multiplicationOperatorContainer::mult_M : failed to create intermediate array (2)" << std::endl;
+	  return -1;
+	}
+	tmp_out = tmp_tmp_out;
+      }
+      else if( i == operators_.size()-2 ){
+	tmp_out = out;
+      }      
+    }
+    return 0;
+  }
+  // Applies the adjoints in reverse order (last operator first); returns 0 on success and -1 on failure
+  virtual int mult_MH( ARRAY_TYPE* in, ARRAY_TYPE* out, bool accumulate = false )
+  {
+    if( operators_.size() == 0 ){
+      std::cerr << "Error: multiplicationOperatorContainer::mult_MH : no operators added" << std::endl;
+      return -1;
+    }
+    
+    ARRAY_TYPE *tmp_in = in, *tmp_out = 0x0;
+    ARRAY_TYPE ping, pong;
+    
+    if( operators_.size() > 1 ){
+      if( !ping.create( operators_[operators_.size()-1]->get_domain_dimensions().get() )){
+	std::cerr << "Error: multiplicationOperatorContainer::mult_MH : failed to create intermediate array (1)" << std::endl;
+	return -1;
+      }
+      tmp_out = &ping;
+    }
+    else{
+      tmp_out = out;
+    }
+    
+    // Loop over operators
+    // (only operators_[0], applied last, writes to 'out' and is passed 'accumulate')
+    for( int i=operators_.size()-1; i>=0; i-- ){
+      
+      if( operators_[i]->mult_MH( tmp_in, tmp_out, (i==0) ? accumulate : false ) < 0 ){
+	std::cerr << "Error: multiplicationOperatorContainer : mult_MH failed on sub-operator" << std::endl;
+	return -1;
+      }
+      
+      ARRAY_TYPE *tmp_tmp_out = (i==operators_.size()-1) ? &pong : tmp_in;
+      tmp_in = tmp_out;
+      
+      if( i > 1 ){
+	if( !tmp_tmp_out->create( operators_[i-1]->get_domain_dimensions().get() )){
+	  std::cerr << "Error: multiplicationOperatorContainer::mult_MH : failed to create intermediate array (2)" << std::endl;
+	  return -1;
+	}
+	tmp_out = tmp_tmp_out;
+      }
+      else if( i == 1 ){
+	tmp_out = out;
+      }      
+    }
+    return 0;
+  }
+
+protected:
+  // Forward to the base class setters, bypassing the public overrides above that ignore their argument
+  virtual bool _set_domain_dimensions( std::vector<unsigned int> *dims )
+  { 
+    return linearOperator<REAL, ARRAY_TYPE>::set_domain_dimensions( dims );
+  }  
+
+  virtual bool _set_codomain_dimensions( std::vector<unsigned int> *dims )
+  { 
+    return linearOperator<REAL, ARRAY_TYPE>::set_codomain_dimensions( dims );
+  }  
+  
+protected:
+  std::vector< boost::shared_ptr< linearOperator<REAL, ARRAY_TYPE> > > operators_;
+};
diff --git a/toolboxes/operators/partialDerivativeOperator.h b/toolboxes/operators/partialDerivativeOperator.h
new file mode 100644
index 0000000..bfae587
--- /dev/null
+++ b/toolboxes/operators/partialDerivativeOperator.h
@@ -0,0 +1,71 @@
+/** \file partialDerivativeOperator.h
+    \brief Base class for the partialDerivative operators.
+
+    The file partialDerivativeOperator.h is a device independent partial implementation 
+    of a partial derivative operator.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoPartialDerivativeOperator(/.h) for a cpu instantiated operator using the hoNDArray class
+    - the class(/file) cuPartialDerivativeOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+
+namespace Gadgetron{
+  
+  template < unsigned int D, class ARRAY_TYPE> class partialDerivativeOperator 
+    : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+    // Construct an operator that differentiates along 'dimension' (valid range 0..D-1).
+    // compute_stride() runs during construction, so a subclass override is not used here.
+    partialDerivativeOperator( size_t dimension ) : linearOperator<ARRAY_TYPE>()
+    {
+      compute_stride(dimension);
+    }
+
+    virtual ~partialDerivativeOperator() {}
+
+    // Forward application: first order derivative using the forwards stride
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    { compute_partial_derivative( forwards_stride_, in, out, accumulate ); }
+
+    // Adjoint application: first order derivative using the adjoint (negated) stride
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    { compute_partial_derivative( adjoint_stride_, in, out, accumulate ); }
+
+    // Normal-operator application: second order derivative computed in one step
+    virtual void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    { compute_second_order_partial_derivative( forwards_stride_, adjoint_stride_, in, out, accumulate ); }
+
+    // First order derivative for the given stride; to be supplied by subclasses
+    virtual void compute_partial_derivative
+    ( typename int64d<D>::Type stride,
+      ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate ) = 0;
+
+    // Second order derivative for the given stride pair; to be supplied by subclasses
+    virtual void compute_second_order_partial_derivative
+    ( typename int64d<D>::Type forwards_stride, typename int64d<D>::Type adjoint_stride,
+      ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate ) = 0;
+
+  protected:
+
+    // Fill the stride vectors: +1 (forwards) / -1 (adjoint) at '_dimension' and 0 elsewhere
+    virtual void compute_stride( size_t _dimension )
+    {
+      if( _dimension > D-1 )
+        throw std::runtime_error("Error: partialDerivativeOperator: dimension out of range");
+
+      for( unsigned int axis=0; axis<D; axis++ ){
+        const bool selected = ( axis == _dimension );
+        forwards_stride_.vec[axis] = selected ? 1 : 0;
+        adjoint_stride_.vec[axis] = selected ? -1 : 0;
+      }
+    }
+
+  private:
+    typename int64d<D>::Type forwards_stride_;
+    typename int64d<D>::Type adjoint_stride_;
+  };
+}
diff --git a/toolboxes/operators/tvPicsOperator.h b/toolboxes/operators/tvPicsOperator.h
new file mode 100644
index 0000000..5d490d6
--- /dev/null
+++ b/toolboxes/operators/tvPicsOperator.h
@@ -0,0 +1,44 @@
+#pragma once
+
+namespace Gadgetron{
+
+  template<class ARRAY_TYPE, class TV_OPERATOR, class REAL> class tvPicsOperator 
+    : public generalOperator<ARRAY_TYPE>
+  {
+  public:
+
+    tvPicsOperator() : generalOperator<ARRAY_TYPE>() {}
+    virtual ~tvPicsOperator() {}
+
+    // Set the prior that is subtracted from the input before the wrapped operator is applied.
+    // It has to be set before gradient() or magnitude() is called (prior_ is dereferenced there).
+    void set_prior(boost::shared_ptr<ARRAY_TYPE> prior){ prior_ = prior; }
+
+    // Gradient of the wrapped operator evaluated at (in - prior)
+    virtual void gradient(ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate){
+      ARRAY_TYPE residual(*in);
+      residual -= *prior_;
+      op_.gradient(&residual, out, accumulate);
+    }
+
+    // Magnitude of the wrapped operator evaluated at (x - prior)
+    virtual REAL magnitude(ARRAY_TYPE *x){
+      ARRAY_TYPE residual(*x);
+      residual -= *prior_;
+      return op_.magnitude(&residual);
+    }
+
+    // The limit is handed straight to the wrapped operator
+    void set_limit(REAL limit){ op_.set_limit(limit); }
+
+    // The weight is stored in, and read back from, the wrapped operator
+    virtual void set_weight(REAL weight){ op_.set_weight(weight); }
+    virtual REAL get_weight(){ return op_.get_weight(); }
+
+  protected:
+    // The wrapped operator
+    TV_OPERATOR op_;
+    // The prior set through set_prior()
+    boost::shared_ptr<ARRAY_TYPE> prior_;
+  };
+}
diff --git a/toolboxes/operators/upsampleOperator.h b/toolboxes/operators/upsampleOperator.h
new file mode 100644
index 0000000..6576891
--- /dev/null
+++ b/toolboxes/operators/upsampleOperator.h
@@ -0,0 +1,51 @@
+/** \file upsampleOperator.h
+    \brief Base class for the upsampling operators.
+
+    For instantiation we refer to
+    - the class(/file) cuUpsamplingOperator(/.h) for a gpu instantiated operator using the cuNDArray class
+*/
+
+#pragma once
+
+#include "linearOperator.h"
+#include "vector_td.h"
+
+namespace Gadgetron{
+  
+  template <class ARRAY_TYPE, unsigned int D> class upsampleOperator
+    : public linearOperator<ARRAY_TYPE>
+  {
+  public:
+
+    typedef typename ARRAY_TYPE::element_type T;
+
+    upsampleOperator() : linearOperator<ARRAY_TYPE>() {}
+    virtual ~upsampleOperator() {}
+
+    // Forward application: out = upsample(in), or out += upsample(in) when accumulating
+    virtual void mult_M( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !accumulate ){
+        upsample<T,D>(in,out);
+        return;
+      }
+      boost::shared_ptr<ARRAY_TYPE> upsampled = upsample<T,D>(in);
+      *out += *upsampled;
+    }
+
+    // Adjoint application: out = downsample(in), or out += downsample(in) when accumulating
+    virtual void mult_MH( ARRAY_TYPE *in, ARRAY_TYPE *out, bool accumulate = false )
+    {
+      if( !accumulate ){
+        downsample<T,D>(in,out);
+        return;
+      }
+      boost::shared_ptr<ARRAY_TYPE> downsampled = downsample<T,D>(in);
+      *out += *downsampled;
+    }
+
+    // Returns a copy of this operator created through the base class clone helper
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE > > clone()
+    { return linearOperator<ARRAY_TYPE>::clone(this); }
+  };
+}
diff --git a/toolboxes/registration/CMakeLists.txt b/toolboxes/registration/CMakeLists.txt
new file mode 100644
index 0000000..b19925d
--- /dev/null
+++ b/toolboxes/registration/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(optical_flow)
diff --git a/toolboxes/registration/optical_flow/CMakeLists.txt b/toolboxes/registration/optical_flow/CMakeLists.txt
new file mode 100644
index 0000000..6e02a83
--- /dev/null
+++ b/toolboxes/registration/optical_flow/CMakeLists.txt
@@ -0,0 +1,29 @@
+include_directories(   
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+)
+# Install the headers that live directly in this directory
+install(FILES
+  registrationSolver.h
+  multiresRegistrationSolver.h
+  opticalFlowSolver.h 
+  resampleOperator.h
+  opticalFlowOperator.h 
+  DESTINATION include)
+# The cpu subdirectory is only added when Armadillo was found in a version greater than 3.819
+if(ARMADILLO_FOUND)
+  if(${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+    message("Compiling cpu based optical flow registration toolbox.")
+    add_subdirectory(cpu)
+  else (${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+    message("Armadillo (at least version 3.820) not found. Not compiling cpu-based optical flow registration toolbox. ")  
+  endif(${ARMADILLO_VERSION_STRING} VERSION_GREATER "3.819" )
+endif(ARMADILLO_FOUND)
+# The gpu subdirectory is only added when CUDA was found
+if (CUDA_FOUND)
+  message("Compiling gpu based optical flow registration toolbox.")
+  add_subdirectory(gpu)
+endif (CUDA_FOUND)
diff --git a/toolboxes/registration/optical_flow/cpu/CMakeLists.txt b/toolboxes/registration/optical_flow/cpu/CMakeLists.txt
new file mode 100644
index 0000000..219fa2b
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/CMakeLists.txt
@@ -0,0 +1,35 @@
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUREG__)
+endif (WIN32)
+# Additional header search paths for the cpureg library
+include_directories(   
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu/arma_math
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${ARMADILLO_INCLUDE_DIR}
+)
+# The cpureg library; its library type is taken from ${LIBTYPE}
+add_library(cpureg ${LIBTYPE} 
+  hoOpticalFlowSolver.cpp
+  hoHSOpticalFlowSolver.cpp
+  hoCKOpticalFlowSolver.cpp
+  hoLinearResampleOperator.cpp
+  hoRegistration_utils.cpp
+  )
+# Libraries cpureg is linked against
+target_link_libraries(cpureg 
+  cpucore
+  cpucore_math 
+  ${ARMADILLO_LIBRARIES}
+  )
+
+install(TARGETS cpureg DESTINATION lib)
+# Headers installed alongside the library
+install(FILES
+  hoOpticalFlowSolver.h
+  hoHSOpticalFlowSolver.h
+  hoCKOpticalFlowSolver.h
+  hoRegistration_utils.h
+  cpureg_export.h
+  DESTINATION include)
+
+install(FILES hoLinearResampleOperator.h DESTINATION include)
diff --git a/toolboxes/registration/optical_flow/cpu/cpureg_export.h b/toolboxes/registration/optical_flow/cpu/cpureg_export.h
new file mode 100644
index 0000000..3dfd6fd
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/cpureg_export.h
@@ -0,0 +1,18 @@
+#ifndef _CPUREG_EXPORT_H_
+#define _CPUREG_EXPORT_H_
+/* EXPORTCPUREG: dllexport/dllimport on non-static WIN32 builds, empty in every other configuration */
+#if defined (WIN32)
+    #ifdef BUILD_TOOLBOX_STATIC
+        #define EXPORTCPUREG
+    #else
+        #if defined (__BUILD_GADGETRON_CPUREG__) || defined (cpureg_EXPORTS)
+            #define EXPORTCPUREG __declspec(dllexport)
+        #else
+            #define EXPORTCPUREG __declspec(dllimport)
+        #endif
+    #endif
+#else
+#define EXPORTCPUREG
+#endif
+
+#endif /* _CPUREG_EXPORT_H_ */
diff --git a/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.cpp b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.cpp
new file mode 100644
index 0000000..d9d60cb
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.cpp
@@ -0,0 +1,297 @@
+#include "hoCKOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+  // Helpers
+  //
+  
+  template<unsigned int D> inline bool
+  is_border_pixel_for_stride( typename int64d<D>::Type stride, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    // True when stepping from coordinate 'co' by 'stride' would leave the extent
+    // 'dims' in at least one dimension:
+    //  - a stride component of -1 leaves it from coordinate 0
+    //  - a stride component of +1 leaves it from coordinate dims-1
+    // Stride components other than -1/+1 are never reported.
+    for( size_t d=0; d<D; d++ ){
+      const bool leaves_low  = ( stride.vec[d] == -1 && co.vec[d] == 0 );
+      const bool leaves_high = ( stride.vec[d] ==  1 && co.vec[d] == (dims.vec[d]-1) );
+
+      if( leaves_low || leaves_high )
+        return true;
+    }
+    return false;
+  }
+
+  template<size_t base, size_t exponent> struct Pow
+  {
+    // Compile time integer power base^exponent (exponent >= 1), one factor per recursion level
+    enum { Value = Pow<base,exponent-1>::Value*base };
+  };
+
+  // End of the recursion: base^1 == base
+  template <size_t base> struct Pow<base,1>
+  { enum { Value = base }; };
+  
+  //
+  // Implementation
+  //
+
+  // Jacobi iteration for the displacement field belonging to the given gradient image.
+  // Sweeps are repeated until no spatial displacement component changes by more than
+  // limit_ in a sweep, or the iteration limit is exceeded. '_stencil_image' may be null;
+  // elements with a positive stencil value are skipped. Throws std::runtime_error on bad input.
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  hoCKOpticalFlowSolver<T,D>::core_solver( hoNDArray<T> *_gradient_image, hoNDArray<T> *_stencil_image )
+  {
+    // Sanity checks
+    //
+    if( !_gradient_image ){
+      throw std::runtime_error("hoCKOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+
+    if( _gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("hoCKOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+
+    // The dimensions of the displacement field should match the gradient field
+    //
+    boost::shared_ptr< std::vector<size_t> > disp_dims = _gradient_image->get_dimensions();
+    boost::shared_ptr< hoNDArray<T> > displacements_ping( new hoNDArray<T>(disp_dims.get()) );
+    boost::shared_ptr< hoNDArray<T> > displacements_pong( new hoNDArray<T>(disp_dims.get()) );
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+
+    // We use "shared memory" to hold the averaged displacements
+    boost::shared_ptr< hoNDArray<T> > _shared_mem(new hoNDArray<T>(disp_dims.get()));
+    T *shared_mem = _shared_mem->get_data_ptr();
+    clear( _shared_mem.get());
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *disp_dims );
+    // Number of elements per batch (constant over all sweeps, so computed once up front)
+    const size_t num_elements_per_batch = prod(matrix_size);
+    // Number of batches: product of the dimensions between the first D and the last one
+    size_t num_batches = 1;
+    for( size_t d=D; d<_gradient_image->get_number_of_dimensions()-1; d++ ){
+      num_batches *= _gradient_image->get_size(d);
+    }
+
+    // Number of elements per dim
+    const size_t num_elements_per_dim = num_elements_per_batch*num_batches;
+
+    // Get ready
+    //
+    size_t iteration_no = 0;
+    hoNDArray<T> *ping = displacements_ping.get();
+    hoNDArray<T> *pong = displacements_pong.get();
+
+    if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      std::cout << std::endl;
+    }
+
+    //
+    // Main Jacobi loop
+    //
+
+    while(true){
+
+      if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+        std::cout << "."; std::cout.flush();
+      }
+
+      // Continuation flag used for early Jacobi termination
+      size_t continue_flag = 0;
+
+      T *in_disp = ping->get_data_ptr();
+      T *out_disp = pong->get_data_ptr();
+      T *gradient_image = _gradient_image->get_data_ptr();
+      T *stencil_image = (_stencil_image) ? _stencil_image->get_data_ptr() : 0x0;
+
+      //
+      // Find the average velocities (shared memory)
+      //
+
+      for( size_t dim = 0; dim < D+1; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long idx = 0; idx < (long long)num_elements_per_dim; idx++ ){
+
+          // Index to the shared memory
+          const size_t shared_idx = dim*num_elements_per_dim+idx;
+
+          // Batch idx (second slowest varying dimension)
+          const size_t batch_idx = idx/num_elements_per_batch;
+
+          // Local index to the image (or batch in our terminology)
+          const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+
+          if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+            continue;
+
+          // Local co to the image
+          const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+          const typename int64d<D>::Type zeros(0);
+          const typename int64d<D>::Type ones(1);
+          const typename int64d<D>::Type threes(3);
+
+          const int num_neighbors = Pow<3,D>::Value;
+          T num_contribs = T(0);
+
+          shared_mem[shared_idx] = T(0);
+
+          // Compute average of neighbors
+          // (out-of-bounds neighbors and the element itself both contribute the element's own value)
+
+          for( long long i=0; i<num_neighbors; i++ ){
+
+            // Find the stride of the neighbor {-1, 0, 1}^D
+            const typename int64d<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+
+            size_t neighbor_idx;
+
+            const size_t base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+
+            // Verify that the neighbor is not out of bounds (and not the thread itself)
+            if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){
+              neighbor_idx = (size_t) co_to_idx<D>( vector_td<long long,D>(co)+stride, vector_td<long long,D>(matrix_size)) + base_offset;
+            }
+            else{
+              neighbor_idx = idx_in_batch + base_offset;
+            }
+
+            shared_mem[shared_idx] += in_disp[neighbor_idx];
+            num_contribs += T(1);
+          }
+
+          // Normalize
+          shared_mem[shared_idx] /= num_contribs;
+        }
+      }
+
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+
+      const T disp_thresh_sqr = this->limit_*this->limit_;
+
+      for( size_t dim = 0; dim < D+1; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for reduction(|:continue_flag)
+#endif
+        for( long long idx = 0; idx < (long long)num_elements_per_dim; idx++ ){
+          // Index to the shared memory
+          const size_t shared_idx = dim*num_elements_per_dim+idx;
+
+          // Batch idx (second slowest varying dimension)
+          const size_t batch_idx = idx/num_elements_per_batch;
+
+          // Local index to the image (or batch in our terminology)
+          const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+
+          if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+            continue;
+
+          T phi = T(0);
+          T norm = T(0);
+
+          typename reald<T,D>::Type derivatives;
+
+          // Contributions from the spatial dimensions
+          //
+
+          for( size_t d=0; d<D; d++ ){
+            derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+            const size_t shared_idx_d = d*num_elements_per_dim+idx;
+            phi += (shared_mem[shared_idx_d]*derivatives.vec[d]);
+            norm += (derivatives.vec[d]*derivatives.vec[d]);
+          }
+
+          // Contributions from the temporal dimension
+          //
+
+          phi += gradient_image[D*num_elements_per_dim+idx];
+
+          // Contribution from the intensity attenuation estimation
+          //
+
+          phi -= shared_mem[D*num_elements_per_dim+idx];
+
+          // Normalize
+          //
+
+          phi /= ((alpha_/beta_)*(alpha_/beta_)+alpha_*alpha_+norm);
+
+          // Form result displacement
+          //
+
+          T result;
+
+          if( dim<D )
+            result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+          else
+            result = shared_mem[D*num_elements_per_dim+idx]+(alpha_/beta_)*(alpha_/beta_)*phi;
+
+          // Clear the "termination" flag if the displacement field has changed above the threshold
+          // (the flag is an OpenMP reduction variable, so concurrent updates do not race)
+
+          T delta = result-in_disp[dim*num_elements_per_dim+idx];
+          if( dim < D && delta*delta > disp_thresh_sqr )
+            continue_flag = 1;
+
+          // Output result
+          //
+
+          out_disp[dim*num_elements_per_dim+idx] = result;
+        }
+      }
+
+      // Swap in/out buffers
+      //
+
+      hoNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+
+      // Check termination criteria
+      //
+
+      if( continue_flag == 0 ){
+        if( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+          std::cout << std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl;
+        }
+        break;
+      }
+
+      // NOTE(review): '>' tested before the increment allows max_num_iterations_per_level_+2 sweeps - confirm intended
+      if( iteration_no > this->max_num_iterations_per_level_ )
+        break;
+
+      iteration_no++;
+    }
+
+    if( ping == displacements_ping.get() )
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+  
+  // 
+  // Template instantiation
+  // (explicit instantiations for float and double with D = 1 to 4)
+
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,1>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,2>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,3>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<float,4>;
+
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,1>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,2>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,3>;
+  template class EXPORTCPUREG hoCKOpticalFlowSolver<double,4>;  
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.h b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.h
new file mode 100644
index 0000000..0db9689
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoCKOpticalFlowSolver.h
@@ -0,0 +1,55 @@
+/** \file hoCKOpticalFlowSolver.h
+    \brief CPU-based Cornelius-Kanade optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "hoOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  /**
+     CPU Cornelius-Kanade optical flow solver.
+
+     Extends hoOpticalFlowSolver with two weights, alpha and beta, that are
+     consumed by core_solver(). T is the floating point type of the images and
+     D the number of spatial dimensions.
+  */
+  template<class T, unsigned int D> class EXPORTCPUREG hoCKOpticalFlowSolver
+    : public hoOpticalFlowSolver<T, D>
+  {
+  public:
+
+    // Construct a solver with the default weights (alpha = 0.05, beta = 1.0)
+    //
+
+    hoCKOpticalFlowSolver()
+      : hoOpticalFlowSolver<T,D>()
+      , alpha_( T(0.05) )
+      , beta_( T(1.0) )
+    {}
+
+    virtual ~hoCKOpticalFlowSolver() {}
+
+    // Override the regularization weights used by subsequent solves
+    //
+
+    inline void set_alpha( T alpha ) { alpha_ = alpha; }
+    inline void set_beta( T beta ) { beta_ = beta; }
+
+  protected:
+
+    // Solver entry point required by the base class; defined in the .cpp file
+    //
+
+    virtual boost::shared_ptr< hoNDArray<T> >
+      core_solver( hoNDArray<T> *gradient_image, hoNDArray<T> *stencil );
+
+  protected:
+    T alpha_; // regularization weight alpha
+    T beta_;  // regularization weight beta
+  };
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.cpp b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.cpp
new file mode 100644
index 0000000..7350237
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.cpp
@@ -0,0 +1,286 @@
+#include "hoHSOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  // Helpers
+  //
+  
+  // Tell whether moving from coordinate 'co' by 'stride' leaves the image.
+  //
+  // Returns true if, for at least one dimension, the stride component is -1 while the
+  // coordinate is 0, or the stride component is +1 while the coordinate is the last
+  // valid index (dims-1). Stride components other than -1/+1 never trigger.
+  //
+  template<unsigned int D> inline bool
+  is_border_pixel_for_stride( typename int64d<D>::Type stride, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    for( size_t axis=0; axis<D; ++axis ){
+
+      const bool leaves_low_side = ( stride.vec[axis] == -1 ) && ( co.vec[axis] == 0 );
+      const bool leaves_high_side = ( stride.vec[axis] == 1 ) && ( co.vec[axis] == (dims.vec[axis]-1) );
+
+      if( leaves_low_side || leaves_high_side )
+        return true;
+    }
+    return false;
+  }
+  
+  // Compile-time integer power: Pow<i,j>::Value == i^j.
+  //
+  // The recursion terminates at j == 0 (Value == 1), so every j >= 0 is well defined;
+  // results for j >= 1 are unchanged.
+  //
+  template<size_t i, size_t j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value };
+  };
+  
+  // Base case: i^0 == 1
+  template <size_t i> struct Pow<i,0>
+  {
+    enum { Value = 1 };
+  };
+
+  //
+  // Implementation
+  //
+
+  // Horn-Schunck core solver: estimates the displacement field by Jacobi iterations.
+  //
+  // _gradient_image : gradient array with more than D dimensions. Its last dimension holds
+  //                   the D spatial derivative components followed by the temporal component;
+  //                   any dimensions between D and the last one are treated as batches.
+  // _stencil_image  : optional (may be 0x0). Pixels whose stencil value is > 0 are skipped and
+  //                   keep a zero displacement. The stencil is indexed per batch-local pixel.
+  //
+  // Returns the displacement field; its dimensions equal those of the gradient image with the
+  // size of the last dimension replaced by D.
+  // Throws std::runtime_error if the gradient image is missing or has too few dimensions.
+  //
+  template<class T, unsigned int D> boost::shared_ptr< hoNDArray<T> >
+  hoHSOpticalFlowSolver<T,D>::core_solver( hoNDArray<T> *_gradient_image, hoNDArray<T> *_stencil_image )
+  {
+    // Sanity checks
+    //
+
+    if( !_gradient_image ){
+      throw std::runtime_error("hoHSOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+
+    if( _gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("hoHSOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+
+    // The dimensions of the displacement field should match the gradient field
+    // - when removing the temporal gradient component (replacing D+1 with D)
+    //
+
+    boost::shared_ptr< std::vector<size_t> > disp_dims = _gradient_image->get_dimensions();
+    disp_dims->pop_back(); disp_dims->push_back(D);
+
+    // Two displacement buffers: each sweep reads one and writes the other
+    boost::shared_ptr< hoNDArray<T> > displacements_ping(new hoNDArray<T>(disp_dims.get()));
+    boost::shared_ptr< hoNDArray<T> > displacements_pong(new hoNDArray<T>(disp_dims.get()));
+
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+
+    // Scratch buffer ("shared memory") holding the neighbor-averaged displacements
+    boost::shared_ptr< hoNDArray<T> > _shared_mem(new hoNDArray<T>(disp_dims.get()));
+    T *shared_mem = _shared_mem->get_data_ptr();
+    clear( _shared_mem.get());
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *_gradient_image->get_dimensions() );
+    size_t num_batches = 1;
+
+    for( size_t d=D; d<_gradient_image->get_number_of_dimensions()-1; d++ ){
+      num_batches *= _gradient_image->get_size(d);
+    }
+
+    // Quantities that do not change between sweeps
+    //
+
+    // Number of elements per batch
+    const size_t num_elements_per_batch = prod(matrix_size);
+
+    // Number of elements per dim
+    const size_t num_elements_per_dim = num_elements_per_batch*num_batches;
+
+    // Signed loop bound matching the signed loop index used by the (OpenMP) loops
+    const long long num_indices = static_cast<long long>(num_elements_per_dim);
+
+    T *gradient_image = _gradient_image->get_data_ptr();
+    T *stencil_image = (_stencil_image) ? _stencil_image->get_data_ptr() : 0x0;
+
+    const T disp_thresh_sqr = this->limit_*this->limit_;
+
+    const typename int64d<D>::Type zeros(0);
+    const typename int64d<D>::Type ones(1);
+    const typename int64d<D>::Type threes(3);
+    const long long num_neighbors = Pow<3,D>::Value;
+
+    const bool verbose = ( this->output_mode_ >= hoOpticalFlowSolver<T,D>::OUTPUT_VERBOSE );
+
+    // Get ready...
+    //
+
+    size_t iteration_no = 0;
+    hoNDArray<T> *ping = displacements_ping.get();
+    hoNDArray<T> *pong = displacements_pong.get();
+
+    if( verbose ) {
+      std::cout << std::endl;
+    }
+
+    //
+    // Main Jacobi loop
+    //
+
+    while(true){
+
+      if( verbose ) {
+        std::cout << "."; std::cout.flush();
+      }
+
+      // Continuation flag used for early Jacobi termination
+      size_t continue_flag = 0;
+
+      T *in_disp = ping->get_data_ptr();
+      T *out_disp = pong->get_data_ptr();
+
+      //
+      // Find the average velocities (shared memory)
+      //
+
+      for( size_t dim = 0; dim < D; dim++ ){
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+        for( long long idx = 0; idx < num_indices; idx++ ){
+
+          // Index to the shared memory
+          const size_t shared_idx = dim*num_elements_per_dim+idx;
+
+          // Batch idx (second slowest varying dimension)
+          const size_t batch_idx = idx/num_elements_per_batch;
+
+          // Local index to the image (or batch in our terminology)
+          const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+
+          if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+            continue;
+
+          // Local co to the image
+          const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+
+          // Offset of the current displacement component and batch
+          const size_t base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+
+          // Accumulate into locals so that every sweep starts from zero; accumulating directly
+          // into shared_mem would mix in the average left over from the previous sweep.
+          T neighbor_sum = T(0);
+          T num_contribs = T(0);
+
+          for( long long i=0; i<num_neighbors; i++ ){
+
+            // Find the stride of the neighbor {-1, 0, 1}^D
+            const typename int64d<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+
+            // Verify that the neighbor is not out of bounds (and not the pixel itself)
+            if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){
+
+              const size_t neighbor_idx = (size_t) co_to_idx<D>( vector_td<long long,D>(co)+stride, vector_td<long long,D>(matrix_size)) + base_offset;
+
+              neighbor_sum += in_disp[neighbor_idx];
+              num_contribs += T(1);
+            }
+          }
+
+          // Normalize (a pixel without any in-bounds neighbor averages to zero)
+          shared_mem[shared_idx] = ( num_contribs > T(0) ) ? neighbor_sum/num_contribs : T(0);
+        }
+      }
+
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+
+      for( size_t dim = 0; dim < D; dim++ ){
+        // The reduction gives each thread a private flag, avoiding concurrent writes
+#ifdef USE_OMP
+#pragma omp parallel for reduction(|:continue_flag)
+#endif
+        for( long long idx = 0; idx < num_indices; idx++ ){
+
+          // Batch idx (second slowest varying dimension)
+          const size_t batch_idx = idx/num_elements_per_batch;
+
+          // Local index to the image (or batch in our terminology)
+          const size_t idx_in_batch = idx-batch_idx*num_elements_per_batch;
+
+          if( stencil_image && stencil_image[idx_in_batch] > T(0) )
+            continue;
+
+          // Index to the shared memory
+          const size_t shared_idx = dim*num_elements_per_dim+idx;
+
+          T phi = T(0);
+          T norm = T(0);
+
+          typename reald<T,D>::Type derivatives;
+
+          // Contributions from the spatial dimensions
+          //
+
+          for( size_t d=0; d<D; d++ ){
+            derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+            const size_t shared_idx_d = d*num_elements_per_dim+idx;
+            phi += (shared_mem[shared_idx_d]*derivatives.vec[d]);
+            norm += (derivatives.vec[d]*derivatives.vec[d]);
+          }
+
+          // Contributions from the temporal dimension
+          //
+
+          phi += gradient_image[D*num_elements_per_dim+idx];
+
+          // Normalize
+          //
+
+          phi /= (alpha_*alpha_+norm);
+
+          // Form result displacement
+          //
+
+          T result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+
+          // Clear the "termination" flag if the displacement field has changed above the threshold
+          //
+
+          T delta = result-in_disp[dim*num_elements_per_dim+idx];
+          if( delta*delta > disp_thresh_sqr )
+            continue_flag |= 1;
+
+          // Output result
+          //
+
+          out_disp[dim*num_elements_per_dim+idx] = result;
+        }
+      }
+
+      // Swap in/out buffers
+      //
+
+      hoNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+
+      // Check termination criteria
+      //
+
+      if( continue_flag == 0 ){
+        if( verbose ) {
+          std::cout << std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl;
+        }
+        break;
+      }
+
+      // NOTE(review): with a zero-based counter and a strict '>' this permits up to
+      // max_num_iterations_per_level_+2 sweeps; kept as is, confirm the intended cap.
+      if( iteration_no > this->max_num_iterations_per_level_ )
+        break;
+
+      iteration_no++;
+    }
+
+    // After the final swap 'ping' designates the most recently written buffer
+    if( ping == displacements_ping.get() )
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+     
+  // 
+  // Template instantiation
+  //
+  // core_solver() is defined in this source file rather than in the header, so the
+  // supported combinations (T = float/double, D = 1..4) are instantiated explicitly here.
+  //
+  
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,1>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,2>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,3>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<float,4>;
+  
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,1>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,2>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,3>;
+  template class EXPORTCPUREG hoHSOpticalFlowSolver<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.h b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.h
new file mode 100644
index 0000000..ab1df40
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoHSOpticalFlowSolver.h
@@ -0,0 +1,52 @@
+/** \file hoHSOpticalFlowSolver.h
+    \brief CPU-based Horn-Schunck optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "hoOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  /**
+     CPU Horn-Schunck optical flow solver.
+
+     Extends hoOpticalFlowSolver with the regularization weight alpha, which
+     core_solver() uses (squared) in the normalization of each Jacobi update.
+     T is the floating point type of the images and D the number of spatial dimensions.
+  */
+  template<class T, unsigned int D> class EXPORTCPUREG hoHSOpticalFlowSolver
+    : public hoOpticalFlowSolver<T, D>
+  {
+  public:
+
+    // Construct a solver with the default weight (alpha = 0.1)
+    //
+
+    hoHSOpticalFlowSolver()
+      : hoOpticalFlowSolver<T,D>()
+      , alpha_( T(0.1) )
+    {}
+
+    virtual ~hoHSOpticalFlowSolver() {}
+
+    // Override the regularization weight used by subsequent solves
+    //
+
+    inline void set_alpha( T alpha ) { alpha_ = alpha; }
+
+  protected:
+
+    // Solver entry point required by the base class; defined in the .cpp file
+    //
+
+    virtual boost::shared_ptr< hoNDArray<T> >
+      core_solver( hoNDArray<T> *gradient_image, hoNDArray<T> *stencil_image );
+
+  protected:
+    T alpha_; // regularization weight alpha
+  };
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.cpp b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.cpp
new file mode 100644
index 0000000..f2d3a9d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.cpp
@@ -0,0 +1,203 @@
+#include "hoLinearResampleOperator.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+#include "hoArmadillo.h"
+
+#include <stdio.h>
+#include <cmath>
+
+namespace Gadgetron{
+
+  // Apply the resampling operator: out = R*in, evaluated as the row vector product in*R_T_
+  // (R_T_ stores the transposed resampling matrix).
+  //
+  // in / out   : arrays with allocated data; element counts must be compatible with R_T_.
+  // accumulate : when true the result is added to the current contents of 'out',
+  //              otherwise 'out' is overwritten.
+  //
+  // Throws std::runtime_error if no displacement field has been set or an array is invalid.
+  //
+  template <class T, unsigned int D>
+  void hoLinearResampleOperator<T,D>::mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+  {
+    if( !this->preprocessed_ ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_M(): displacements not set." );
+    }
+  
+    if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_M(): illegal input/output array." );
+    }
+  
+    // NOTE(review): as_arma_row() is expected to wrap the array memory without copying,
+    // so writing to out_vec writes into 'out' - confirm against hoArmadillo.h
+    arma::Row<typename stdType<T>::Type > in_vec = as_arma_row(in);
+    arma::Row<typename stdType<T>::Type > out_vec = as_arma_row(out);
+
+    if( accumulate )
+      out_vec += in_vec*R_T_;
+    else
+      out_vec = in_vec*R_T_;
+  }
+
+  // Apply the adjoint of the resampling operator: out = R^T*in, evaluated as the
+  // column vector product R_T_*in (R_T_ stores the transposed resampling matrix).
+  //
+  // in / out   : arrays with allocated data; element counts must be compatible with R_T_.
+  // accumulate : when true the result is added to the current contents of 'out',
+  //              otherwise 'out' is overwritten.
+  //
+  // Throws std::runtime_error if no displacement field has been set or an array is invalid.
+  //
+  template <class T, unsigned int D>
+  void hoLinearResampleOperator<T,D>::mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+  {
+    if( !this->preprocessed_ ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_MH(): displacements not set." );
+    }
+  
+    if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+      throw std::runtime_error("hoLinearResampleOperator::mult_MH(): illegal input/output array." );
+    }
+
+    // NOTE(review): as_arma_col() is expected to wrap the array memory without copying,
+    // so writing to out_vec writes into 'out' - confirm against hoArmadillo.h
+    arma::Col<typename stdType<T>::Type > in_vec = as_arma_col(in);
+    arma::Col<typename stdType<T>::Type > out_vec = as_arma_col(out);
+
+    if( accumulate )
+      out_vec += R_T_ * in_vec;
+    else
+      out_vec = R_T_ * in_vec;
+  }
+  
+  // Drop the cached (transposed) resampling matrix, then let the base class
+  // perform its own reset.
+  //
+  template <class T, unsigned int D>
+  void hoLinearResampleOperator<T,D>::reset()
+  {
+    typedef resampleOperator< hoNDArray<typename realType<T>::Type>, hoNDArray<T> > base_operator;
+
+    R_T_.reset();
+    base_operator::reset();
+  }
+  
+  // Build the (transposed) sparse linear-interpolation matrix R_T_ from a displacement field.
+  //
+  // displacements : array with D spatial dimensions followed by either
+  //                   - one trailing dimension (the displacement components), or
+  //                   - two trailing dimensions (number of registrations, then components).
+  //                 The component dimension must have size D or D+1; only the first D
+  //                 components are read. Component 'dim' of pixel 'idx' is read at
+  //                 offset dim*num_elements_ext + idx.
+  //
+  // For every pixel the displaced coordinate is computed and one matrix entry is stored per
+  // interpolation neighbor: column = index of the pixel, row = index of the neighbor (within
+  // the same registration), value = linear interpolation weight. Pixels whose displaced
+  // coordinate is reported by is_border_pixel() get no entries.
+  //
+  // On success preprocessed_ is set to true; it is cleared on entry, so a failed call leaves
+  // the operator unusable until a valid field is set.
+  // Throws std::runtime_error on a null pointer or unexpected array dimensionality.
+  //
+  template <class T, unsigned int D> void
+  hoLinearResampleOperator<T,D>::set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > displacements )
+  {
+    typedef typename realType<T>::Type REAL;
+    this->preprocessed_ = false;
+
+    if( displacements.get() == 0x0 ){
+      throw std::runtime_error("hoLinearResampleOperator::set_displacement_field : displacements ptr is 0x0." );
+    }  
+  
+    // Number of dimensions beyond the D spatial ones (must be 1 or 2, see above)
+    const int surplus = displacements->get_number_of_dimensions()-D;
+  
+    if( !( surplus == 1 || surplus == 2 ) ){
+      throw std::runtime_error("hoLinearResampleOperator::set_displacement_field : unexpected array dimensionality." );
+    }  
+  
+    // Determine the number of registrations performed
+    const unsigned int extended_dim = (surplus == 1) ? 1 : displacements->get_size(D); 
+    const unsigned int field_dim = (surplus == 1) ? displacements->get_size(D) : displacements->get_size(D+1);
+
+    if( !(field_dim == D || field_dim == D+1 )){
+      throw std::runtime_error("hoLinearResampleOperator::set_displacement_field : illegal tailing array dim" );
+    }
+  
+    // Spatial extent, pixels per image, and pixels over all registrations
+    const typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *(displacements->get_dimensions()));
+    const size_t num_elements_mat = prod(matrix_size);
+    const size_t num_elements_ext = prod(matrix_size)*extended_dim;
+    
+    // Triplet storage sized for the worst case (every pixel contributes all its neighbors);
+    // location_index counts the entries actually written
+    const unsigned int num_neighbors = this->get_num_neighbors();
+    arma::umat locations(2,num_elements_ext*num_neighbors);
+    arma::Col<typename realType<T>::Type > values(num_elements_ext*num_neighbors);
+    size_t location_index = 0;
+
+    for( size_t idx=0; idx<num_elements_ext; idx++ ){
+    
+      const size_t batch_no = idx/num_elements_mat;
+      const size_t idx_in_batch = idx-batch_no*num_elements_mat;
+    
+      // Displaced (real valued) coordinate of the current pixel
+      const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co);
+      for( unsigned int dim=0; dim<D; dim++ ){
+        REAL tmp = displacements->get_data_ptr()[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch];
+        co_disp.vec[dim] += tmp;
+      } 
+    
+      // Extent (2 per dimension) used below to turn the neighbor number
+      // into a per-dimension 0/1 offset
+      //
+    
+      const typename uint64d<D>::Type twos(2);
+    
+      // Weights are non-zero only if all neighbors exist
+      //
+    
+      if( this->is_border_pixel(co_disp, matrix_size) )
+        continue;
+    
+      // Iterate over all neighbors
+      //
+    
+      size_t mat_j = idx;
+      size_t mat_i;
+    
+      for( unsigned int i=0; i<num_neighbors; i++ ){
+      
+        // Determine image coordinate of current neighbor
+        //
+        
+        const typename uint64d<D>::Type stride = idx_to_co<D>( i, twos );
+        
+        if( weak_greater_equal( stride, matrix_size ) ) continue; // For dimensions of size 1
+        
+        typename reald<REAL,D>::Type co_stride;
+      
+        // Offset 0 selects the floor of the coordinate, offset 1 the next integer above it
+        // (ceil, bumped by one when the coordinate is already integral)
+        for( unsigned int dim=0; dim<D; dim++ ){
+          if( stride.vec[dim] == 0 ){
+            co_stride.vec[dim] = std::floor(co_disp.vec[dim]);
+          }
+          else{
+            co_stride.vec[dim] = std::ceil(co_disp.vec[dim]);
+            if( co_stride.vec[dim] == co_disp.vec[dim] )
+              co_stride.vec[dim] += REAL(1.0);
+          }
+        }
+
+        // Validate that the coordinate is within the expected range
+        // (out-of-range components are clamped to [0, matrix_size-1])
+        //
+
+        typename uint64d<D>::Type ones(1);
+        typename uint64d<D>::Type co_stride_uint64d = vector_td<size_t,D>(co_stride);
+
+        if( weak_greater( co_stride_uint64d, matrix_size-ones ) ){
+
+          for( unsigned int dim=0; dim<D; dim++ ){
+            if( co_stride[dim] < REAL(0) )
+              co_stride_uint64d[dim] = 0;
+            if( co_stride[dim] > (REAL(matrix_size[dim])-REAL(1)) )
+              co_stride_uint64d[dim] = matrix_size[dim]-1;
+          }
+        }
+	
+        // Row index: the neighbor pixel within the same registration
+        mat_i = co_to_idx<D>(co_stride_uint64d, matrix_size)+batch_no*num_elements_mat;
+      
+        // Determine weight
+        // (product over the dimensions of one minus the distance to the neighbor)
+        //
+      
+        REAL weight = REAL(1);
+      
+        for( unsigned int dim=0; dim<D; dim++ ){	  
+          if( stride.vec[dim] == 0 ){
+            weight *= (REAL(1.0)-(co_disp.vec[dim]-co_stride.vec[dim])); }
+          else{
+            weight *= (REAL(1.0)-(co_stride.vec[dim]-co_disp.vec[dim])); }
+        }
+      
+        locations(0,location_index) = mat_i;
+        locations(1,location_index) = mat_j;
+        values(location_index) = weight;
+        location_index++;
+      }
+    }
+    // Shrink the triplet storage to the entries actually written and assemble the matrix.
+    // NOTE(review): the trailing 'false' is presumably Armadillo's sort_locations flag, i.e.
+    // the entries are assumed to be in column-major order already - confirm.
+    locations.resize(2,location_index);
+    values.resize(location_index);
+    R_T_ = arma::SpMat<REAL>( locations, values, num_elements_mat*extended_dim, num_elements_ext, false );
+    this->preprocessed_ = true;
+  }
+
+  // Tell whether the real valued coordinate 'co' lacks a complete interpolation neighborhood.
+  //
+  // Returns true if, for any dimension of size greater than one, the coordinate is negative
+  // or not strictly below the last index (dims-1). Dimensions of size one are ignored.
+  //
+  template <class T, unsigned int D> bool
+  hoLinearResampleOperator<T,D>::is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    for( unsigned int axis=0; axis<D; ++axis ){
+
+      // Singleton dimensions never make a pixel a border pixel
+      if( !( dims[axis] > 1 ) )
+        continue;
+
+      const bool below_first = co[axis] < REAL(0);
+      const bool at_or_past_last = co[axis] >= (REAL(dims[axis])-REAL(1));
+
+      if( below_first || at_or_past_last )
+        return true;
+    }
+    return false;
+  }
+
+  // Number of interpolation neighbors of a pixel: two per dimension, i.e. 2^D.
+  //
+  template <class T, unsigned int D> unsigned int
+  hoLinearResampleOperator<T,D>::get_num_neighbors()
+  {
+    const unsigned int two_to_the_D = 1 << D;
+    return two_to_the_D;
+  }
+  
+  // 
+  // Template instantiation
+  //
+  // The member functions are defined in this source file rather than in the header, so the
+  // supported combinations (T = float/double, D = 1..4) are instantiated explicitly here.
+  //
+
+  template class EXPORTCPUREG hoLinearResampleOperator<float,1>;
+  template class EXPORTCPUREG hoLinearResampleOperator<float,2>;
+  template class EXPORTCPUREG hoLinearResampleOperator<float,3>;
+  template class EXPORTCPUREG hoLinearResampleOperator<float,4>;
+
+  template class EXPORTCPUREG hoLinearResampleOperator<double,1>;
+  template class EXPORTCPUREG hoLinearResampleOperator<double,2>;
+  template class EXPORTCPUREG hoLinearResampleOperator<double,3>;
+  template class EXPORTCPUREG hoLinearResampleOperator<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.h b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.h
new file mode 100644
index 0000000..481d28e
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+#include "resampleOperator.h"
+#include "complext.h"
+#include "cpureg_export.h"
+#include "hoArmadillo.h"
+
+namespace Gadgetron{
+
+  /**
+     CPU linear-interpolation resampling operator backed by an Armadillo sparse matrix.
+
+     T is the element type of the images, D the number of spatial dimensions.
+     A displacement field must be provided through set_displacement_field() before
+     mult_M()/mult_MH() can be used.
+  */
+  template <class T, unsigned int D>
+  class EXPORTCPUREG hoLinearResampleOperator : public resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >
+  {  
+  public:
+    
+    hoLinearResampleOperator() : resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >() {}
+    virtual ~hoLinearResampleOperator() {}
+  
+    // Apply the resampling matrix (mult_M) or its transpose (mult_MH) to 'in', writing to 'out'
+    virtual void mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+
+    // Build the resampling matrix from a displacement field
+    virtual void set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > offsets );
+
+    // Discard the resampling matrix and reset the base class state
+    virtual void reset();
+
+    virtual boost::shared_ptr< linearOperator< hoNDArray<T> > > clone() {
+      return linearOperator< hoNDArray<T> >::clone(this);
+    }
+  
+  private:
+    // Helpers used while building the matrix; defined in the .cpp file
+    inline bool is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims );
+    inline unsigned int get_num_neighbors();
+  
+  protected:
+    arma::SpMat<typename realType<T>::Type> R_T_; //Contains the TRANSPOSED resampling matrix.
+  };
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.cpp b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.cpp
new file mode 100644
index 0000000..7b610f8
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.cpp
@@ -0,0 +1,206 @@
+#include "hoLinearResampleOperator_eigen.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+
+#include "GadgetronTimer.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <Eigen/Core>
+
+namespace Gadgetron{
+
+    // Apply the resampling: out = in*R_, with 'in' and 'out' viewed as row vectors.
+    //
+    // in / out   : arrays with allocated data; element counts must match the rows
+    //              (in) and columns (out) of R_.
+    // accumulate : when true the result is added to the current contents of 'out',
+    //              otherwise 'out' is overwritten.
+    //
+    // Throws std::runtime_error if no displacement field has been set or an array is invalid.
+    //
+    template <class T, unsigned int D> void
+        hoLinearResampleOperator_eigen<T,D>::mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+    {
+        if( !this->preprocessed_ ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::mult_M(): displacements not set." );
+        }
+
+        if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::mult_M(): illegal input/output array." );
+        }
+
+        // Map the array memory as Eigen row vectors (no copies are made)
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, 1, Eigen::Dynamic> > in_vec( in->get_data_ptr(), in->get_number_of_elements() );
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, 1, Eigen::Dynamic> > out_vec( out->get_data_ptr(), out->get_number_of_elements() );
+
+        if( accumulate )
+            out_vec += in_vec * (*R_);
+        else
+            out_vec = in_vec * (*R_);
+    }
+
+    // Apply the transposed resampling: out = R_*in, with 'in' and 'out' viewed as column vectors.
+    //
+    // in / out   : arrays with allocated data; element counts must match the columns
+    //              (in) and rows (out) of R_.
+    // accumulate : when true the result is added to the current contents of 'out',
+    //              otherwise 'out' is overwritten.
+    //
+    // Throws std::runtime_error if no displacement field has been set or an array is invalid.
+    //
+    template <class T, unsigned int D> void
+        hoLinearResampleOperator_eigen<T,D>::mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate )
+    {
+        if( !this->preprocessed_ ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::mult_MH(): displacements not set." );
+        }
+
+        if( !in || !in->get_data_ptr() || !out || !out->get_data_ptr() ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::mult_MH(): illegal input/output array." );
+        }
+
+        // Map the array memory as Eigen column vectors (no copies are made)
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, Eigen::Dynamic, 1> > in_vec( in->get_data_ptr(), in->get_number_of_elements() );
+        Eigen::Map< Eigen::Matrix<typename realType<T>::Type, Eigen::Dynamic, 1> > out_vec( out->get_data_ptr(), out->get_number_of_elements() );
+
+        if( accumulate )
+            out_vec += (*R_) * in_vec;
+        else
+            out_vec = (*R_) * in_vec;
+    }
+
+    // Build the sparse linear-interpolation matrix R_ from a displacement field.
+    //
+    // displacements : array with D spatial dimensions followed by either
+    //                   - one trailing dimension (the displacement components), or
+    //                   - two trailing dimensions (number of registrations, then components).
+    //                 The component dimension must have size D or D+1; only the first D
+    //                 components are read. Component 'dim' of pixel 'idx' is read at
+    //                 offset dim*num_elements_ext + idx.
+    //
+    // R_ has one row per pixel of a single image and one column per pixel over all
+    // registrations. For every pixel one entry is stored per interpolation neighbor:
+    // column = index of the pixel, row = index of the neighbor within a single image,
+    // value = linear interpolation weight. Pixels whose displaced coordinate is reported
+    // by is_border_pixel() get no entries.
+    //
+    // The operator is marked as not preprocessed on entry and as preprocessed on success,
+    // so a failed call never leaves it flagged as usable.
+    // Throws std::runtime_error on a null pointer or unexpected array dimensionality.
+    //
+    template <class T, unsigned int D> void
+        hoLinearResampleOperator_eigen<T,D>::set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > displacements )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        // Invalidate any previous state until the new matrix has been built
+        this->preprocessed_ = false;
+
+        if( displacements.get() == 0x0 ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::set_displacement_field : displacements ptr is 0x0." );
+        }
+
+        // Number of dimensions beyond the D spatial ones (must be 1 or 2, see above)
+        const int surplus = displacements->get_number_of_dimensions()-D;
+
+        if( !( surplus == 1 || surplus == 2 ) ){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::set_displacement_field : unexpected array dimensionality." );
+        }
+
+        // Determine the number of registrations performed
+        const size_t extended_dim = (surplus == 1) ? 1 : displacements->get_size(D);
+        temporal_dim_size_ = extended_dim;
+
+        const size_t field_dim = (surplus == 1) ? displacements->get_size(D) : displacements->get_size(D+1);
+
+        if( !(field_dim == D || field_dim == D+1 )){
+            throw std::runtime_error("hoLinearResampleOperator_eigen::set_displacement_field : illegal tailing array dim" );
+        }
+
+        // Spatial extent, pixels per image, and pixels over all registrations
+        const typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *(displacements->get_dimensions()));
+
+        const size_t num_elements_mat = prod(matrix_size);
+        const size_t num_elements_ext = prod(matrix_size)*extended_dim;
+
+        R_ = boost::shared_ptr< Eigen::SparseMatrix<REAL> >
+            ( new Eigen::SparseMatrix<REAL>( num_elements_mat, num_elements_ext ) );
+
+        // Loop invariants: number of neighbors and the constant vectors used below
+        const size_t num_neighbors = this->get_num_neighbors();
+        const typename uint64d<D>::Type twos = to_vector_td<size_t,D>(2);
+        const typename uint64d<D>::Type ones = to_vector_td<size_t,D>(1);
+
+        // At most num_neighbors entries per pixel; reserving avoids repeated reallocation
+        std::vector< Eigen::Triplet<REAL> > coefficients;
+        coefficients.reserve( num_elements_ext*num_neighbors );
+
+        for( size_t idx=0; idx<num_elements_ext; idx++ ){
+
+            const size_t batch_no = idx/num_elements_mat;
+            const size_t idx_in_batch = idx-batch_no*num_elements_mat;
+
+            // Displaced (real valued) coordinate of the current pixel
+            const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+
+            typename reald<REAL,D>::Type co_disp = to_reald<REAL,size_t,D>(co);
+            for( size_t dim=0; dim<D; dim++ ){
+                REAL tmp = displacements->get_data_ptr()[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch];
+                co_disp.vec[dim] += tmp;
+            }
+
+            // Weights are non-zero only if all neighbors exist
+            //
+
+            if( this->is_border_pixel(co_disp, matrix_size) )
+                continue;
+
+            // Iterate over all neighbors
+            //
+
+            //
+            // Eigen asks us to build the matrix column by column
+            // It is more easy then to construct the transpose
+            //
+
+            const size_t mat_j = idx;
+
+            for( size_t i=0; i<num_neighbors; i++ ){
+
+                // Determine image coordinate of current neighbor
+                //
+
+                const typename uint64d<D>::Type stride = idx_to_co<D>( i, twos );
+
+                if( weak_greater_equal( stride, matrix_size ) ) continue; // For dimensions of size 1
+
+                typename reald<REAL,D>::Type co_stride;
+
+                // Offset 0 selects the floor of the coordinate, offset 1 the next integer above it
+                // (ceil, bumped by one when the coordinate is already integral)
+                for( size_t dim=0; dim<D; dim++ ){
+                    if( stride.vec[dim] == 0 ){
+                        co_stride.vec[dim] = std::floor(co_disp.vec[dim]);
+                    }
+                    else{
+                        co_stride.vec[dim] = std::ceil(co_disp.vec[dim]);
+                        if( co_stride.vec[dim] == co_disp.vec[dim] )
+                            co_stride.vec[dim] += REAL(1.0);
+                    }
+                }
+
+                // Validate that the coordinate is within the expected range
+                // (out-of-range components are clamped to [0, matrix_size-1])
+                //
+
+                typename uint64d<D>::Type co_stride_uint64d = to_uint64d<REAL,D>(co_stride);
+
+                if( weak_greater( co_stride_uint64d, matrix_size-ones ) ){
+
+                    for( size_t dim=0; dim<D; dim++ ){
+                        if( co_stride[dim] < REAL(0) )
+                            co_stride_uint64d[dim] = 0;
+                        if( co_stride[dim] > (REAL(matrix_size[dim])-REAL(1)) )
+                            co_stride_uint64d[dim] = matrix_size[dim]-1;
+                    }
+                }
+
+                // Row index: the neighbor pixel within a single image
+                const size_t mat_i = co_to_idx<D>(co_stride_uint64d, matrix_size);
+
+                // Determine weight
+                // (product over the dimensions of one minus the distance to the neighbor)
+                //
+
+                REAL weight = REAL(1);
+
+                for( size_t dim=0; dim<D; dim++ ){
+                    if( stride.vec[dim] == 0 ){
+                        weight *= (REAL(1.0)-(co_disp.vec[dim]-co_stride.vec[dim])); }
+                    else{
+                        weight *= (REAL(1.0)-(co_stride.vec[dim]-co_disp.vec[dim])); }
+                }
+
+                // Queue weight for insertion in resampling matrix R_
+                //
+
+                coefficients.push_back(Eigen::Triplet<REAL>(mat_i, mat_j, weight));
+            }
+        }
+
+        // Assemble the sparse matrix from the collected triplets
+        R_->setFromTriplets(coefficients.begin(), coefficients.end());
+        this->preprocessed_ = true;
+    }
+
+    // Tell whether the real valued coordinate 'co' lacks a complete interpolation neighborhood.
+    //
+    // Returns true if, for any dimension of size greater than one, the coordinate is negative
+    // or not strictly below the last index (dims-1). Dimensions of size one are ignored.
+    //
+    template <class T, unsigned int D> bool
+        hoLinearResampleOperator_eigen<T,D>::is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims )
+    {
+        typedef typename realType<T>::Type REAL;
+
+        for( size_t axis=0; axis<D; ++axis ){
+
+            // Singleton dimensions never make a pixel a border pixel
+            if( !( dims[axis] > 1 ) )
+                continue;
+
+            const bool below_first = co[axis] < REAL(0);
+            const bool at_or_past_last = co[axis] >= (REAL(dims[axis])-REAL(1));
+
+            if( below_first || at_or_past_last )
+                return true;
+        }
+        return false;
+    }
+
+    // Number of interpolation neighbors of a pixel: two per dimension, i.e. 2^D.
+    //
+    template <class T, unsigned int D> size_t
+        hoLinearResampleOperator_eigen<T,D>::get_num_neighbors()
+    {
+        const size_t two_to_the_D = 1 << D;
+        return two_to_the_D;
+    }
+
+
+    //
+    // Template instantiation
+    //
+    // The member functions are defined in this source file rather than in the header, so the
+    // supported combinations (T = float/double, D = 1..4) are instantiated explicitly here.
+    //
+
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,1>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,2>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,3>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<float,4>;
+
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,1>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,2>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,3>;
+    template class EXPORTCPUREG hoLinearResampleOperator_eigen<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.h b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.h
new file mode 100644
index 0000000..61b63a2
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoLinearResampleOperator_eigen.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "hoNDArray_math.h"
+
+#include "resampleOperator.h"
+#include "complext.h"
+#include "cpureg_export.h"
+
+#include <armadillo>
+#include <Eigen/Sparse>
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  class EXPORTCPUREG hoLinearResampleOperator_eigen : public resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >
+  {  
+  public:
+    // Resampling operator for host arrays (hoNDArray) whose resampling matrix R_ is an Eigen sparse matrix.
+    hoLinearResampleOperator_eigen() : resampleOperator<hoNDArray<typename realType<T>::Type>, hoNDArray<T> >(), temporal_dim_size_(0) {}
+    virtual ~hoLinearResampleOperator_eigen() {}
+    // Defined in the .cpp file; presumably the operator (mult_M) and its adjoint (mult_MH) - confirm there.
+    virtual void mult_M( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void mult_MH( hoNDArray<T> *in, hoNDArray<T> *out, bool accumulate = false);
+    virtual void set_displacement_field( boost::shared_ptr< hoNDArray<typename realType<T>::Type> > offsets );
+    // Returns temporal_dim_size_; the constructor sets it to 0, so the value is well defined from the start.
+    virtual size_t get_temporal_dimension_size() { return temporal_dim_size_; }
+  
+    virtual boost::shared_ptr< linearOperator< hoNDArray<T> > > clone() {
+      return linearOperator< hoNDArray<T> >::clone(this);
+    }
+  
+  private:
+    inline bool is_border_pixel( typename reald<typename realType<T>::Type,D>::Type co, typename uint64d<D>::Type dims );
+    inline size_t get_num_neighbors();
+  
+  protected:
+    boost::shared_ptr< Eigen::SparseMatrix<typename realType<T>::Type> > R_; // sparse resampling matrix (null after construction)
+    size_t temporal_dim_size_; // value reported by get_temporal_dimension_size()
+  };
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.cpp b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.cpp
new file mode 100644
index 0000000..147eae9
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.cpp
@@ -0,0 +1,183 @@
+#include "hoOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+#include <algorithm>
+
+namespace Gadgetron{
+
+  // Helpers
+  //
+
+  template<unsigned int D> inline typename uint64d<D>::Type 
+  compute_stride( size_t dim )
+  {
+    // Build the vector that is 1 in component `dim` and 0 in every other component.
+    typename uint64d<D>::Type unit;
+    for( size_t axis=0; axis<D; ++axis ){
+      if( axis == dim ) unit.vec[axis] = 1; else unit.vec[axis] = 0;
+    }
+    return unit;
+  }
+
+  template<unsigned int D> inline bool 
+  is_border_pixel_in_stride_dim_before( size_t dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    // There is no "minus stride" neighbour when the coordinate along `dim` is already 0.
+    // `dims` is not consulted by this check.
+    const bool at_lower_edge = ( co.vec[dim] == 0 );
+    return at_lower_edge;
+  }
+
+  template<unsigned int D> inline bool 
+  is_border_pixel_in_stride_dim_after( size_t dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    // There is no "plus stride" neighbour when the coordinate along `dim` equals the last index, dims[dim]-1.
+    // NOTE(review): dims.vec[dim]-1 presumably wraps for an extent of 0 (unsigned type) - extents are assumed >= 1.
+    const bool at_upper_edge = ( co.vec[dim] == (dims.vec[dim]-1) );
+    return at_upper_edge;
+  }
+    
+  template<class T, unsigned int D> void
+  hoOpticalFlowSolver<T,D>::core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+						  typename uint64d<D>::Type matrix_size, 
+						  size_t num_batches_fixed, 
+						  size_t num_batches_moving )
+  {        
+    // Writes D spatial partial derivatives of the averaged image 0.5*(fixed+moving) to gradient_image, one element per iteration
+    const size_t num_elements_per_batch = prod(matrix_size);
+    const size_t num_elements_per_pdev_fixed = num_elements_per_batch*num_batches_fixed; // elements per partial derivative
+    const size_t num_elements_per_pdev_moving = num_elements_per_batch*num_batches_moving;
+
+    // Total number of elements for all partial derivatives
+    const size_t num_elements_total = std::max(num_elements_per_pdev_fixed, num_elements_per_pdev_moving)*D;
+  
+    for( size_t idx = 0; idx<num_elements_total; idx++ ){
+    
+      // The (minimum) index in the slowest varying output dimension determines which partial derivative to compute 
+      const size_t stride_dim_fixed = idx/(num_elements_per_pdev_fixed);
+      const size_t stride_dim_moving = idx/(num_elements_per_pdev_moving);
+      const size_t stride_dim = std::min(stride_dim_fixed, stride_dim_moving);
+
+      // Local index to the partial derivative
+      const size_t idx_in_pdev_fixed = idx-stride_dim_fixed*num_elements_per_pdev_fixed;
+      const size_t idx_in_pdev_moving = idx-stride_dim_moving*num_elements_per_pdev_moving;
+
+      // Batch idx (second slowest varying dimension)   
+      const size_t batch_idx_fixed = idx_in_pdev_fixed/num_elements_per_batch;
+      const size_t batch_idx_moving = idx_in_pdev_moving/num_elements_per_batch;
+
+      // Local index to the batch (should be identical for the fixed/moving image)
+      const size_t idx_in_batch = idx_in_pdev_moving-batch_idx_moving*num_elements_per_batch;
+
+      // Local co to the image
+      const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+      // res holds the difference being built; count is the number of in-range neighbours found (0, 1 or 2)
+      T res;
+      size_t count = 0;
+
+      //
+      // Find partial derivatives using central differences
+      //
+    
+      const typename uint64d<D>::Type stride = compute_stride<D>(stride_dim);
+      const size_t base_idx_moving = batch_idx_moving*num_elements_per_batch;
+      const size_t base_idx_fixed = batch_idx_fixed*num_elements_per_batch;
+
+      size_t stride_base_idx, fixed_idx, moving_idx;
+     
+      // Neighbor "plus stride" side
+      if( !is_border_pixel_in_stride_dim_after<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co+stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = idx_in_batch;
+      }
+      // Mean of the fixed and moving image at the "plus" position (the pixel itself at the upper border)
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+      
+      res = (fixed_image[fixed_idx]+moving_image[moving_idx])*T(0.5);
+
+      // Neighbor "minus stride" side
+      if( !is_border_pixel_in_stride_dim_before<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co-stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = co_to_idx<D>(co, matrix_size);
+      }
+      // Subtract the mean at the "minus" position (the pixel itself at the lower border)
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+    
+      res -= (fixed_image[fixed_idx]+moving_image[moving_idx])*T(0.5);
+      // A two-sided difference spans two pixels and is halved; a one-sided border difference is left unscaled
+      if( count == 2 ) // Both neighbors exist
+        res /= T(2);
+
+      // Output result
+      //
+    
+      gradient_image[idx] = res;
+    }
+  }
+  
+  template<class T, unsigned int D> void
+  hoOpticalFlowSolver<T,D>::core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+						   typename uint64d<D>::Type matrix_size, 
+						   size_t num_batches_fixed, 
+						   size_t num_batches_moving )
+  {
+    // Temporal image gradient: every output element is moving_image[..] - fixed_image[..] at the same pixel.
+    //
+    // fixed_image / moving_image : num_batches_fixed / num_batches_moving images of prod(matrix_size) elements each
+    // gradient_image            : receives max(fixed elements, moving elements) values
+    //
+    // The series with fewer elements is indexed modulo its length, so it is revisited from its start
+    // when the other series is longer.
+    //
+
+    const size_t image_size = prod(matrix_size);
+    const size_t fixed_size = image_size*num_batches_fixed;
+    const size_t moving_size = image_size*num_batches_moving;
+    const size_t output_size = std::max(fixed_size, moving_size);
+
+    for( size_t out_idx=0; out_idx<output_size; ++out_idx ){
+
+      // Position within each series
+      const size_t pos_fixed = out_idx % fixed_size;
+      const size_t pos_moving = out_idx % moving_size;
+
+      // Batch (image) containing that position
+      const size_t batch_fixed = pos_fixed/image_size;
+      const size_t batch_moving = pos_moving/image_size;
+
+      // Pixel offset inside the image; derived from the moving series and used for both
+      const size_t pixel = pos_moving % image_size;
+
+      // Centre pixel in each series
+      const size_t fixed_idx = batch_fixed*image_size + pixel;
+      const size_t moving_idx = batch_moving*image_size + pixel;
+
+      // Output result
+      //
+
+      gradient_image[out_idx] = moving_image[moving_idx]-fixed_image[fixed_idx];
+    }
+  }
+  
+  // 
+  // Template instantiation
+  //
+
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,1>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,2>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,3>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<float,4>;
+
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,1>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,2>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,3>;
+  template class EXPORTCPUREG hoOpticalFlowSolver<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.h b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.h
new file mode 100644
index 0000000..924bb7d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoOpticalFlowSolver.h
@@ -0,0 +1,48 @@
+/** \file hoOpticalFlowSolver.h
+    \brief Abstract class for a CPU-based optical flow registration solver.
+
+    hoOpticalFlowSolver is derived from class opticalFlowSolver 
+    and implements the computation of the spatial and temporal gradients.
+    A pure virtual function is expected to implement the specific algorithm (Horn-Schunck, Cornelius-Kanade).
+*/
+
+#pragma once
+
+#include "hoNDArray.h"
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_utils.h"
+#include "hoNDArray_blas.h"
+#include "hoRegistration_utils.h"
+#include "opticalFlowSolver.h"
+#include "cpureg_export.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTCPUREG hoOpticalFlowSolver 
+    : public opticalFlowSolver< hoNDArray<T>,D >
+  {  
+  public:
+    // Construction only forwards to the opticalFlowSolver base; this class declares no data members.
+    hoOpticalFlowSolver() : opticalFlowSolver< hoNDArray<T>,D >() {}   
+    virtual ~hoOpticalFlowSolver() {}
+    
+  protected:
+
+    // Inherited and still pure virtual...
+    //virtual boost::shared_ptr< hoNDArray<T> > core_solver( hoNDArray<T> *gradient_image, hoNDArray<T> *stencil_image ) = 0;      
+
+    // CPU-based computation of the spatial and temporal image gradient
+    // (raw pointers to the image data, the matrix size of one image, and the number of images in each series).
+    
+    virtual void core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+				    typename uint64d<D>::Type matrix_size_moving, 
+				    size_t number_of_batches_fixed, 
+				    size_t number_of_batches_moving );
+    
+    virtual void core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+				     typename uint64d<D>::Type matrix_size_moving, 
+				     size_t number_of_batches_fixed, 
+				     size_t number_of_batches_moving );
+  };  
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoRegistration_utils.cpp b/toolboxes/registration/optical_flow/cpu/hoRegistration_utils.cpp
new file mode 100644
index 0000000..bf6b50e
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoRegistration_utils.cpp
@@ -0,0 +1,233 @@
+#include "hoRegistration_utils.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif 
+
+namespace Gadgetron{
+
+  // Utility to check if all neighbors required for the linear interpolation exists
+  // ... do not include dimensions of size 1
+
+  template<class REAL, unsigned int D> inline bool
+  is_border_pixel( vector_td<size_t,D> co, vector_td<size_t,D> dims )
+  {
+    for( size_t d=0; d<D; ++d ){ // dimensions of extent 1 never make a pixel a border pixel
+      const bool at_edge = ( co[d] == 0 || co[d] == (dims[d]-1) );
+      if( dims[d] > 1 && at_edge ) return true;
+    }
+    return false;
+  }
+
+  // Downsample the first D dimensions to half size by averaging; any further dimensions are treated as batches
+  template<class REAL, unsigned int D> 
+  boost::shared_ptr< hoNDArray<REAL> > downsample( hoNDArray<REAL> *_in )
+  {
+    // A few sanity checks (each failure throws std::runtime_error)
+    if( _in == 0x0 ){
+      throw std::runtime_error( "downsample(): illegal input provided.");
+    }
+    
+    if( _in->get_number_of_dimensions() < D ){
+      throw std::runtime_error( "downsample(): the number of array dimensions should be at least D");
+    }
+    
+    for( size_t d=0; d<D; d++ ){
+      if( (_in->get_size(d)%2) == 1 && _in->get_size(d) != 1 ){
+        throw std::runtime_error( "downsample(): uneven array dimensions larger than one not accepted");
+      }
+    }
+    
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *_in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = matrix_size_in >> 1;
+
+    // A dimension of size 1 stays at size 1 (1>>1 would otherwise give 0)
+    for( size_t d=0; d<D; d++ ){
+      if( matrix_size_out[d] == 0 ) 
+        matrix_size_out[d] = 1;
+    }
+  
+    const size_t num_elements = prod(matrix_size_out);
+    const size_t num_elements_in = prod(matrix_size_in); // elements per input frame (loop invariant)
+    size_t num_batches = 1;
+    // Dimensions beyond D are batch dimensions: counted and carried over unchanged to the output
+    std::vector<size_t> dims = to_std_vector(matrix_size_out);
+    for( size_t d=D; d<_in->get_number_of_dimensions(); d++ ){
+      num_batches *= _in->get_size(d);
+      dims.push_back(_in->get_size(d));
+    }
+  
+    REAL *in = _in->get_data_ptr();
+
+    boost::shared_ptr< hoNDArray<REAL> > _out( new hoNDArray<REAL>(&dims) );
+    REAL *out = _out->get_data_ptr();
+    
+    typedef vector_td<size_t,D> uint64d;
+
+    // The OpenMP loop index is signed, so the bound is converted once to the same type
+    const long long num_outputs = static_cast<long long>(num_elements*num_batches);
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx < num_outputs; idx++ ){
+      const size_t frame_offset = idx/num_elements;
+      const uint64d co_out = idx_to_co<D>( idx-frame_offset*num_elements, matrix_size_out );
+      const uint64d co_in = co_out << 1;
+      const uint64d twos(2);
+      const size_t num_adds = 1 << D;
+
+      size_t actual_adds = 0;
+      REAL res = REAL(0);
+
+      for( size_t i=0; i<num_adds; i++ ){
+        const uint64d local_co = idx_to_co<D>( i, twos );
+        // Skip offsets outside the INPUT (input dimensions of size 1); a size-2 input dimension averages both samples
+        if( weak_greater_equal( local_co, matrix_size_in ) ) continue;
+        const size_t in_idx = co_to_idx<D>(co_in+local_co, matrix_size_in)+frame_offset*num_elements_in;
+        actual_adds++;
+        res += in[in_idx];
+      }    
+      out[idx] = res/REAL(actual_adds);
+    }
+
+    return _out;
+  }
+
+  // Linear interpolation upsampling of the first D dimensions to doubled size; further dimensions are batches
+  template<class REAL, unsigned int D> boost::shared_ptr< hoNDArray<REAL> >
+  upsample( hoNDArray<REAL> *_in )
+  {
+    // A few sanity checks (each failure throws std::runtime_error)
+    if( _in == 0x0 ){
+      throw std::runtime_error("upsample(): illegal input provided.");
+    }
+
+    if( _in->get_number_of_dimensions() < D ){
+      throw std::runtime_error( "upsample(): the number of array dimensions should be at least D");
+    }
+    
+    typename uint64d<D>::Type matrix_size_in = from_std_vector<size_t,D>( *_in->get_dimensions() );
+    typename uint64d<D>::Type matrix_size_out = matrix_size_in << 1;
+
+    // Dimensions of size 1 are not upsampled
+    for( size_t d=0; d<D; d++ ){
+      if( matrix_size_in[d] == 1 )
+        matrix_size_out[d] = 1;
+    }
+  
+    const size_t num_elements = prod(matrix_size_out);
+    const size_t num_elements_in = prod(matrix_size_in); // elements per input frame (loop invariant)
+    size_t num_batches = 1;
+
+    // Dimensions beyond D are batch dimensions: counted and carried over unchanged to the output
+    std::vector<size_t> dims = to_std_vector(matrix_size_out);
+    for( size_t d=D; d<_in->get_number_of_dimensions(); d++ ){
+      num_batches *= _in->get_size(d);
+      dims.push_back(_in->get_size(d));
+    }
+
+    REAL *in = _in->get_data_ptr();
+
+    boost::shared_ptr< hoNDArray<REAL> > _out( new hoNDArray<REAL>(&dims) );
+    REAL *out = _out->get_data_ptr();
+    
+    typedef vector_td<size_t,D> uint64d;
+
+    // Values that do not depend on the pixel or neighbour are computed once, outside the loops
+    const size_t num_neighbors = 1 << D;
+    const uint64d twos(2);
+
+    // Per-dimension offset subtracted before halving an output coordinate (0 for dimensions of size 1)
+    uint64d ones(1);
+    for( size_t d=0; d<D; d++ ){
+      if( matrix_size_out[d] == 1 )
+        ones[d] = 0;
+    }
+
+    // The OpenMP loop index is signed, so the bound is converted once to the same type
+    const long long num_outputs = static_cast<long long>(num_elements*num_batches);
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+    for( long long idx=0; idx < num_outputs; idx++ ){
+      
+      REAL res = REAL(0);
+
+      const size_t frame_idx = idx/num_elements;
+      const uint64d co_out = idx_to_co<D>( idx-frame_idx*num_elements, matrix_size_out );
+
+      // We will only proceed if all neighbours exist (this adds a zero-boundary to the upsampled image/vector field)
+      //
+    
+      if( !is_border_pixel<REAL,D>(co_out, matrix_size_out) ){
+      
+        for( size_t i=0; i<num_neighbors; i++ ){
+
+          // Determine coordinate of neighbor in input
+          //
+
+          const uint64d stride = idx_to_co<D>( i, twos );
+
+          if( weak_greater_equal( stride, matrix_size_out ) ) continue; // To allow array dimensions of 1
+
+          const uint64d co_in = ((co_out-ones)>>1)+stride;
+
+          // Read corresponding pixel value
+          //
+
+          const size_t in_idx = co_to_idx<D>(co_in, matrix_size_in)+frame_idx*num_elements_in;
+          const REAL value = in[in_idx];
+
+          // Determine weight: 0.25 or 0.75 per upsampled dimension, depending on which input neighbour this is
+          //
+
+          REAL weight = REAL(1);
+
+          for( size_t dim=0; dim<D; dim++ ){
+            if( matrix_size_in[dim] > 1 ){
+              weight *= ( stride.vec[dim] == (co_out.vec[dim]%2) ) ? REAL(0.25) : REAL(0.75);
+            }
+          }
+
+          // Accumulate result
+          //
+
+          res += weight*value;
+        }
+      }
+      out[idx] = res;
+    }
+
+    return _out;
+  }
+
+  //
+  // Instantiation
+  //
+  
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > downsample<float,1>(hoNDArray<float>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > upsample<float,1>(hoNDArray<float>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > downsample<float,2>(hoNDArray<float>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > upsample<float,2>(hoNDArray<float>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > downsample<float,3>(hoNDArray<float>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > upsample<float,3>(hoNDArray<float>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > downsample<float,4>(hoNDArray<float>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<float> > upsample<float,4>(hoNDArray<float>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > downsample<double,1>(hoNDArray<double>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > upsample<double,1>(hoNDArray<double>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > downsample<double,2>(hoNDArray<double>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > upsample<double,2>(hoNDArray<double>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > downsample<double,3>(hoNDArray<double>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > upsample<double,3>(hoNDArray<double>*);
+
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > downsample<double,4>(hoNDArray<double>*);
+  template EXPORTCPUREG boost::shared_ptr< hoNDArray<double> > upsample<double,4>(hoNDArray<double>*);
+}
diff --git a/toolboxes/registration/optical_flow/cpu/hoRegistration_utils.h b/toolboxes/registration/optical_flow/cpu/hoRegistration_utils.h
new file mode 100644
index 0000000..a9afafb
--- /dev/null
+++ b/toolboxes/registration/optical_flow/cpu/hoRegistration_utils.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "hoNDArray.h"
+#include "cpureg_export.h"
+
+namespace Gadgetron{
+  
+  // Downsample array to half size by averaging
+  template<class REAL, unsigned int D> EXPORTCPUREG boost::shared_ptr< hoNDArray<REAL> > downsample( hoNDArray<REAL> *data );
+  
+  // Linear interpolation upsampling to array of doubled dimensions
+  template<class REAL, unsigned int D> EXPORTCPUREG boost::shared_ptr< hoNDArray<REAL> > upsample( hoNDArray<REAL> *data );
+}
diff --git a/toolboxes/registration/optical_flow/gpu/CMakeLists.txt b/toolboxes/registration/optical_flow/gpu/CMakeLists.txt
new file mode 100644
index 0000000..82c4b7d
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/CMakeLists.txt
@@ -0,0 +1,36 @@
+if (WIN32)
+ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUREG__)
+endif (WIN32)
+# Header search paths: the GPU core toolbox, the shared optical_flow directory and the CUDA headers
+include_directories(   
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/registration/optical_flow
+  ${CUDA_INCLUDE_DIRS}
+)
+# Shared library "gpureg" built from the CUDA sources below (cuRegistration_utils.cu is commented out)
+cuda_add_library(gpureg SHARED 
+  cuOpticalFlowSolver.cu 
+  cuHSOpticalFlowSolver.cu 
+  cuCKOpticalFlowSolver.cu 
+  cuResampleOperator.cu 
+  cuLinearResampleOperator.cu
+#  cuRegistration_utils.cu
+  )
+# Link against gpucore and the CUDA, cuFFT and cuBLAS libraries
+target_link_libraries(gpureg 
+  gpucore 
+  ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}
+  )
+# Install the library into lib ...
+install(TARGETS gpureg DESTINATION lib)
+# ... and the headers listed below into include
+install(FILES
+  cuOpticalFlowSolver.h
+  cuHSOpticalFlowSolver.h
+  cuCKOpticalFlowSolver.h
+  gpureg_export.h
+  cuResampleOperator.h
+  cuLinearResampleOperator.h
+#  cuRegistration_utils.h
+  cuCGHSOFSolver.h
+  DESTINATION include)
diff --git a/toolboxes/registration/optical_flow/gpu/cuCGHSOFSolver.h b/toolboxes/registration/optical_flow/gpu/cuCGHSOFSolver.h
new file mode 100644
index 0000000..39b2ba9
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuCGHSOFSolver.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "opticalFlowOperator.h"
+#include "cuPartialDerivativeOperator.h"
+#include "cuCgSolver.h"
+namespace Gadgetron{
+
+
+template<class T, unsigned int D> class cuCGHSOFSolver : public multiresRegistrationSolver<cuNDArray<T>, D>{
+public:
+	// Registration solver that hands an optical flow operator plus D partial-derivative regularizers to a cuCgSolver.
+	typedef opticalFlowOperator<cuNDArray<T>,cuPartialDerivativeOperator<T,D>,D> OFOp;
+	// Creates the operators and the solver. _alpha starts at T(0.05) (the default cuCKOpticalFlowSolver uses for its alpha).
+	cuCGHSOFSolver() : _alpha(T(0.05)) {
+		OF = boost::shared_ptr<OFOp>(new OFOp);
+		solver = boost::shared_ptr< cuCgSolver<T> >(new cuCgSolver<T>);
+		solver->set_encoding_operator(OF);
+		for (unsigned int i = 0; i < D; i++){
+			boost::shared_ptr<cuPartialDerivativeOperator<T,D> > dx(new cuPartialDerivativeOperator<T,D>(i));
+			solver->add_regularization_operator(dx);
+			ops.push_back(dx);
+		}
+	}
+
+	virtual ~cuCGHSOFSolver(){}
+
+	// Solves for (fixed_image - moving_image) and adds the solution to `result`, or assigns it if `result` is empty.
+	// fixed_image and moving_image must be non-null; stencil_image is not used by this solver.
+	virtual void compute( cuNDArray<T> *fixed_image, cuNDArray<T> *moving_image, cuNDArray<T> *stencil_image, boost::shared_ptr<cuNDArray<T> > &result )
+	{
+		std::vector<size_t> dims = *fixed_image->get_dimensions();
+		OF->set_codomain_dimensions(&dims);
+		OF->set_images(fixed_image,moving_image);
+
+		for (size_t i = 0; i < ops.size(); i++){
+			ops[i]->set_domain_dimensions(&dims);
+			ops[i]->set_codomain_dimensions(&dims);
+			ops[i]->set_weight(_alpha);
+		}
+
+		dims.push_back(D); // the operator's domain has an extra trailing dimension of size D
+		OF->set_domain_dimensions(&dims);
+		cuNDArray<T> It(*fixed_image);
+		It -= *moving_image;
+		boost::shared_ptr<cuNDArray<T> > resOp = solver->solve(&It);
+
+		if (result.get()) *result += *resOp;
+		else result = resOp;
+	}
+
+	// Weight applied to every regularization operator on the next call to compute()
+	void set_alpha(T alpha){ _alpha = alpha; }
+	boost::shared_ptr< cuCgSolver<T> > get_solver(){
+		return solver;
+	}
+
+protected:
+	T _alpha; // regularization weight; initialised by the constructor so compute() never reads garbage
+	boost::shared_ptr< cuCgSolver<T> > solver;
+	boost::shared_ptr<OFOp> OF;
+	std::vector<boost::shared_ptr<cuPartialDerivativeOperator<T,D> >  >ops;
+};
+
+
+
+
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.cu b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.cu
new file mode 100644
index 0000000..22458e3
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.cu
@@ -0,0 +1,340 @@
+#include "cuCKOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+  //
+  // Kernel prototype declarations
+  //
+
+  template<class REAL, unsigned int D> __global__ 
+  void CorneliusKanade_kernel(REAL*,REAL*,REAL*,REAL*,typename uintd<D>::Type,unsigned int,REAL,REAL,REAL,unsigned int*);
+
+  //
+  // Reference to shared memory
+  //
+
+  extern __shared__ char _shared_mem[];
+
+  //
+  // Implementation
+  //
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> >
+  cuCKOpticalFlowSolver<T,D>::core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image )
+  {
+    // Runs Jacobi iterations of the Cornelius-Kanade kernel on the gradient image and returns the buffer
+    // written by the last iteration. stencil_image may be null; a null pointer is then handed to the kernel.
+    // Throws std::runtime_error on illegal input or when one of the checked CUDA calls fails.
+
+    // Sanity checks
+    //
+  
+    if( !gradient_image ){
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+
+    if( gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+  
+    // The dimensions of the displacement field should match the gradient field
+    //
+  
+    boost::shared_ptr< std::vector<size_t> > disp_dims = gradient_image->get_dimensions();
+
+    boost::shared_ptr< cuNDArray<T> > displacements_ping( new cuNDArray<T>(disp_dims.get()));
+    boost::shared_ptr< cuNDArray<T> > displacements_pong( new cuNDArray<T>(disp_dims.get()));
+  
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+    
+    // Setup grid
+    //
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *gradient_image->get_dimensions() );
+    unsigned int number_of_elements = prod(matrix_size);
+    unsigned int number_of_batches = 1;
+
+    for( unsigned int d=D; d<gradient_image->get_number_of_dimensions()-1; d++ ){
+      number_of_batches *= gradient_image->get_size(d);
+    }
+  
+    dim3 blockDim; dim3 gridDim;
+    this->setup_grid( &blockDim, &gridDim, number_of_elements, number_of_batches*(D+1), true, D+1 );
+  
+    // Allocate continuation flag (used for early Jacobi termination by the kernel)
+    //
+  
+    unsigned int *continue_flag;
+    if( cudaMalloc((void**)&continue_flag, sizeof(unsigned int) ) != cudaSuccess ) {
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to allocate continuation flag.");
+    }
+    
+    unsigned int iteration_no = 0;
+    cuNDArray<T> *ping = displacements_ping.get();
+    cuNDArray<T> *pong = displacements_pong.get();
+
+    if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      std::cout << std::endl;
+    }
+
+    // Main Jacobi loop. The try block releases the continuation flag if any step inside it throws.
+    //
+    try{
+      while(true){
+
+        if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+          std::cout << "."; std::cout.flush();
+        }
+
+        // Clear termination flag
+        unsigned int _continue_flag = 0;
+        if( cudaMemcpy( continue_flag, &_continue_flag, sizeof(unsigned int), cudaMemcpyHostToDevice ) != cudaSuccess ) {
+          throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to set continuation flag.");
+        }
+
+        // Invoke kernel (dynamic shared memory: one T per thread of the block)
+        CorneliusKanade_kernel<T,D><<< gridDim, blockDim, (blockDim.x*blockDim.y)*sizeof(T) >>>
+          ( gradient_image->get_data_ptr(), (stencil_image) ? stencil_image->get_data_ptr() : 0x0,
+            ping->get_data_ptr(), pong->get_data_ptr(), 
+            vector_td<unsigned int,D>(matrix_size), number_of_batches, alpha_, beta_, this->limit_*this->limit_, continue_flag );
+
+        CHECK_FOR_CUDA_ERROR();
+
+        // Swap in/out buffers
+        cuNDArray<T> *tmp = ping;
+        ping = pong;
+        pong = tmp;
+
+        // Check termination criteria
+        if( cudaMemcpy(&_continue_flag, continue_flag, sizeof(unsigned int), cudaMemcpyDeviceToHost) != cudaSuccess ) {
+          throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to evaluate the continuation flag.");
+        }
+
+        if( _continue_flag == 0 ){
+          if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+            std::cout << std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl;
+          }
+          break;
+        }
+
+        // NOTE(review): with '>' the kernel is launched max_num_iterations_per_level_+2 times at most - confirm intended
+        if( iteration_no > this->max_num_iterations_per_level_ ) 
+          break;    
+
+        iteration_no++;
+      }
+    }
+    catch( ... ){
+      cudaFree(continue_flag); // best effort; the original exception is the one that gets reported
+      throw;
+    }
+  
+    if( cudaFree(continue_flag) != cudaSuccess ) {
+      throw std::runtime_error("cuCKOpticalFlowSolver::core_solver(): failed to free continuation flag.");
+    }
+  
+    if( ping == displacements_ping.get() )   
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+
+  // Helpers
+  //
+  
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_for_stride( typename intd<D>::Type stride, typename uintd<D>::Type co, typename uintd<D>::Type dims )
+  {
+    // Returns true if stepping from `co` by `stride` would leave the image in at least one dimension:
+    // a -1 step taken from coordinate 0, or a +1 step taken from the last coordinate dims-1.
+    // Stride components other than -1 and +1 never trigger the test.
+    //
+    for( unsigned int axis=0; axis<D; ++axis ){
+
+      const bool steps_below_first = ( stride.vec[axis] == -1 ) && ( co.vec[axis] == 0 );
+      const bool steps_above_last = ( stride.vec[axis] == 1 ) && ( co.vec[axis] == (dims.vec[axis]-1) );
+
+      if( steps_below_first || steps_above_last )
+        return true;
+    }
+    return false;
+  }
+  
+  template<unsigned int i, unsigned int j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value }; // compile-time i^j through the recursion i^j = i * i^(j-1)
+  };
+  // Recursion anchor: i^1 = i. There is no case for j == 0, so Pow<i,0> must not be instantiated.
+  template <unsigned int i> struct Pow<i,1>
+  {
+    enum { Value = i };
+  };
+  
+  // Cornelius-Kanade / Jacobi iteration
+  //
+
+  template<class REAL, unsigned int D> __global__ void
+  CorneliusKanade_kernel( REAL *gradient_image, REAL *stencil_image,
+                          REAL *in_disp, REAL *out_disp, 
+                          typename uintd<D>::Type matrix_size, unsigned int num_batches,
+                          REAL alpha, REAL beta, REAL disp_thresh_sqr, unsigned int *continue_signal )
+  {  
+    // One thread per (element, flow dimension): threadIdx.y selects the dimension, the other indices select the element
+    // The overall flow dimension corresponding to this thread
+    const unsigned int dim = threadIdx.y;
+
+    // The thread idx relative to the flow dimension
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    // Number of elements per batch
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+  
+    // Number of elements per dim
+    const unsigned int num_elements_per_dim = num_elements_per_batch*num_batches;
+    
+    // We use shared memory to hold the averaged displacements
+    REAL *shared_mem = (REAL*) _shared_mem;
+  
+    //
+    // Find the average velocities (shared memory)
+    //
+  
+    // Batch idx (second slowest varying dimension)   
+    const unsigned int batch_idx = idx/num_elements_per_batch;
+    
+    // Local index to the image (or batch in our terminology)
+    const unsigned int idx_in_batch = idx-batch_idx*num_elements_per_batch;
+
+    // All threads (even out-of-range ones) must reach the synchronization point below
+    //
+
+    bool legal_idx = (idx < num_elements_per_dim);
+    // Elements whose stencil value is positive are skipped: nothing is computed or written for them
+    if( legal_idx && stencil_image && stencil_image[idx_in_batch] > REAL(0) )
+      legal_idx = false;
+
+    if( legal_idx ){
+
+      // Local co to the image
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+    
+      const typename intd<D>::Type zeros (0);
+      const typename intd<D>::Type ones(1);
+      const typename intd<D>::Type threes(3);
+    
+      const int num_neighbors = Pow<3,D>::Value;
+      REAL num_contribs = REAL(0);
+    
+      // Idx local to the shared memory
+      const unsigned int shared_idx = threadIdx.y*blockDim.x+threadIdx.x;
+    
+      shared_mem[shared_idx] = REAL(0);
+    
+      // Compute average of neighbors
+      // (an out-of-bounds neighbor, and the zero stride itself, contribute the centre pixel instead)
+    
+      for( int i=0; i<num_neighbors; i++ ){
+      
+        // Find the stride of the neighbor {-1, 0, 1}^D
+        const typename intd<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+        
+        unsigned int neighbor_idx;
+        
+        const unsigned int base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+        
+        // Verify that the neighbor is not out of bounds (and not the thread itself)
+        if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){	
+          neighbor_idx = (unsigned int) co_to_idx<D>( vector_td<int,D>(co)+stride, vector_td<int,D>(matrix_size)) + base_offset;
+        }
+        else{
+          neighbor_idx = idx_in_batch + base_offset;
+        }
+        
+        shared_mem[shared_idx] += in_disp[neighbor_idx];
+        num_contribs += REAL(1);
+      }
+      
+      // Normalize
+      shared_mem[shared_idx] /= num_contribs;
+    }
+  
+    // Block until all averages have been computed (we need all d dims below)
+    __syncthreads();
+  
+    if( legal_idx ){
+    
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+    
+      REAL phi = REAL(0);
+      REAL norm = REAL(0);
+    
+      typename reald<REAL,D>::Type derivatives;
+    
+      // Contributions from the spatial dimensions
+      //
+      
+      for( unsigned int d=0; d<D; d++ ){
+        derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+        const unsigned int shared_idx = d*blockDim.x+threadIdx.x;
+        phi += (shared_mem[shared_idx]*derivatives.vec[d]);
+        norm += (derivatives.vec[d]*derivatives.vec[d]);
+      }
+      
+      // Contributions from the temporal dimension
+      //
+      
+      phi += gradient_image[D*num_elements_per_dim+idx];
+    
+      // Contribution from the intensity attenuation estimation
+      //
+    
+      phi -= shared_mem[D*blockDim.x+threadIdx.x];
+    
+      // Normalize
+      //
+    
+      phi /= ((alpha/beta)*(alpha/beta)+alpha*alpha+norm);
+    
+      // Form result displacement
+      //
+    
+      const unsigned int shared_idx = dim*blockDim.x+threadIdx.x;
+      REAL result;
+    
+      if( dim<D )
+        result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+      else
+        result = shared_mem[D*blockDim.x+threadIdx.x]+(alpha/beta)*(alpha/beta)*phi;
+      
+      // Set the continuation flag if a flow component (dim < D) has changed by more than the threshold
+      //
+      
+      REAL delta = result-in_disp[dim*num_elements_per_dim+idx];
+      if( dim < D && delta*delta > disp_thresh_sqr )
+        continue_signal[0] = 1;
+      
+      // Output result
+      //
+      
+      out_disp[dim*num_elements_per_dim+idx] = result;
+    }
+  }
+
+  // 
+  // Template instantiation
+  //
+
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,1>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,2>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,3>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<float,4>;
+
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,1>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,2>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,3>;
+  template class EXPORTGPUREG cuCKOpticalFlowSolver<double,4>;  
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.h b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.h
new file mode 100644
index 0000000..f1caafc
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuCKOpticalFlowSolver.h
@@ -0,0 +1,55 @@
+/** \file cuCKOpticalFlowSolver.h
+    \brief GPU-based Cornelius-Kanade optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "cuOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUREG cuCKOpticalFlowSolver 
+    : public cuOpticalFlowSolver<T, D>
+  {
+  
+  public:
+
+    // Construction: alpha starts at 0.05 and beta at 1.0
+    // until they are replaced through the setters below.
+  
+    cuCKOpticalFlowSolver()
+      : cuOpticalFlowSolver<T,D>()
+      , alpha_( T(0.05) )
+      , beta_( T(1.0) ) {}
+  
+    virtual ~cuCKOpticalFlowSolver() {}
+  
+    // Setters for the two regularization weights
+    //
+  
+    inline void set_alpha( T alpha ) { this->alpha_ = alpha; }
+    inline void set_beta( T beta ) { this->beta_ = beta; }
+  
+  protected:  
+    virtual boost::shared_ptr< cuNDArray<T> > 
+      core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image );  
+    
+  protected:
+    T alpha_;
+    T beta_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.cu b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.cu
new file mode 100644
index 0000000..f7bf0ea
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.cu
@@ -0,0 +1,326 @@
+#include "cuHSOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+
+namespace Gadgetron{
+
+  //
+  // Kernel prototype (defined further down). Arguments in order: gradient image, stencil image,
+  // input displacements, output displacements, matrix size, #batches, alpha, squared threshold, continue flag.
+
+  template<class REAL, unsigned int D> __global__ 
+  void HornSchunk_kernel(REAL*,REAL*,REAL*,REAL*,typename uintd<D>::Type,unsigned int,REAL,REAL,unsigned int*);
+
+  //
+  // Dynamically sized shared memory: the byte count is supplied at kernel launch and the
+  // kernel reinterprets the buffer as REAL values (one averaged displacement per thread).
+
+  extern __shared__ char _shared_mem[];
+
+  //
+  // Implementation
+  //
+
+  template<class T, unsigned int D> boost::shared_ptr< cuNDArray<T> >
+  cuHSOpticalFlowSolver<T,D>::core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image )
+  {
+    // Iterates the Horn-Schunck Jacobi kernel on two ping-pong displacement buffers until no
+    // element changes by more than limit_ or the iteration cap is passed; returns the buffer
+    // written last. Throws std::runtime_error on invalid input or on a failing CUDA runtime call.
+    if( !gradient_image ){
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): illegal input gradient image received.");
+    }
+  
+    if( gradient_image->get_number_of_dimensions() <= D ){
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): number of gradient image dimensions is too small.");
+    }
+    
+    // The dimensions of the displacement field should match the gradient field
+    // - when removing the temporal gradient component (replacing D+1 with D)
+    //
+  
+    boost::shared_ptr< std::vector<size_t> > disp_dims = gradient_image->get_dimensions();
+    disp_dims->pop_back(); disp_dims->push_back(D);
+
+    boost::shared_ptr< cuNDArray<T> > displacements_ping(new cuNDArray<T>(disp_dims.get()));
+    boost::shared_ptr< cuNDArray<T> > displacements_pong(new cuNDArray<T>(disp_dims.get()));
+  
+    clear(displacements_ping.get());
+    clear(displacements_pong.get());
+      
+    // Setup grid (2D blocks: threadIdx.y selects the flow dimension, hence D unknowns per element)
+    //
+
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *gradient_image->get_dimensions() );  
+    unsigned int number_of_elements = prod(matrix_size);
+    unsigned int number_of_batches = 1;
+  
+    for( unsigned int d=D; d<gradient_image->get_number_of_dimensions()-1; d++ ){
+      number_of_batches *= gradient_image->get_size(d);
+    }
+  
+    dim3 blockDim; dim3 gridDim;
+    this->setup_grid( &blockDim, &gridDim, number_of_elements, number_of_batches*D, true, D );
+  
+    // Allocate continuation flag (used for early Jacobi termination by the kernel)
+    //
+  
+    unsigned int *continue_flag;
+    if( cudaMalloc((void**)&continue_flag, sizeof(unsigned int) ) != cudaSuccess ) {
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to allocate continuation flag.");
+    }
+  
+    unsigned int iteration_no = 0;
+    cuNDArray<T> *ping = displacements_ping.get();
+    cuNDArray<T> *pong = displacements_pong.get();
+
+    if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+      std::cout << std::endl;
+    }
+
+    // Main Jacobi loop. The try/catch releases the device-side continuation flag when
+    // a CUDA call inside the loop throws; the regular exit path frees it after the loop.
+    //
+    try{
+    while(true){
+    
+      if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+        std::cout << "."; std::cout.flush();
+      }
+    
+      // Clear termination flag
+      //
+    
+      unsigned int _continue_flag = 0;
+      if( cudaMemcpy( continue_flag, &_continue_flag, sizeof(unsigned int), cudaMemcpyHostToDevice ) != cudaSuccess ) {
+        throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to set continuation flag.");
+      }
+    
+      // Invoke kernel
+      // (dynamic shared memory: one T per thread for the averaged displacements)
+    
+      HornSchunk_kernel<T,D><<< gridDim, blockDim, (blockDim.x*blockDim.y)*sizeof(T) >>>
+        ( gradient_image->get_data_ptr(), (stencil_image) ? stencil_image->get_data_ptr() : 0x0,
+          ping->get_data_ptr(), pong->get_data_ptr(),
+          vector_td<unsigned int,D>(matrix_size), number_of_batches, alpha_, this->limit_*this->limit_, continue_flag );
+    
+      CHECK_FOR_CUDA_ERROR();
+
+      // Swap in/out buffers
+      // (after the swap 'ping' is the buffer the kernel has just written)
+    
+      cuNDArray<T> *tmp = ping;
+      ping = pong;
+      pong = tmp;
+
+      // Check termination criteria
+      //
+
+      if( cudaMemcpy(&_continue_flag, continue_flag, sizeof(unsigned int), cudaMemcpyDeviceToHost) != cudaSuccess ) {
+        throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to evaluate the continuation flag.");
+      }
+    
+      if( _continue_flag == 0 ){
+        if( this->output_mode_ >= cuOpticalFlowSolver<T,D>::OUTPUT_VERBOSE ) {
+          std::cout << std::endl << "Break after " << iteration_no+1 << " iterations" << std::endl;
+        }
+        break;
+      }
+      // NOTE(review): with '>' up to max_num_iterations_per_level_+2 sweeps are executed - confirm intent
+      if( iteration_no > this->max_num_iterations_per_level_ ) 
+        break;    
+    
+      iteration_no++;
+    }
+    } catch( ... ){ cudaFree(continue_flag); throw; }
+    if( cudaFree(continue_flag) != cudaSuccess ) {
+      throw std::runtime_error("cuHSOpticalFlowSolver::core_solver(): failed to free continuation flag.");
+    }
+    
+    if( ping == displacements_ping.get() )   
+      return displacements_ping;
+    else
+      return displacements_pong;
+  }
+  
+  // Helpers
+  //
+  
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_for_stride( typename intd<D>::Type stride, typename uintd<D>::Type co, typename uintd<D>::Type dims )
+  {
+    // Reports whether moving from coordinate 'co' by 'stride' would leave the
+    // image of size 'dims' in at least one dimension. Only stride components
+    // of exactly -1 or +1 are tested; any other component never triggers.
+    bool leaves_image = false;
+
+    for( unsigned int d=0; d<D && !leaves_image; d++ ){
+      const bool at_lower_edge = ( co.vec[d] == 0 );
+      const bool at_upper_edge = ( co.vec[d] == (dims.vec[d]-1) );
+
+      leaves_image = ( stride.vec[d] == -1 && at_lower_edge ) ||
+                     ( stride.vec[d] ==  1 && at_upper_edge );
+    }
+    return leaves_image;
+  }
+  
+  // Compile-time integer power: Pow<i,j>::Value == i^j (used for the 3^D neighborhood size).
+  template<unsigned int i, unsigned int j> struct Pow
+  {
+    enum { Value = i*Pow<i,j-1>::Value };
+  };
+  
+  // Recursion anchor i^0 == 1; without it Pow<i,0> would recurse through j-1 wrapping around.
+  template <unsigned int i> struct Pow<i,0>
+  { enum { Value = 1 }; };
+  
+  // Horn-Schunck Jacobi sweep: each thread averages the neighbouring displacements of one element
+  // for one flow dimension (threadIdx.y) into shared memory, then writes the updated displacement
+  // to out_disp. continue_signal[0] is set to 1 when a squared update exceeds disp_thresh_sqr.
+  template<class REAL, unsigned int D> __global__ void
+  HornSchunk_kernel( REAL *gradient_image, REAL *stencil_image,
+                     REAL *in_disp, REAL *out_disp, 
+                     typename uintd<D>::Type matrix_size, unsigned int num_batches,
+                     REAL alpha, REAL disp_thresh_sqr, unsigned int *continue_signal )
+  {  
+    
+    // The overall flow dimension corresponding to this thread
+    const unsigned int dim = threadIdx.y;
+    
+    // The thread idx relative to the flow dimension
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    
+    // Number of elements per batch
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+    
+    // Number of elements per dim
+    const unsigned int num_elements_per_dim = num_elements_per_batch*num_batches;
+    
+    // We use shared memory to hold the averaged displacements
+    REAL *shared_mem = (REAL*) _shared_mem;
+    
+    //
+    // Find the average velocities (shared memory)
+    //
+    
+    // Batch idx (second slowest varying dimension)   
+    const unsigned int batch_idx = idx/num_elements_per_batch;
+    
+    // Local index to the image (or batch in our terminology)
+    const unsigned int idx_in_batch = idx-batch_idx*num_elements_per_batch;
+    
+    // All threads (even out-of-range ones) must reach the synchronization point below
+    // (the second phase reads the averages that the other threadIdx.y rows produced)
+    
+    bool legal_idx = (idx < num_elements_per_dim);
+    // Elements with a positive stencil value are skipped (neither averaged nor written)
+    if( legal_idx && stencil_image && stencil_image[idx_in_batch] > REAL(0) )
+      legal_idx = false;
+    
+    if( legal_idx ){
+      
+      // Local co to the image
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+      
+      const typename intd<D>::Type zeros(0);
+      const typename intd<D>::Type ones(1);
+      const typename intd<D>::Type threes(3);
+      
+      const int num_neighbors = Pow<3,D>::Value;
+      REAL num_contribs = REAL(0);
+      
+      // Idx local to the shared memory
+      const unsigned int shared_idx = threadIdx.y*blockDim.x+threadIdx.x;
+      
+      shared_mem[shared_idx] = REAL(0);
+      
+      for( int i=0; i<num_neighbors; i++ ){
+	
+        // Find the stride of the neighbor {-1, 0, 1}^D
+        const typename intd<D>::Type stride = idx_to_co<D>( i, threes ) - ones;
+	
+        // Verify that the neighbor is not out of bounds (and not the thread itself)
+        if( !is_border_pixel_for_stride<D>( stride, co, matrix_size ) && !(stride==zeros) ){
+	  
+          // Compute average of neighbors
+          // (in_disp holds one block of num_elements_per_dim values per flow dimension)
+	  
+          const unsigned int base_offset = dim*num_elements_per_dim + batch_idx*num_elements_per_batch;
+          const unsigned int neighbor_idx = (unsigned int) co_to_idx<D>( vector_td<int,D>(co)+stride, vector_td<int,D>(matrix_size)) + base_offset;
+	  
+          shared_mem[shared_idx] += in_disp[neighbor_idx];
+          num_contribs += REAL(1);
+        }
+      }
+      // NOTE(review): num_contribs stays 0 if every neighbour is out of bounds (all extents 1) - confirm
+      // Normalize
+      shared_mem[shared_idx] /= num_contribs;       	
+    }
+    
+    // Block until all averages have been computed (we need all d dims below)
+    __syncthreads();
+    
+    if( legal_idx ){
+      
+      //
+      // Update displacement field (Jacobi iteration)
+      //
+      
+      REAL phi = REAL(0);
+      REAL norm = REAL(0);
+      
+      typename reald<REAL,D>::Type derivatives;
+      
+      // Contributions from the spatial dimensions
+      // (shared memory is indexed [flow dimension][threadIdx.x], hence the d*blockDim.x offset)
+      
+      for( unsigned int d=0; d<D; d++ ){
+        derivatives.vec[d] = gradient_image[d*num_elements_per_dim+idx];
+        const unsigned int shared_idx = d*blockDim.x+threadIdx.x;
+        phi += (shared_mem[shared_idx]*derivatives.vec[d]);
+        norm += (derivatives.vec[d]*derivatives.vec[d]);
+      }
+      
+      // Contributions from the temporal dimension
+      // (read from gradient_image at offset D*num_elements_per_dim, after the D spatial derivatives)
+      
+      phi += gradient_image[D*num_elements_per_dim+idx];
+      
+      // Normalize
+      // (alpha*alpha is added to the squared gradient norm in the denominator)
+      
+      phi /= (alpha*alpha+norm);
+      
+      // Form result displacement
+      //
+      
+      const unsigned int shared_idx = dim*blockDim.x+threadIdx.x;
+      REAL result = shared_mem[shared_idx]-derivatives.vec[dim]*phi;
+      
+      // Clear the "termination" flag if the displacement field has changed above the threshold
+      // (every thread that gets here stores the same value 1 to continue_signal[0])
+      
+      REAL delta = result-in_disp[dim*num_elements_per_dim+idx];
+      if( delta*delta > disp_thresh_sqr )
+        continue_signal[0] = 1;
+      
+      // Output result
+      //
+      
+      out_disp[dim*num_elements_per_dim+idx] = result;
+    }
+  }
+  
+  // 
+  // Explicit template instantiations: single and double precision,
+  // each for D = 1..4
+  
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,1>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,2>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,3>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<float,4>;
+  
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,1>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,2>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,3>;
+  template class EXPORTGPUREG cuHSOpticalFlowSolver<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.h b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.h
new file mode 100644
index 0000000..3a5f73f
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuHSOpticalFlowSolver.h
@@ -0,0 +1,52 @@
+/** \file cuHSOpticalFlowSolver.h
+    \brief GPU-based Horn-Schunck optical flow registration solver.
+
+    References to the solver implementation and some usage scenarios can be found in:
+
+    An optimised multi-baseline approach for on-line MR-temperature monitoring on commodity graphics hardware
+    BD de Senneville, KØ Noe, M Ries, M Pedersen, CTW Moonen, TS Sørensen.
+    5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008. pp. 1513-1516.
+
+    Acceleration and validation of optical flow based deformable registration for image-guided radiotherapy.
+    KØ Noe, BD de Senneville, UV Elstrøm, K Tanderup, TS Sørensen.
+    Acta Oncologica 2008; 47(7): 1286-1293.
+
+    Retrospective reconstruction of high temporal resolution cine images from real‐time MRI using iterative motion correction
+    MS Hansen, TS Sørensen, AE Arai, P Kellman.
+    Magnetic Resonance in Medicine 2012; 68(3): 741-750.
+*/
+
+#pragma once
+
+#include "cuOpticalFlowSolver.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUREG cuHSOpticalFlowSolver 
+    : public cuOpticalFlowSolver<T, D>
+  {
+  
+  public:
+
+    // Construction: the regularization weight alpha starts at 0.1
+    //
+  
+    cuHSOpticalFlowSolver()
+      : cuOpticalFlowSolver<T,D>()
+      , alpha_( T(0.1) ) {}
+  
+    virtual ~cuHSOpticalFlowSolver() {}
+  
+    // Replace the regularization weight
+    //
+  
+    inline void set_alpha( T alpha ) { this->alpha_ = alpha; }
+  
+  protected:  
+    virtual boost::shared_ptr< cuNDArray<T> > 
+      core_solver( cuNDArray<T> *gradient_image, cuNDArray<T> *stencil_image );
+    
+  protected:
+    T alpha_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.cu b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.cu
new file mode 100644
index 0000000..8d5acd9
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.cu
@@ -0,0 +1,265 @@
+#include "cuLinearResampleOperator.h"
+#include "cuNDArray_reductions.h"
+#include "cuResampleOperator_macros.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+#include "setup_grid.h"
+
+namespace Gadgetron{
+
+  //
+  // Check if all neighbors required for the linear interpolation exists
+  // 
+
+  template<class REAL, unsigned int D> __device__ 
+  bool is_border_pixel( typename reald<REAL,D>::Type co, typename uintd<D>::Type dims )
+  { // True if, in some dimension of extent > 1, 'co' is negative or not below extent-1
+    bool outside = false;
+    for( unsigned int dim=0; dim<D && !outside; dim++ ){
+      outside = dims[dim] > 1 && ( co[dim] < REAL(0) || co[dim] >= (REAL(dims[dim])-REAL(1)) );
+    }
+    return outside;
+  }
+  
+  template<unsigned int D> static __inline__ __host__ __device__ 
+  unsigned int _get_num_neighbors()
+  { // Neighbour count used by the linear interpolation: 1 << D, i.e. 2^D
+    const unsigned int corners = 1 << D; return corners;
+  }
+
+  template<class T, unsigned int D> unsigned int
+  cuLinearResampleOperator<T,D>::get_num_neighbors()
+  { // Member accessor forwarding to the free helper that the kernels also call
+    const unsigned int count = _get_num_neighbors<D>(); return count;
+  }
+
+  //
+  // Linear interpolation of 'image' (batch 'batch_no') at the real-valued coordinate 'co'.
+  // Returns T(0) when is_border_pixel() reports that not all neighbours exist.
+
+  template<class T, unsigned int D> __device__ 
+  T interpolate( unsigned int batch_no, 
+                 typename reald<typename realType<T>::Type,D>::Type co, 
+                 typename uintd<D>::Type matrix_size, 
+                 T *image )
+  {
+    typedef typename realType<T>::Type REAL;
+
+    // We will only proceed if all neighbours exist
+    //
+
+    if( is_border_pixel<REAL,D>(co, matrix_size) )
+      return T(0);
+
+    // To hold the result
+    //
+
+    T res = T(0);
+
+    // Iterate over all neighbors
+    // (a stride component of 0 selects the floor neighbour, any other value the ceil neighbour)
+
+    const typename uintd<D>::Type twos(2);
+    const unsigned int num_neighbors = _get_num_neighbors<D>();
+  
+    for( unsigned int i=0; i<num_neighbors; i++ ){
+    
+      // Determine image coordinate of current neighbor
+      // (when co is already integral, ceil equals co, so the upper neighbour is moved one step up)
+
+      const typename uintd<D>::Type stride = idx_to_co<D>( i, twos );
+
+      if( weak_greater_equal( stride, matrix_size ) ) continue; // For dimensions of size 1
+
+      typename reald<REAL,D>::Type co_stride;
+
+      for( unsigned int dim=0; dim<D; dim++ ){
+        if( stride.vec[dim] == 0 ){
+          co_stride.vec[dim] = ::floor(co.vec[dim]);
+        }
+        else{
+          co_stride.vec[dim] = ::ceil(co.vec[dim]);
+          if( co_stride.vec[dim] == co.vec[dim] )
+            co_stride.vec[dim] += REAL(1.0);
+        }
+      }
+      
+      // Read corresponding pixel value
+      // (index = position inside the batch + batch_no * number of elements per batch)
+    
+      T image_value = image[co_to_idx<D>(vector_td<unsigned int,D>(co_stride), matrix_size) + batch_no*prod(matrix_size)];
+    
+      // Determine weight
+      // (product over the dimensions of one minus the distance between co and the neighbour)
+
+      REAL weight = REAL(1);
+
+      for( unsigned int dim=0; dim<D; dim++ ){
+
+        if( stride.vec[dim] == 0 ){
+          weight *= (REAL(1.0)-(co.vec[dim]-co_stride.vec[dim]));
+        }
+        else{
+          weight *= (REAL(1.0)-(co_stride.vec[dim]-co.vec[dim]));
+        }
+      }
+      
+      // Accumulate result
+      //
+    
+      res += (weight * image_value);
+    }
+
+    // All done, return result
+    //
+
+    return res;
+  }
+
+  template<class REAL, unsigned int D> __global__ void
+  write_sort_arrays_kernel( typename uintd<D>::Type matrix_size, unsigned int extended_size, REAL *displacements,
+                            unsigned int *sort_keys, unsigned int *sort_values_indices, REAL *sort_values_weights )
+  { // Per element and interpolation neighbour: writes the sort key (source index), the element index and the weight
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int num_elements_mat = prod(matrix_size);
+    const unsigned int num_elements_ext = prod(matrix_size)*extended_size;
+  
+    if( idx < num_elements_ext ){
+
+      const unsigned int batch_no = idx/num_elements_mat;
+      const unsigned int idx_in_batch = idx-batch_no*num_elements_mat;
+    
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co);
+      for( unsigned int dim=0; dim<D; dim++ )
+        co_disp.vec[dim] +=  displacements[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch];
+    
+      // Determine the number of neighbors
+      //
+    
+      const typename uintd<D>::Type twos(2);
+      const unsigned int num_neighbors = _get_num_neighbors<D>();
+
+      // Weights are non-zero only if all neighbors exist. This flag is per element; the
+      // per-neighbor decision is taken inside the loop so that one skipped neighbor (stride
+      // into a dimension of size 1) does not zero the neighbors that follow it.
+      const bool non_zero = !is_border_pixel<REAL,D>(co_disp, matrix_size);
+
+      // Iterate over all neighbors
+      //
+    
+      for( unsigned int i=0; i<num_neighbors; i++ ){
+      
+        // Write out the sort values/indices
+        //
+        
+        sort_values_indices[idx+i*num_elements_ext] = idx;
+        
+        // Determine image coordinate of current neighbor
+        //
+        
+        const typename uintd<D>::Type stride = idx_to_co<D>( i, twos );
+        
+        const bool use_neighbor = non_zero && !weak_greater_equal( stride, matrix_size ); // For dimensions of size 1
+        
+        typename reald<REAL,D>::Type co_stride;
+        
+        if( use_neighbor ){
+          for( unsigned int dim=0; dim<D; dim++ ){
+            if( stride.vec[dim] == 0 ){
+              co_stride.vec[dim] = ::floor(co_disp.vec[dim]);
+            }
+            else{
+              co_stride.vec[dim] = ::ceil(co_disp.vec[dim]);
+              if( co_stride.vec[dim] == co_disp.vec[dim] )
+                co_stride.vec[dim] += REAL(1.0);
+            }
+          }
+          
+          // Write out sort keys (moving image resampling indices).
+          //
+          
+          sort_keys[idx+i*num_elements_ext] = co_to_idx<D>(vector_td<unsigned int,D>(co_stride), matrix_size) + batch_no*num_elements_mat;
+        }
+        else{
+          sort_keys[idx+i*num_elements_ext] = idx; // Could be anything, weight is zero
+        }
+        
+        // Determine weight
+        // (zero for unused neighbors, otherwise the product of the per-dimension linear weights)
+        
+        REAL weight = (use_neighbor) ? REAL(1) : REAL(0);
+        
+        if( use_neighbor ){
+          for( unsigned int dim=0; dim<D; dim++ ){	  
+            if( stride.vec[dim] == 0 ){
+              weight *= (REAL(1.0)-(co_disp.vec[dim]-co_stride.vec[dim])); }
+            else{
+              weight *= (REAL(1.0)-(co_stride.vec[dim]-co_disp.vec[dim])); }
+          }
+        }
+        
+        // Write out the sort values/weights
+        //
+
+        sort_values_weights[idx+i*num_elements_ext] = weight;
+      }
+    }
+  }
+
+  template<class T, unsigned int D> void 
+  cuLinearResampleOperator<T,D>::write_sort_arrays( thrust::device_vector<unsigned int> &sort_keys )
+  { // Launches write_sort_arrays_kernel over the offsets_ field to fill sort_keys, indices_ and weights_
+    typename uint64d<D>::Type dims = from_std_vector<size_t,D>(*this->offsets_->get_dimensions().get());
+    int num_extra_dims = this->offsets_->get_number_of_dimensions()-D;
+
+    // With exactly one extra dimension the batch count is 1; otherwise it is the extent of dimension D
+    unsigned int num_batches = 1;
+    if( num_extra_dims != 1 ) num_batches = this->offsets_->get_size(D);
+  
+    dim3 threads, blocks;
+    setup_grid( prod(dims)*num_batches, &threads, &blocks );
+    
+    write_sort_arrays_kernel<typename realType<T>::Type,D><<< blocks, threads >>>
+      ( vector_td<unsigned int,D>(dims), num_batches, this->offsets_->get_data_ptr(),
+        raw_pointer_cast(&(sort_keys[0])), raw_pointer_cast(&(this->indices_)[0]), raw_pointer_cast(&(this->weights_)[0]) );
+    CHECK_FOR_CUDA_ERROR();
+  };
+  
+  // This macro is a workaround for CUDA's missing support for pure virtual functions.
+  // It defines mult_M and mult_MH and is intended to be shared among all classes derived from cuResampleOperator.
+  //
+  // 'cu' is automatically prepended to the macro argument (a workaround for the workaround).
+  //
+  
+  DECLARE_CU_RESAMPLE_OPERATOR_SUPPORT(LinearResampleOperator)
+  
+  // 
+  // Explicit instantiations: real and complex element types in single and
+  // double precision, each for D = 1..4
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,1>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,1>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,2>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,2>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,3>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,3>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<float,4>;
+  template class EXPORTGPUREG cuLinearResampleOperator<float_complext,4>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,1>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,1>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,2>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,2>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,3>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,3>;
+
+  template class EXPORTGPUREG cuLinearResampleOperator<double,4>;
+  template class EXPORTGPUREG cuLinearResampleOperator<double_complext,4>;
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.h b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.h
new file mode 100644
index 0000000..371806e
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuLinearResampleOperator.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "cuResampleOperator.h"
+
+namespace Gadgetron{
+
+  template <class T, unsigned int D>
+  class EXPORTGPUREG cuLinearResampleOperator : public cuResampleOperator<T,D>
+  {  
+  public:
+    // Resampling operator specialising cuResampleOperator for linear interpolation.
+    cuLinearResampleOperator() : cuResampleOperator<T,D>() {}
+    virtual ~cuLinearResampleOperator() {}
+    // mult_M / mult_MH are defined in the .cu file through DECLARE_CU_RESAMPLE_OPERATOR_SUPPORT.
+    virtual void mult_M( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate = false);
+    virtual void mult_MH( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate = false);
+    // Delegates to linearOperator< cuNDArray<T> >::clone(this).
+    virtual boost::shared_ptr< linearOperator< cuNDArray<T> > > clone() {
+      return linearOperator< cuNDArray<T> >::clone(this);
+    }
+    // Hooks implemented in cuLinearResampleOperator.cu: neighbour count and sort-array generation.
+  protected:
+    virtual unsigned int get_num_neighbors();
+    virtual void write_sort_arrays( thrust::device_vector<unsigned int> &sort_keys );
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.cu b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.cu
new file mode 100644
index 0000000..da4aebf
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.cu
@@ -0,0 +1,303 @@
+#include "cuOpticalFlowSolver.h"
+#include "vector_td_utilities.h"
+#include "check_CUDA.h"
+
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  //
+  // Kernel prototypes. Arguments in order: fixed image, moving image, gradient image,
+  // matrix size, number of fixed batches, number of moving batches.
+
+  template<class REAL, unsigned int D> __global__ 
+  void spatial_grad_kernel(REAL*,REAL*,REAL*,typename uint64d<D>::Type,unsigned int,unsigned int);
+
+  template<class REAL, unsigned int D> __global__ 
+  void temporal_grad_kernel(REAL*,REAL*,REAL*,typename uint64d<D>::Type,unsigned int,unsigned int);
+
+  // There is some issue about Cuda defining min/max incompatibly...
+  //
+
+  template <class T> __host__ __device__ const T& _cuOF_max (const T& a, const T& b) {
+    if( a<b ) return b; else return a; // yields b only when a<b, otherwise a
+  }
+
+  template <class T> __host__ __device__ const T& _cuOF_min (const T& a, const T& b) {
+    if( a>b ) return b; else return a; // yields b only when a>b, otherwise a
+  }
+
+  template<class T, unsigned int D> void
+  cuOpticalFlowSolver<T,D>::setup_grid( dim3 *blockDim, dim3* gridDim, 
+					   unsigned int number_of_elements, 
+					   unsigned int num_batches, 
+					   bool use_2d_blocks, 
+					   unsigned int num_unknowns )
+  {
+    // Picks launch dimensions covering number_of_elements (batches go to gridDim.y; 2D blocks use
+    // blockDim.y = num_unknowns). Throws std::runtime_error on query failure or if no layout fits.
+    int device; cudaDeviceProp deviceProp;
+    if( cudaGetDevice( &device ) != cudaSuccess) {
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): unable to determine current device");
+    }
+    if( cudaGetDeviceProperties( &deviceProp, device ) != cudaSuccess) {
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): unable to query current device");
+    }
+    const unsigned int max_blockdim  = deviceProp.maxThreadsDim[0];
+    const unsigned int max_griddim_x = deviceProp.maxGridSize[0];
+    const unsigned int max_griddim_y = deviceProp.maxGridSize[1]; // may be smaller than the x limit
+    const unsigned int warp_size     = deviceProp.warpSize;
+    // A 2D block needs at least one full warp in x; otherwise blockDim.x would be zero and the
+    // grid size computations below would divide by zero.
+    if( use_2d_blocks && ( num_unknowns == 0 || (256/num_unknowns) < warp_size ) ){
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): unsupported number of unknowns");
+    }
+    // For small arrays we keep the block dimension fairly small
+    if( use_2d_blocks )
+      *blockDim = dim3(((256/num_unknowns)/warp_size)*warp_size, num_unknowns);
+    else
+      *blockDim = dim3(256);
+  
+    *gridDim = dim3((number_of_elements+(blockDim->x*blockDim->y)-1)/(blockDim->x*blockDim->y), num_batches);
+    // Extend block/grid dimensions for large arrays
+    if( gridDim->x > max_griddim_x ){
+      if( use_2d_blocks )
+        blockDim->x = ((max_blockdim/num_unknowns)/warp_size)*warp_size;
+      else
+        blockDim->x = max_blockdim;
+    
+      gridDim->x = (number_of_elements+(blockDim->x*blockDim->y)-1)/(blockDim->x*blockDim->y);
+    }
+    if( gridDim->x > max_griddim_x ){ // still too wide: fold the surplus blocks into gridDim.y
+      gridDim->x = ((unsigned int)std::sqrt((T)number_of_elements)+(blockDim->x*blockDim->y)-1)/(blockDim->x*blockDim->y);
+      gridDim->y *= ((number_of_elements+(blockDim->x*blockDim->y)*gridDim->x-1)/((blockDim->x*blockDim->y)*gridDim->x));
+    }
+    if( gridDim->x > max_griddim_x || gridDim->y > max_griddim_y ){      
+      throw std::runtime_error("cuOpticalFlowSolver::setup_grid(): maximum grid dimensions exceeded");
+    }
+  }
+  
+  template<class T, unsigned int D> void
+  cuOpticalFlowSolver<T,D>::core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+						  typename uint64d<D>::Type matrix_size_moving, 
+						  size_t number_of_batches_fixed, 
+						  size_t number_of_batches_moving )
+  {        
+    // Batch count handed to setup_grid: the larger of the two batch counts times D
+    const size_t num_grid_batches = _cuOF_max(number_of_batches_moving, number_of_batches_fixed)*D;
+    const unsigned int elements_per_image = prod(matrix_size_moving);
+    dim3 threads; dim3 blocks;
+    setup_grid( &threads, &blocks, elements_per_image, num_grid_batches );
+    
+    // Launch the kernel computing the spatial partial derivatives
+    spatial_grad_kernel<T,D><<< blocks, threads >>>
+      ( fixed_image, moving_image, gradient_image, matrix_size_moving, number_of_batches_fixed, number_of_batches_moving );
+    CHECK_FOR_CUDA_ERROR();
+  }
+  
+  template<class T, unsigned int D> void
+  cuOpticalFlowSolver<T,D>::core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+						   typename uint64d<D>::Type matrix_size_moving, 
+						   size_t number_of_batches_fixed, 
+						   size_t number_of_batches_moving )
+  {        
+    // Batch count handed to setup_grid: the larger of the two batch counts
+    const size_t num_grid_batches = _cuOF_max(number_of_batches_moving, number_of_batches_fixed);
+    const unsigned int elements_per_image = prod(matrix_size_moving);
+
+    dim3 threads; dim3 blocks;
+    setup_grid( &threads, &blocks, elements_per_image, num_grid_batches );
+    
+    // Launch the kernel computing the temporal partial derivative
+    temporal_grad_kernel<T,D><<< blocks, threads >>>
+      ( fixed_image, moving_image, gradient_image, matrix_size_moving, number_of_batches_fixed, number_of_batches_moving );
+    CHECK_FOR_CUDA_ERROR();
+  }
+  
+  // Helpers
+  //
+
+  template<unsigned int D> __device__ 
+  typename uint64d<D>::Type compute_stride( unsigned int dim )
+  {
+    // Unit step along 'dim': component 'dim' is 1, every other component is 0
+    typename uint64d<D>::Type unit_step;
+    for( unsigned int d=0; d<D; d++ )
+      if( d==dim ) unit_step.vec[d] = 1; else unit_step.vec[d] = 0;
+
+    return unit_step;
+  }
+
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_in_stride_dim_before( unsigned int dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    // True when 'co' sits at index 0 along 'dim', i.e. there is no neighbour
+    // on the "minus" side. 'dims' is not consulted for this side.
+    const bool on_lower_edge = ( co.vec[dim] == 0 );
+    return on_lower_edge;
+  }
+
+  template<unsigned int D> __device__ 
+  bool is_border_pixel_in_stride_dim_after( unsigned int dim, typename uint64d<D>::Type co, typename uint64d<D>::Type dims )
+  {
+    // True when 'co' sits at the last index dims[dim]-1 along 'dim', i.e. there
+    // is no neighbour on the "plus" side.
+    const bool on_upper_edge = ( co.vec[dim] == (dims.vec[dim]-1) );
+    return on_upper_edge;
+  }
+
+  // Spatial partial derivatives: one thread per element of gradient_image, which holds the
+  // D partial derivatives one after another. Each value is a central difference (one-sided
+  // at the image border) of the average of the fixed and the moving image.
+  template<class REAL, unsigned int D> __global__ void
+  spatial_grad_kernel( REAL *fixed_image, REAL *moving_image, REAL *gradient_image, 
+                       typename uint64d<D>::Type matrix_size, 
+                       unsigned int num_batches_fixed, unsigned int num_batches_moving )
+  {
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    // Number of elements per partial derivative
+    const unsigned int num_elements_per_batch = prod(matrix_size);
+    const unsigned int num_elements_per_pdev_fixed = num_elements_per_batch*num_batches_fixed;
+    const unsigned int num_elements_per_pdev_moving = num_elements_per_batch*num_batches_moving;
+
+    // Total number of elements for all partial derivatives
+    const unsigned int num_elements_total = _cuOF_max(num_elements_per_pdev_fixed, num_elements_per_pdev_moving)*D;
+
+    if( idx < num_elements_total ){
+
+      // The (minimum) index in the slowest varying output dimension determines which partial derivative to compute
+      const unsigned int stride_dim_fixed = idx/(num_elements_per_pdev_fixed);
+      const unsigned int stride_dim_moving = idx/(num_elements_per_pdev_moving);
+      const unsigned int stride_dim = _cuOF_min(stride_dim_fixed, stride_dim_moving);
+
+      // Local index to the partial derivative (wraps idx for the image with fewer batches)
+      const unsigned int idx_in_pdev_fixed = idx-stride_dim_fixed*num_elements_per_pdev_fixed;
+      const unsigned int idx_in_pdev_moving = idx-stride_dim_moving*num_elements_per_pdev_moving;
+
+      // Batch idx (second slowest varying dimension)
+      const unsigned int batch_idx_fixed = idx_in_pdev_fixed/num_elements_per_batch;
+      const unsigned int batch_idx_moving = idx_in_pdev_moving/num_elements_per_batch;
+
+      // Local index to the batch (should be identical for the fixed/moving image)
+      const unsigned int idx_in_batch = idx_in_pdev_moving-batch_idx_moving*num_elements_per_batch;
+
+      // Local co to the image
+      const typename uint64d<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size );
+
+      REAL res;
+      unsigned int count = 0; // number of neighbors found along stride_dim (0, 1 or 2)
+
+      //
+      // Find partial derivatives using central differences
+      //
+
+      typename uint64d<D>::Type stride = compute_stride<D>(stride_dim);
+
+      const unsigned int base_idx_moving = batch_idx_moving*num_elements_per_batch;
+      const unsigned int base_idx_fixed = batch_idx_fixed*num_elements_per_batch;
+
+      unsigned int stride_base_idx, fixed_idx, moving_idx;
+
+      // Neighbor "plus stride" side (the pixel itself stands in at the border)
+      if( !is_border_pixel_in_stride_dim_after<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co+stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = idx_in_batch;
+      }
+
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+
+      res = (fixed_image[fixed_idx]+moving_image[moving_idx])*REAL(0.5);
+
+      // Neighbor "minus stride" side (same border fallback as above, without re-deriving the index from co)
+      if( !is_border_pixel_in_stride_dim_before<D>( stride_dim, co, matrix_size )){
+        stride_base_idx = co_to_idx<D>(co-stride, matrix_size);
+        count++;
+      }
+      else{
+        stride_base_idx = idx_in_batch;
+      }
+
+      fixed_idx = stride_base_idx+base_idx_fixed;
+      moving_idx = stride_base_idx+base_idx_moving;
+
+      res -= (fixed_image[fixed_idx]+moving_image[moving_idx])*REAL(0.5);
+
+      if( count == 2 ) // Both neighbors exist: the difference spans two pixels
+        res /= REAL(2);
+
+      // Output result
+      //
+
+      gradient_image[idx] = res;
+    }
+  }
+
+  // Temporal partial derivative: every thread writes moving - fixed for one output element.
+  // An image with fewer elements than the output is addressed by wrapping the global index.
+
+  template<class REAL, unsigned int D> __global__ void
+  temporal_grad_kernel( REAL *fixed_image, REAL *moving_image, REAL *gradient_image, 
+                        typename uint64d<D>::Type matrix_size, 
+                        unsigned int num_batches_fixed, unsigned int num_batches_moving )
+  {
+    const unsigned int tid = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x;
+
+    // Element counts: one image, all fixed batches, all moving batches
+    const unsigned int batch_len = prod(matrix_size);
+    const unsigned int fixed_len = batch_len*num_batches_fixed;
+    const unsigned int moving_len = batch_len*num_batches_moving;
+
+    // The output is as long as the larger of the two inputs
+    const unsigned int output_len = _cuOF_max(fixed_len, moving_len);
+
+    // Threads beyond the output have nothing to do
+    if( tid >= output_len )
+      return;
+
+    // Number of whole passes over each input that precede tid
+    const unsigned int fixed_wraps = tid/fixed_len;
+    const unsigned int moving_wraps = tid/moving_len;
+
+    // Index wrapped back into each input
+    const unsigned int fixed_wrapped = tid-fixed_wraps*fixed_len;
+    const unsigned int moving_wrapped = tid-moving_wraps*moving_len;
+
+    // Batch holding the wrapped index
+    const unsigned int fixed_batch = fixed_wrapped/batch_len;
+    const unsigned int moving_batch = moving_wrapped/batch_len;
+
+    // Offset inside the batch (taken from the moving image; expected to match the fixed one)
+    const unsigned int in_batch = moving_wrapped-moving_batch*batch_len;
+
+    // First element of the selected batch in each input
+    const unsigned int fixed_base = fixed_batch*batch_len;
+    const unsigned int moving_base = moving_batch*batch_len;
+
+    // Center pixel of both images
+    const unsigned int fixed_at = in_batch+fixed_base;
+    const unsigned int moving_at = in_batch+moving_base;
+
+    // Output result
+    //
+    gradient_image[tid] = moving_image[moving_at]-fixed_image[fixed_at];
+  }
+
+  //
+  // Explicit template instantiations: float for D = 1..4,
+  // then double for D = 1..4.
+  //
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,1>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,2>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,3>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<float,4>;
+
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,1>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,2>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,3>;
+  template class EXPORTGPUREG cuOpticalFlowSolver<double,4>;
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.h b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.h
new file mode 100644
index 0000000..0ee98cf
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuOpticalFlowSolver.h
@@ -0,0 +1,50 @@
+/** \file cuOpticalFlowSolver.h
+    \brief Abstract class for a GPU-based optical flow registration solver.
+
+    cuOpticalFlowSolver is derived from class opticalFlowSolver 
+    and implements the computation of the spatial and temporal gradients.
+    A pure virtual function is expected to implement the specific algorithm (Horn-Schunck, Cornelius-Kanade).
+*/
+
+#pragma once
+
+#include "cuNDArray.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "cuNDArray_blas.h"
+#include "opticalFlowSolver.h"
+#include "gpureg_export.h"
+
+namespace Gadgetron{
+
+  template<class T, unsigned int D> class EXPORTGPUREG cuOpticalFlowSolver 
+    : public opticalFlowSolver< cuNDArray<T>,D >
+  {  
+  public:
+    // Adds no data members of its own; construction just forwards to the base class.
+    cuOpticalFlowSolver() : opticalFlowSolver< cuNDArray<T>,D >() {}   
+    virtual ~cuOpticalFlowSolver() {}
+    
+  protected:
+
+    // General tool to set up the block/grid dimensions.
+    // NOTE(review): blockDim/gridDim look like output parameters and num_unknowns
+    // defaults to D; the layout policy lives in the .cu file - confirm there.
+    void setup_grid( dim3 *blockDim, dim3* gridDim, unsigned int number_of_elements, 
+                     unsigned int num_batches = 1, bool use_2d_blocks = false, unsigned int num_unknowns = D);  
+ 
+    // GPU-based computation of the spatial and temporal image gradient.
+    // NOTE(review): the raw image pointers presumably refer to device memory, and the
+    // batch counts are size_t here - confirm against the kernels these functions launch.
+    virtual void core_grad_spatial( T *fixed_image, T *moving_image, T *gradient_image, 
+                                    typename uint64d<D>::Type matrix_size_moving, 
+                                    size_t number_of_batches_fixed, 
+                                    size_t number_of_batches_moving );
+    
+    virtual void core_grad_temporal( T *fixed_image, T *moving_image, T *gradient_image, 
+                                     typename uint64d<D>::Type matrix_size_moving, 
+                                     size_t number_of_batches_fixed, 
+                                     size_t number_of_batches_moving );
+  };  
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuResampleOperator.cu b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.cu
new file mode 100644
index 0000000..a2731c1
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.cu
@@ -0,0 +1,107 @@
+#include "cuResampleOperator.h"
+
+#include <thrust/host_vector.h>
+#include <thrust/generate.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h> 
+#include <thrust/binary_search.h>
+#include <thrust/iterator/counting_iterator.h>
+
+using namespace thrust;
+
+namespace Gadgetron{
+
+  // Precomputes the tables consumed by mult_MH: indices_ and weights_ ordered by sort key,
+  // plus one [lower_bounds_, upper_bounds_) range into them per displacement vector.
+  // preprocessed_ stays false unless every step below completes.
+  template<class T, unsigned int D> void
+  cuResampleOperator<T,D>::mult_MH_preprocess()
+  {
+    this->preprocessed_ = false;
+
+    // Check if a displacement field has been provided
+    //
+
+    if( !this->offsets_.get() ){
+      throw cuda_error("cuResampleOperator::mult_MH_preprocess(): displacement field not set.");
+    }
+
+    // Count the displacement values: D components times the product of all dimensions of
+    // the field except its last one. The field is only measured here, never copied;
+    // write_sort_arrays() is handed the key array alone.
+
+    std::vector<size_t> _dims_disp = *this->offsets_->get_dimensions(); _dims_disp.pop_back();
+    unsigned int num_elements_disp = D;
+    while(!_dims_disp.empty()){
+      num_elements_disp *= _dims_disp.back();
+      _dims_disp.pop_back();
+    }
+
+    // Make sort keys/values array from the deformation field
+    //
+
+    unsigned int num_elements_sort = num_elements_disp/D;
+
+    this->lower_bounds_ = device_vector<unsigned int>(num_elements_sort);
+    this->upper_bounds_ = device_vector<unsigned int>(num_elements_sort);
+
+    this->indices_ = device_vector<unsigned int>(get_num_neighbors()*num_elements_sort);
+    this->weights_ = device_vector<REAL>(get_num_neighbors()*num_elements_sort);
+
+    device_vector<unsigned int> sort_keys = device_vector<unsigned int>
+      (get_num_neighbors()*num_elements_sort);
+
+    // Fill arrays (implemented by the derived class)
+    //
+
+    write_sort_arrays(sort_keys);
+
+    // Make copy of sort_keys before the sort modifies it
+    //
+
+    device_vector<unsigned int> sort_keys_copy(sort_keys);
+
+    // Sort (twice since we have two value arrays). Both passes must apply the same permutation
+    // so indices_[i] and weights_[i] stay paired; only a stable sort guarantees that for equal keys.
+
+    stable_sort_by_key(sort_keys.begin(), sort_keys.end(), this->indices_.begin() );
+    stable_sort_by_key(sort_keys_copy.begin(), sort_keys_copy.end(), this->weights_.begin() );
+
+    // Find start/end indices (buckets) in the two values arrays
+    //
+
+    counting_iterator<unsigned int> search_begin(0);
+
+    lower_bound( sort_keys.begin(), sort_keys.end(),
+                 search_begin, search_begin + num_elements_sort, this->lower_bounds_.begin() );
+
+    upper_bound( sort_keys.begin(), sort_keys.end(),
+                 search_begin, search_begin + num_elements_sort, this->upper_bounds_.begin() );
+
+    this->preprocessed_ = true;
+  }
+
+  template class EXPORTGPUREG cuResampleOperator<float,1>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,1>;
+
+  template class EXPORTGPUREG cuResampleOperator<float,2>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,2>;
+
+  template class EXPORTGPUREG cuResampleOperator<float,3>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,3>;
+
+  template class EXPORTGPUREG cuResampleOperator<float,4>;
+  template class EXPORTGPUREG cuResampleOperator<float_complext,4>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,1>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,1>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,2>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,2>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,3>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,3>;
+
+  template class EXPORTGPUREG cuResampleOperator<double,4>;
+  template class EXPORTGPUREG cuResampleOperator<double_complext,4>;
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuResampleOperator.h b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.h
new file mode 100644
index 0000000..e73f1be
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuResampleOperator.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "cuNDArray_math.h"
+#include "resampleOperator.h"
+#include "gpureg_export.h"
+
+#include <thrust/device_vector.h>
+
+namespace Gadgetron{
+
+  // GPU resampling operator base: owns the lookup tables built by mult_MH_preprocess().
+  // Derived classes supply the neighbor count and fill the sort arrays.
+  template <class T, unsigned int D>
+  class EXPORTGPUREG cuResampleOperator : public resampleOperator< cuNDArray<typename realType<T>::Type>, cuNDArray<T> >
+  {
+  public:
+    typedef typename realType<T>::Type REAL;
+    cuResampleOperator() : resampleOperator< cuNDArray<REAL>, cuNDArray<T> >() {}
+    virtual ~cuResampleOperator() {}
+
+    // Replaces every lookup table with an empty vector, then resets the base class.
+    virtual void reset()
+    {
+      typedef thrust::device_vector<unsigned int> uint_table;
+      lower_bounds_ = uint_table();
+      upper_bounds_ = uint_table();
+      indices_ = uint_table();
+      weights_ = thrust::device_vector<REAL>();
+      resampleOperator< cuNDArray<REAL>, cuNDArray<T> >::reset();
+    }
+
+    virtual void mult_MH_preprocess();
+
+  protected:
+    virtual unsigned int get_num_neighbors() = 0;
+    virtual void write_sort_arrays( thrust::device_vector<unsigned int> &sort_keys ) = 0;
+    thrust::device_vector<unsigned int> lower_bounds_;
+    thrust::device_vector<unsigned int> upper_bounds_;
+    thrust::device_vector<unsigned int> indices_;
+    thrust::device_vector<REAL> weights_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/gpu/cuResampleOperator_macros.h b/toolboxes/registration/optical_flow/gpu/cuResampleOperator_macros.h
new file mode 100644
index 0000000..26072e8
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/cuResampleOperator_macros.h
@@ -0,0 +1,248 @@
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_utils.h"
+#include "complext.h"
+
+/* 
+   This macro definition is a workaround
+   for the missing support for pure virtual device functions in CUDA.
+   
+   We provide this macro to avoid explicitly duplicating 
+   the code below in every "cuResampleOperator-inherited" class.
+*/
+
+#define DECLARE_CU_RESAMPLE_OPERATOR_SUPPORT(COMPONENT)                 \
+  /* Batch kernel: one thread per output element; all batches share one displacement field */ \
+  template<class T, unsigned int D> __global__ void                     \
+  mult_M_kernel_batch( T *in, T *out,                                   \
+                       typename realType<T>::Type *displacements,       \
+                       typename uintd<D>::Type matrix_size, unsigned int num_batches ) \
+  {                                                                     \
+    typedef typename realType<T>::Type REAL;                            \
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x; \
+    const unsigned int num_elements = prod(matrix_size);                \
+                                                                        \
+    if( idx < num_elements*num_batches ){                               \
+      /* Split the flat index into a batch number and a position inside that batch */ \
+      const unsigned int batch_no = idx/num_elements;                   \
+      const unsigned int idx_in_batch = idx-batch_no*num_elements;      \
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size ); \
+      /* Sample position = pixel coordinate + displacement, stored one dimension after another */ \
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co); \
+      for( unsigned int dim=0; dim<D; dim++ )                           \
+        co_disp.vec[dim] +=  displacements[dim*num_elements+idx_in_batch]; \
+      /* NOTE(review): interpolate<T,D>() is not defined here; presumably supplied where the macro is expanded */ \
+      out[idx] = interpolate<T,D>( batch_no, co_disp, matrix_size, in ); \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> __global__ void                     \
+  mult_M_kernel_extended( T *in, T *out,                                \
+                          typename realType<T>::Type *displacements,    \
+                          typename uintd<D>::Type matrix_size,          \
+                          unsigned int num_elements_in,                 \
+                          unsigned int extended_size )                  \
+  {                                                                     \
+    typedef typename realType<T>::Type REAL;                            \
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x; \
+    const unsigned int num_elements_mat = prod(matrix_size);            \
+    const unsigned int num_elements_ext = prod(matrix_size)*extended_size; \
+    /* Extended kernel: the output spans extended_size images, each with its own displacement vectors */ \
+    if( idx < num_elements_ext ){                                       \
+                                                                        \
+      const unsigned int batch_no = idx/num_elements_mat;               \
+      const unsigned int idx_in_batch = idx-batch_no*num_elements_mat;	\
+                                                                        \
+      const typename uintd<D>::Type co = idx_to_co<D>( idx_in_batch, matrix_size ); \
+      /* Displacements are laid out per dimension, then per output image, then per pixel */ \
+      typename reald<REAL,D>::Type co_disp = vector_td<REAL,D>(co); \
+      for( unsigned int dim=0; dim<D; dim++ )                           \
+        co_disp.vec[dim] +=  displacements[dim*num_elements_ext+batch_no*num_elements_mat+idx_in_batch]; \
+      /* Output elements beyond the input length read from input image 0 */ \
+      out[idx] = interpolate<T,D>( (idx >= num_elements_in) ? 0 : batch_no, \
+                                   co_disp, matrix_size, in );          \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> void                                \
+  cu##COMPONENT<T,D>::mult_M( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate ) \
+  {                                                                     \
+    if( !in || !out ){                                                  \
+      throw cuda_error("cuResampleOperator::mult_M(): illegal input/output array."); \
+    }                                                                   \
+                                                                        \
+    if( !this->offsets_.get() ){                                        \
+      throw cuda_error("cuResampleOperator::mult_M(): displacement field not set."); \
+    }                                                                   \
+    /* When accumulating, keep a copy of the current output so it can be added back at the end */ \
+    cuNDArray<T> tmp;                                                   \
+    if( accumulate ){                                                   \
+      tmp = *out;                                                       \
+    }                                                                   \
+    /* surplus = number of displacement field dimensions beyond the D image dimensions (1 or 2 accepted) */ \
+    unsigned int num_disp_vectors = this->get_number_of_displacement_vectors(); \
+    int surplus = this->offsets_->get_number_of_dimensions()-D;         \
+                                                                        \
+    if( !( surplus == 1 || surplus == 2) || this->offsets_->get_size(D-1+surplus) < D ){ \
+      throw cuda_error("cuResampleOperator::mult_M(): unexpected dimensions of displacement field."); \
+    }                                                                   \
+    /* surplus == 1: input and output must have the same number of elements */ \
+    if( surplus == 1 ){                                                 \
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){ \
+        throw cuda_error("cuResampleOperator::mult_M(): in/out array dimensions mismatch (1)."); \
+      }                                                                 \
+      if( (in->get_number_of_elements() % num_disp_vectors ) != 0 ){    \
+        throw cuda_error("cuResampleOperator::mult_M(): in/out array dimensions mismatch displacement field."); \
+      }                                                                 \
+    }                                                                   \
+    /* surplus == 2: the output has D+1 dimensions and one element per displacement vector */ \
+    if( surplus == 2 ){                                                 \
+      if( (out->get_number_of_elements() % in->get_number_of_elements()) != 0 ){ \
+        throw cuda_error("cuResampleOperator::mult_M(): in/out array dimensions mismatch (2)."); \
+      }                                                                 \
+      if( out->get_number_of_dimensions() != (D+1) || out->get_number_of_elements() != num_disp_vectors ){ \
+        throw cuda_error("cuResampleOperator::mult_M(): output array dimensions mismatch displacement field."); \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>(*in->get_dimensions().get()); \
+    unsigned int num_elements_mat = prod(matrix_size);                  \
+    unsigned int num_batches = (surplus == 2) ? 1 : in->get_number_of_elements() / num_elements_mat; \
+    unsigned int extended_dim = (surplus == 1) ? 1 : out->get_size(D);	\
+                                                                        \
+    dim3 blockDim, gridDim;                                             \
+                                                                        \
+    if( surplus == 1 ){                                                 \
+      setup_grid( num_elements_mat, &blockDim, &gridDim, num_batches ); \
+    }                                                                   \
+    else{                                                               \
+      setup_grid( num_elements_mat*extended_dim, &blockDim, &gridDim ); \
+    }                                                                   \
+    /* Launch the batch kernel for surplus == 1 and the extended kernel otherwise */ \
+    if( surplus == 1 ) {                                                \
+      mult_M_kernel_batch<T,D><<< gridDim, blockDim >>>                 \
+        ( in->get_data_ptr(), out->get_data_ptr(),                      \
+          this->offsets_->get_data_ptr(), vector_td<unsigned int,D>(matrix_size), num_batches ); \
+    }                                                                   \
+    else{                                                               \
+      mult_M_kernel_extended<T,D><<< gridDim, blockDim >>>              \
+        ( in->get_data_ptr(), out->get_data_ptr(), this->offsets_->get_data_ptr(), \
+          vector_td<unsigned int,D>(matrix_size), in->get_number_of_elements(), extended_dim ); \
+    }                                                                   \
+                                                                        \
+    CHECK_FOR_CUDA_ERROR();                                             \
+                                                                        \
+    if( accumulate ){                                                   \
+      *out += tmp;                                                      \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> __global__ void                     \
+  mult_MH_kernel( T *in, T *out, typename realType<T>::Type *weights,   \
+                  unsigned int *indices, unsigned int *lower_bounds, unsigned int *upper_bounds, \
+                  unsigned int num_elements, unsigned int num_batches ) \
+  {                                                                     \
+    typedef typename realType<T>::Type REAL;                            \
+    const unsigned int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x+threadIdx.x; \
+    /* Each output element accumulates in[indices[i]]*weights[i] for i in [lower_bound, upper_bound) */ \
+    if( idx < num_elements*num_batches ){                               \
+                                                                        \
+      const unsigned int batch_no = idx/num_elements;                   \
+      const unsigned int idx_in_batch = idx-batch_no*num_elements;      \
+      /* The range tables are indexed per pixel, so every batch reuses the same ranges */ \
+      const unsigned int lower_bound = lower_bounds[idx_in_batch];      \
+      const unsigned int upper_bound = upper_bounds[idx_in_batch];      \
+                                                                        \
+      T val = T(0);                                                     \
+      /* An inverted range, or a bound at or past the end of the sorted arrays, yields a zero output */ \
+      if( lower_bound > upper_bound ||                                  \
+          lower_bound >= (_get_num_neighbors<D>()*num_elements) ||      \
+          upper_bound >= (_get_num_neighbors<D>()*num_elements) ){      \
+                                                                        \
+        out[idx] = T(0);                                                \
+        return;                                                         \
+      }                                                                 \
+      /* NOTE(review): an out-of-range index resets the running sum instead of only skipping the entry - confirm intent */ \
+      for( unsigned int i=lower_bound; i<upper_bound; i++ ){            \
+        unsigned int in_idx = indices[i];                               \
+        if( in_idx >= num_elements ){                                   \
+          val = T(0);                                                   \
+          continue;                                                     \
+        }                                                               \
+        REAL weight = weights[i];                                       \
+        val += (in[in_idx+batch_no*num_elements]*weight);               \
+      }                                                                 \
+      out[idx] = val;                                                   \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  template<class T, unsigned int D> void                                \
+  cu##COMPONENT<T,D>::mult_MH( cuNDArray<T> *in, cuNDArray<T> *out, bool accumulate ) \
+  {                                                                     \
+    if( !in || !out ){                                                  \
+      throw cuda_error("cuResampleOperator::mult_MH(): illegal input/output array."); \
+    }                                                                   \
+                                                                        \
+    if( !this->preprocessed_ ){                                         \
+      throw cuda_error("cuResampleOperator::mult_MH(): preprocessing has not been performed."); \
+    }                                                                   \
+    /* When accumulating, keep a copy of the current output so it can be added back at the end */ \
+    cuNDArray<T> tmp;                                                   \
+    if( accumulate ){                                                   \
+      tmp = *out;                                                       \
+    }                                                                   \
+    /* surplus = number of displacement field dimensions beyond the D image dimensions */ \
+    unsigned int num_disp_vectors = this->get_number_of_displacement_vectors(); \
+    int surplus = this->offsets_->get_number_of_dimensions()-D;         \
+                                                                        \
+    if( surplus == 1 ){                                                 \
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){ \
+        throw cuda_error("cuResampleOperator::mult_MH(): in/out array dimensions mismatch (1)."); \
+      }                                                                 \
+      if( (in->get_number_of_elements() % num_disp_vectors ) != 0 ){    \
+        throw cuda_error("cuResampleOperator::mult_MH(): in/out array dimensions mismatch displacement field (1)."); \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    if( surplus == 2 ){                                                 \
+      if( (in->get_number_of_elements() % out->get_number_of_elements()) != 0 ){ \
+        throw cuda_error("cuResampleOperator::mult_MH(): in/out array dimensions mismatch (2)."); \
+      }                                                                 \
+      if( in->get_number_of_dimensions() != (D+1) || in->get_number_of_elements() != num_disp_vectors ){ \
+        throw cuda_error("cuResampleOperator::mult_MH(): output array dimensions mismatch displacement field."); \
+      }                                                                 \
+    }                                                                   \
+    /* Scratch output for the extended case; the smart pointer releases it even if a later step throws */ \
+    boost::shared_ptr< cuNDArray<T> > scratch; cuNDArray<T> *tmp_out = out; \
+    if( surplus == 2 && (in->get_number_of_elements()/out->get_number_of_elements()) > 1 ){ \
+      scratch.reset( new cuNDArray<T>(in->get_dimensions().get()) );    \
+      tmp_out = scratch.get();                                          \
+    }                                                                   \
+                                                                        \
+    typename uint64d<D>::Type matrix_size = from_std_vector<size_t,D>( *this->offsets_->get_dimensions().get() ); \
+    unsigned int num_batches = (surplus == 2) ? 1 : in->get_number_of_elements() / prod(matrix_size); \
+    unsigned int extended_dim = (surplus == 1) ? 1 : in->get_size(D);   \
+    unsigned int num_elements = prod(matrix_size)*extended_dim;         \
+                                                                        \
+    dim3 blockDim, gridDim;                                             \
+    /* The weight, index and range tables passed below are the members filled by mult_MH_preprocess() */ \
+    setup_grid( num_elements, &blockDim, &gridDim, num_batches );       \
+    mult_MH_kernel<T,D><<< gridDim, blockDim >>>                        \
+      ( in->get_data_ptr(), tmp_out->get_data_ptr(),                    \
+        raw_pointer_cast(&this->weights_[0]), raw_pointer_cast(&this->indices_[0]), \
+        raw_pointer_cast(&this->lower_bounds_[0]), raw_pointer_cast(&this->upper_bounds_[0]), \
+        num_elements, num_batches );                                    \
+                                                                        \
+    if( scratch.get() ){                                                \
+      *out = *sum<T>( tmp_out, D );                                     \
+      scratch.reset();                                                  \
+    }                                                                   \
+                                                                        \
+    CHECK_FOR_CUDA_ERROR();                                             \
+                                                                        \
+    if( accumulate ){                                                   \
+      *out += tmp;                                                      \
+    }                                                                   \
+  }
diff --git a/toolboxes/registration/optical_flow/gpu/gpureg_export.h b/toolboxes/registration/optical_flow/gpu/gpureg_export.h
new file mode 100644
index 0000000..6690a4c
--- /dev/null
+++ b/toolboxes/registration/optical_flow/gpu/gpureg_export.h
@@ -0,0 +1,14 @@
+/** \file gpureg_export.h
+    \brief Defines the EXPORTGPUREG symbol decoration macro.
+
+    On WIN32 the macro expands to __declspec(dllexport) when either
+    __BUILD_GADGETRON_GPUREG__ or gpureg_EXPORTS is defined, and to
+    __declspec(dllimport) otherwise. Everywhere else it expands to nothing.
+*/
+
+#ifndef _GPUREG_EXPORT_H_
+#define _GPUREG_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUREG__) || defined (gpureg_EXPORTS)
+#define EXPORTGPUREG __declspec(dllexport)
+#else
+#define EXPORTGPUREG __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUREG
+#endif
+
+#endif /* _GPUREG_EXPORT_H_ */
diff --git a/toolboxes/registration/optical_flow/multiresRegistrationSolver.h b/toolboxes/registration/optical_flow/multiresRegistrationSolver.h
new file mode 100644
index 0000000..90784de
--- /dev/null
+++ b/toolboxes/registration/optical_flow/multiresRegistrationSolver.h
@@ -0,0 +1,263 @@
+/** \file multiresRegistrationSolver.h
+    Abstract class defining a multiresolution registration solver.
+    Pure virtual functions are expected to do the actual work.
+*/
+
+#pragma once
+
+#include "registrationSolver.h"
+#include "vector_td_utilities.h"
+#include "vector_td_operators.h"
+
+namespace Gadgetron{
+
+  /**
+     \brief Abstract base class for multiresolution registration solvers.
+
+     solve() validates the input images, pads them to power-of-two dimensions
+     when padding_required() says so, passes them through normalize() and then
+     runs the recursive coarse-to-fine scheme in solveMultiRes(). The actual
+     registration at each resolution level is delegated to the pure virtual
+     compute().
+  */
+  template<class ARRAY_TYPE_REAL, unsigned int D> class multiresRegistrationSolver
+    : public registrationSolver<ARRAY_TYPE_REAL>
+  {
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+
+  public:
+
+    // Defaults: zero multiresolution levels, max_num_iterations_per_level_ = 500.
+    multiresRegistrationSolver() : registrationSolver<ARRAY_TYPE_REAL>(){
+      num_multires_levels_ = 0;
+      max_num_iterations_per_level_ = 500;
+    }
+
+    virtual ~multiresRegistrationSolver() {}
+
+    // Utilities to specify the registration settings
+    //
+
+    // Number of times the images are downsampled before the coarsest solve.
+    virtual void set_num_multires_levels( unsigned int levels ) {
+      num_multires_levels_ = levels; }
+
+    // Stores the iteration limit for use by subclasses.
+    virtual void set_max_num_iterations_per_level( unsigned int iterations ) {
+      max_num_iterations_per_level_ = iterations; }
+
+    //
+    // The main solver interface
+    //
+
+    // Forwards to the base class, which unpacks the fixed/moving images from 'rd'.
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> solve( registrationData<ARRAY_TYPE_REAL> *rd )
+    {
+      return registrationSolver<ARRAY_TYPE_REAL>::solve(rd);
+    }
+  
+    /**
+       \brief Run the multiresolution registration of 'moving_image' onto 'fixed_image'.
+       \param fixed_image image to register to (must not be null)
+       \param moving_image image to be registered (must not be null)
+       \param input_normalization_allowed if true the input arrays themselves are
+       passed to normalize(); otherwise the solver works on copies (or padded copies).
+       \return the result of solveMultiRes(), cropped to the input dimensions if padding was used.
+       \throws std::runtime_error on null input, missing interpolator, mismatching
+       image dimensions, too many multiresolution levels, or when in-place
+       normalization is requested while padding is required.
+    */
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> solve(
+                                                     ARRAY_TYPE_REAL *fixed_image,
+                                                     ARRAY_TYPE_REAL *moving_image,
+                                                     bool input_normalization_allowed = false  )
+    {
+      // Some initial validity tests
+      //
+
+      if( !fixed_image || !moving_image ){
+        throw std::runtime_error("multiresRegistrationSolver::solve : invalid input pointer.");
+      }
+
+      if( !this->interpolator_.get() ){
+        throw std::runtime_error("multiresRegistrationSolver::solve : interpolator not set.");
+      }
+
+      // Each set of dimensions is read from the image it is named after
+      // (the two were previously swapped).
+      typename uint64d<D>::Type fixed_dims = from_std_vector<size_t,D>(*fixed_image->get_dimensions());
+      typename uint64d<D>::Type moving_dims = from_std_vector<size_t,D>(*moving_image->get_dimensions());
+
+      if(!(fixed_dims == moving_dims)){
+        throw std::runtime_error("multiresRegistrationSolver::solve : fixed/moving image base dimensions mismatch.");
+      }
+
+      if( weak_less_equal(fixed_dims>>num_multires_levels_, vector_td<size_t, D>(1)) ){
+        throw std::runtime_error("multiresRegistrationSolver::solve : too many multiresolution levels for image dimensionality.");
+      }
+
+      // Normalize the input
+      //
+
+      ARRAY_TYPE_REAL *normalized_fixed;
+      ARRAY_TYPE_REAL *normalized_moving;
+
+      // The shared pointers own any temporary (copied or padded) images so
+      // they are released on every exit path, including exceptions.
+      boost::shared_ptr<ARRAY_TYPE_REAL> garbage_collector_fixed, garbage_collector_moving;
+      bool use_padding = padding_required(fixed_dims);
+
+      if( input_normalization_allowed ){
+        if( use_padding ){
+          throw std::runtime_error("multiresRegistrationSolver::solve : input normalization not possible as image padding is required.");
+        }
+        else{
+          // Work directly on the caller's arrays
+          normalized_fixed = fixed_image;
+          normalized_moving = moving_image;
+        }
+      }
+      else{
+        if( use_padding ){
+          garbage_collector_fixed = pad<REAL,D>(round_pow2(fixed_dims), fixed_image);
+          garbage_collector_moving = pad<REAL,D>(round_pow2(moving_dims), moving_image);
+          normalized_fixed = garbage_collector_fixed.get();
+          normalized_moving = garbage_collector_moving.get();
+        }
+        else{
+          normalized_fixed = new ARRAY_TYPE_REAL(*fixed_image);
+          normalized_moving = new ARRAY_TYPE_REAL(*moving_image);
+          garbage_collector_fixed = boost::shared_ptr<ARRAY_TYPE_REAL>(normalized_fixed);
+          garbage_collector_moving = boost::shared_ptr<ARRAY_TYPE_REAL>(normalized_moving);
+        }
+      }
+
+      normalize(normalized_fixed, REAL(1));
+      normalize(normalized_moving, REAL(1));
+
+      // Invoke multi-resolution solver
+      //
+
+      if( this->output_mode_ >= registrationSolver<ARRAY_TYPE_REAL>::OUTPUT_VERBOSE ) {
+        std::cout << std::endl << "Starting multiresolution registration " <<  std::endl;
+      }
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> result =
+        solveMultiRes( num_multires_levels_, normalized_fixed, normalized_moving, this->stencil_.get() );
+
+      if( use_padding ){
+        // NOTE(review): the crop offset is a quarter of the padding amount (>>2).
+        // Confirm this matches the offset pad() uses when it places the image.
+        result = crop<REAL,D>( (round_pow2(fixed_dims)-fixed_dims)>>2, fixed_dims, result.get());
+      }
+
+      return result;
+    }
+
+  protected:
+
+    // Pure virtual functions to be implemented in a subclass
+    //
+
+    // Computes the registration at a single resolution level. 'result' is
+    // passed in holding the upsampled lower-level estimate (or empty at the
+    // lowest level) and receives the output.
+    virtual void compute( ARRAY_TYPE_REAL *fixed_image, ARRAY_TYPE_REAL *moving_image, ARRAY_TYPE_REAL *stencil_image, 
+                          boost::shared_ptr<ARRAY_TYPE_REAL> &result ) = 0;
+
+    // The recursive multi-resolution solver
+    //
+
+    /**
+       \brief Recursive coarse-to-fine solver.
+       \param res_level number of remaining downsampling levels; 0 is the lowest resolution
+       \param fixed_image fixed image at the current resolution
+       \param moving_image moving image at the current resolution
+       \param stencil_image optional stencil at the current resolution (may be null)
+       \return the result produced by compute() at the current resolution
+    */
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> solveMultiRes(
+                                                             unsigned int res_level,
+                                                             ARRAY_TYPE_REAL *fixed_image,
+                                                             ARRAY_TYPE_REAL *moving_image,
+                                                             ARRAY_TYPE_REAL *stencil_image )
+    {
+      boost::shared_ptr<ARRAY_TYPE_REAL> result;
+
+      if (res_level>0){
+
+        //
+        // We are not yet at the end of the multi-resolution chain
+        //
+
+        // Downsample input images (and stencil if provided)
+        //
+
+        boost::shared_ptr<ARRAY_TYPE_REAL> fixed_image_lowres = downsample<REAL,D>(fixed_image);
+        boost::shared_ptr<ARRAY_TYPE_REAL> moving_image_lowres = downsample<REAL,D>(moving_image);
+        boost::shared_ptr<ARRAY_TYPE_REAL> stencil_image_lowres =
+          ((stencil_image) ? downsample<REAL,D>(stencil_image) : boost::shared_ptr<ARRAY_TYPE_REAL>());
+
+        // Compute displacement field at the downsampled resolution
+        //
+
+        boost::shared_ptr<ARRAY_TYPE_REAL> result_lowres =
+          solveMultiRes( res_level-1, fixed_image_lowres.get(), moving_image_lowres.get(), stencil_image_lowres.get() );
+
+        // Clean up low resolution image data
+        //
+
+        fixed_image_lowres.reset();
+        moving_image_lowres.reset();
+        stencil_image_lowres.reset();
+
+        // Upsample lowres results to current resolution
+        //
+
+        result = upsample<REAL,D>(result_lowres.get());
+        *result *= REAL(2); // To adjust the flow vectors to the fact that the resolution is now twice as high
+
+        // Clean up low resolution result
+        //
+
+        result_lowres.reset();
+
+        // Some output to track our progress at runtime
+        //
+
+        if( this->output_mode_ >= registrationSolver<ARRAY_TYPE_REAL>::OUTPUT_VERBOSE ) {
+          std::cout << std::endl << "Multiresolution level " << res_level;
+        }
+
+        // Use estimated (lowres) motion to compute displacements at the current resolution
+        //
+
+        boost::shared_ptr<ARRAY_TYPE_REAL> def_moving_image = this->deform( moving_image, result );
+      
+        // Compute registration at the current multiresolution level
+        //
+
+        compute( fixed_image, def_moving_image.get(), stencil_image, result );
+      }
+      else{
+
+        //
+        // We are now at the end of the multi-resolution chain
+        //
+
+        // Some output to track our progress at runtime
+        //
+
+        if( this->output_mode_ >= registrationSolver<ARRAY_TYPE_REAL>::OUTPUT_VERBOSE ) {
+          std::cout << std::endl << "Multiresolution level " << res_level << " (lowest)";
+        }
+
+        // Compute displacements at the current resolution (no estimate can be provided)
+        //
+
+        compute( fixed_image, moving_image, stencil_image, result );
+      }
+
+      return result;
+    }
+
+    /**
+       \brief Decide whether the images must be padded before downsampling.
+
+       The dimensions are halved once per multiresolution level. Padding is
+       reported when the weak_equal() test against an odd remainder succeeds
+       at any level.
+       \throws std::runtime_error when the weak_less() test of the halved
+       dimensions against 12 succeeds at any level.
+    */
+    virtual bool padding_required( typename uint64d<D>::Type dims )
+    {
+      bool padding_required = false;
+      typename uint64d<D>::Type ones(1);
+      typename uint64d<D>::Type twos(2);
+
+      for( unsigned int i=0; i<num_multires_levels_; i++ ){
+
+        dims /= (size_t)2;
+
+        if( weak_less( dims, (size_t)12*ones ) ){
+          throw std::runtime_error("multiresRegistrationSolver::padding_required : resolution too low. Too many multiresolution levels specified?");
+        }
+
+        if( weak_equal(dims%twos, ones) ){
+          padding_required = true;
+        }
+      }
+      return padding_required;
+    }
+
+  protected:
+    unsigned int num_multires_levels_;
+    unsigned int max_num_iterations_per_level_;
+
+  private:
+
+    // Round every component of 'v' up to the nearest power of two.
+    // The highest set bit of (v-1) is smeared into all lower bits before one
+    // is added back. The shift is doubled until it covers the full width of
+    // the element type, so 64-bit components are handled as well.
+    typename uint64d<D>::Type round_pow2(typename uint64d<D>::Type v)
+    {
+      typename uint64d<D>::Type ones(1);
+      typename uint64d<D>::Type out = v-ones;
+      const unsigned int num_bits = sizeof(out[0])*8;
+      for( unsigned int d=0; d<D; d++ ){
+        for( unsigned int shift=1; shift<num_bits; shift<<=1 ){
+          out[d] |= out[d] >> shift;
+        }
+      }
+      return out+ones;
+    }
+  };
+}
diff --git a/toolboxes/registration/optical_flow/opticalFlowOperator.h b/toolboxes/registration/optical_flow/opticalFlowOperator.h
new file mode 100644
index 0000000..e07b6bb
--- /dev/null
+++ b/toolboxes/registration/optical_flow/opticalFlowOperator.h
@@ -0,0 +1,72 @@
+#pragma once
+#include "linearOperator.h"
+#include <numeric>
+#include <functional>
+namespace Gadgetron {
+
+  /**
+     \brief Linear operator built from the averaged partial derivatives of two images.
+
+     set_images() computes, for each of the D directions, the sum of the
+     partialDerivOp results of the two images divided by two and stores it in Ix.
+     mult_M multiplies each of the D components of the input (stored along the
+     last dimension) by the matching Ix[i] and sums them into 'out'.
+     mult_MH multiplies 'in' by each Ix[i] and adds the products to the
+     corresponding component of 'out'.
+  */
+  template<class ARRAY_TYPE,class partialDerivOp, unsigned int D> class opticalFlowOperator : public linearOperator<ARRAY_TYPE >{
+  public:
+    typedef typename ARRAY_TYPE::element_type T;
+
+    opticalFlowOperator(){}
+
+    opticalFlowOperator(ARRAY_TYPE* moving,ARRAY_TYPE* stat){
+      set_images(moving,stat);
+    }
+
+    virtual ~opticalFlowOperator(){}
+
+    /**
+       \param in input array whose last dimension must have size D
+       \param out output array; cleared first unless 'accumulate' is true
+       \throws std::runtime_error if the last dimension of 'in' is not D
+    */
+    virtual void mult_M(ARRAY_TYPE* in,ARRAY_TYPE* out,bool accumulate){
+
+      if (!accumulate) clear(out);
+      std::vector<size_t> dims = *in->get_dimensions();
+      if (dims.back() != D) throw std::runtime_error("Input array for optical flow has the wrong last dimensions");
+      dims.pop_back();
+
+      // The initial value fixes the accumulator type, so it must be size_t
+      // to avoid truncating the product to unsigned int.
+      size_t elements = std::accumulate(dims.begin(),dims.end(),size_t(1),std::multiplies<size_t>());
+
+      for (unsigned int i = 0; i < D; i++){
+        // View onto the i'th component of the input, copied before scaling
+        // so that 'in' is left untouched.
+        ARRAY_TYPE tmp(&dims,in->get_data_ptr()+elements*i);
+        ARRAY_TYPE tmp2(tmp);
+        tmp2 *= *Ix[i];
+        *out += tmp2;
+      }
+    }
+
+    /**
+       \param in input array
+       \param out output array whose last dimension must have size D;
+       cleared first unless 'accumulate' is true
+       \throws std::runtime_error if the last dimension of 'out' is not D
+    */
+    virtual void mult_MH(ARRAY_TYPE* in,ARRAY_TYPE* out,bool accumulate){
+
+      if (!accumulate) clear(out);
+      std::vector<size_t> dims = *out->get_dimensions();
+      if (dims.back() != D) throw std::runtime_error("Output array for optical flow has the wrong last dimensions");
+      dims.pop_back();
+
+      // See mult_M: the accumulator must be size_t, not unsigned int.
+      size_t elements = std::accumulate(dims.begin(),dims.end(),size_t(1),std::multiplies<size_t>());
+
+      for (unsigned int i = 0; i < D; i++){
+        // View onto the i'th component of the output; updated in place.
+        ARRAY_TYPE out_view(&dims,out->get_data_ptr()+elements*i);
+        ARRAY_TYPE tmp2(in);
+        tmp2 *= *Ix[i];
+        out_view += tmp2;
+      }
+    }
+
+    // (Re)compute the D derivative images from the two input images.
+    void set_images(ARRAY_TYPE* moving,ARRAY_TYPE* stat){
+      Ix = std::vector< boost::shared_ptr<ARRAY_TYPE> >();
+
+      for (unsigned int i=0; i < D; i++){
+        partialDerivOp op(i);
+        boost::shared_ptr<ARRAY_TYPE> I(new ARRAY_TYPE(moving->get_dimensions()));
+        op.mult_M(moving,I.get());
+        op.mult_M(stat,I.get(),true); // accumulate onto the first result
+        *I /= T(2);
+        Ix.push_back(I);
+      }
+    }
+
+    virtual boost::shared_ptr< linearOperator<ARRAY_TYPE > > clone()
+      {
+        return linearOperator< ARRAY_TYPE >::clone(this);
+      }
+  protected:
+    std::vector< boost::shared_ptr<ARRAY_TYPE> > Ix; //Gradient along different directions
+  };
+}
diff --git a/toolboxes/registration/optical_flow/opticalFlowSolver.h b/toolboxes/registration/optical_flow/opticalFlowSolver.h
new file mode 100644
index 0000000..255116c
--- /dev/null
+++ b/toolboxes/registration/optical_flow/opticalFlowSolver.h
@@ -0,0 +1,176 @@
+/** \file opticalFlowSolver.h
+    \brief Abstract class defining an optical flow registration solver.
+
+    Pure virtual functions are expected to do the actual work 
+    - on the CPU and GPU respectively.
+*/
+
+#pragma once
+
+#include "multiresRegistrationSolver.h"
+#include "resampleOperator.h"
+#include "vector_td_utilities.h"
+
+#include <algorithm>
+
+namespace Gadgetron{
+
+  /**
+     \brief Abstract optical flow solver on top of the multiresolution framework.
+
+     compute() builds the gradient image through grad() and hands it to the
+     pure virtual core_solver(). The spatial and temporal partial derivatives
+     themselves are produced by the pure virtual core_grad_spatial() and
+     core_grad_temporal().
+  */
+  template<class ARRAY_TYPE_REAL, unsigned int D> class opticalFlowSolver 
+    : public multiresRegistrationSolver<ARRAY_TYPE_REAL,D>
+  {  
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+
+  public:
+
+    // The termination threshold defaults to 0.01
+    opticalFlowSolver() : multiresRegistrationSolver<ARRAY_TYPE_REAL,D>(){ 
+      limit_ = REAL(0.01);
+    } 
+
+    virtual ~opticalFlowSolver() {}
+
+    // Set termination threshold
+    inline void set_limit( REAL limit ) { limit_ = limit; }
+
+  protected:
+
+    // Implementation of the single-level registration required by the
+    // multiresolution solver. If 'result_in_out' already holds a field, the
+    // newly computed displacements are added to it; otherwise it is set to them.
+    //
+
+    virtual void compute( ARRAY_TYPE_REAL *fixed_image, ARRAY_TYPE_REAL *moving_image, ARRAY_TYPE_REAL *stencil_image, 
+                          boost::shared_ptr<ARRAY_TYPE_REAL> &result_in_out )
+    {
+      // Reject null images up front
+      //
+
+      if( !(fixed_image && moving_image) ){
+        throw std::runtime_error("opticalFlowSolver::compute(): illegal input array received.");
+      }
+
+      // The first D dimensions of the two images must hold equally many elements
+      //
+
+      if( prod(from_std_vector<size_t,D>(*fixed_image->get_dimensions().get())) != 
+          prod(from_std_vector<size_t,D>(*moving_image->get_dimensions().get())) ){
+        throw std::runtime_error("opticalFlowSolver::compute(): core image dimensions (excluding batches) mismatch.");
+      }
+
+      // The same holds for the stencil, when one is given
+      //
+
+      if( stencil_image ){
+        if( prod(from_std_vector<size_t,D>(*fixed_image->get_dimensions().get())) != 
+            prod(from_std_vector<size_t,D>(*stencil_image->get_dimensions().get())) ){
+          throw std::runtime_error("opticalFlowSolver::compute(): stencil image dimensions mismatch fixed/moving image dimensions.");
+        }
+      }
+
+      // An incoming displacement field is rejected only when it has at most D
+      // dimensions and its last dimension is not of size D
+      //
+
+      if( result_in_out.get() && 
+          result_in_out->get_number_of_dimensions() <= D &&
+          result_in_out->get_size(result_in_out->get_number_of_dimensions()-1) != D ){
+        throw std::runtime_error("opticalFlowSolver::compute(): input displacements dimensionality mismatch");
+      }
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> gradients = grad( fixed_image, moving_image );
+
+      // Invoke core solver (e.g. Horn-Schunk, Cornelius-Kanade, ...)
+      //
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> update = core_solver( gradients.get(), stencil_image );
+
+      // Without a previous estimate the update is the result; otherwise it
+      // is added element-wise to the estimate
+      // 
+
+      if( !result_in_out.get() ){
+        result_in_out = update;
+      }
+      else{    
+        *result_in_out += *update;
+      }
+    }
+
+    // Compute the gradient image: the dimensions of the larger input with an
+    // extra dimension of size D+1 appended (D spatial derivatives plus one
+    // temporal derivative).
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> grad( ARRAY_TYPE_REAL *fixed_image, ARRAY_TYPE_REAL *moving_image )
+    {
+      // Sanity checks
+      //
+
+      if( !(fixed_image && moving_image) ){
+        throw std::runtime_error("opticalFlowSolver::grad(): illegal input received.");
+      }
+
+      // One element count must be a multiple of the other
+      //
+
+      if( (moving_image->get_number_of_elements() % fixed_image->get_number_of_elements()) != 0 &&
+          (fixed_image->get_number_of_elements() % moving_image->get_number_of_elements()) != 0 ){
+        throw std::runtime_error("opticalFlowSolver::grad(): fixed/moving image dimensions mismatch.");
+      }
+
+      // The gradient field takes the dimensions of the larger image,
+      // extended by D spatial components plus one temporal component
+      //
+
+      std::vector<size_t> grad_dims;
+
+      if( fixed_image->get_number_of_elements()<moving_image->get_number_of_elements() ){
+        grad_dims = *moving_image->get_dimensions();
+      }
+      else{
+        grad_dims = *fixed_image->get_dimensions();
+      }
+
+      grad_dims.push_back(D+1); 
+
+      boost::shared_ptr<ARRAY_TYPE_REAL> grad_image(new ARRAY_TYPE_REAL(&grad_dims));
+
+      // The first D dimensions of both images must agree
+      //
+
+      typename uint64d<D>::Type dims_fixed = from_std_vector<size_t,D>( *fixed_image->get_dimensions() );
+      typename uint64d<D>::Type dims_moving = from_std_vector<size_t,D>( *moving_image->get_dimensions() );
+
+      if( dims_fixed != dims_moving ){
+        throw std::runtime_error("opticalFlowSolver::grad(): fixed/moving image dimensions mismatch (2).");
+      }
+
+      // Count the batches (product of all dimensions beyond the first D) per image
+      //
+
+      size_t batches_fixed = 1;
+      for( size_t d=D; d<fixed_image->get_number_of_dimensions(); d++ ){
+        batches_fixed *= fixed_image->get_size(d);
+      }
+
+      size_t batches_moving = 1;
+      for( size_t d=D; d<moving_image->get_number_of_dimensions(); d++ ){
+        batches_moving *= moving_image->get_size(d);
+      }
+
+      // Compute spatial partial derivatives
+      //
+
+      core_grad_spatial( fixed_image->get_data_ptr(), moving_image->get_data_ptr(), grad_image->get_data_ptr(), 
+                         dims_moving, batches_fixed, batches_moving );
+
+      // Compute temporal partial derivatives; they are written after the D
+      // spatial components in the gradient buffer
+      //
+
+      const size_t elements_per_image = prod(dims_moving);
+      const size_t temporal_offset = elements_per_image*std::max(batches_moving, batches_fixed)*D;
+
+      core_grad_temporal( fixed_image->get_data_ptr(), moving_image->get_data_ptr(), 
+                          grad_image->get_data_ptr()+temporal_offset, 
+                          dims_moving, batches_fixed, batches_moving );
+
+      return grad_image;
+    }
+
+    // The actual work is being done in these functions, to be implemented on both host and device
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> core_solver( ARRAY_TYPE_REAL *gradient_image, ARRAY_TYPE_REAL *stencil_image ) = 0;      
+
+    virtual void core_grad_spatial( REAL *fixed_image, REAL *moving_image, REAL *gradient_image, 
+                                    typename uint64d<D>::Type matrix_size_moving, 
+                                    size_t number_of_batches_fixed, 
+                                    size_t number_of_batches_moving ) = 0;
+
+    virtual void core_grad_temporal( REAL *fixed_image, REAL *moving_image, REAL *gradient_image, 
+                                     typename uint64d<D>::Type matrix_size_moving, 
+                                     size_t number_of_batches_fixed, 
+                                     size_t number_of_batches_moving ) = 0;
+
+  protected:
+    REAL limit_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/registrationSolver.h b/toolboxes/registration/optical_flow/registrationSolver.h
new file mode 100644
index 0000000..9e59ef1
--- /dev/null
+++ b/toolboxes/registration/optical_flow/registrationSolver.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include "solver.h"
+#include "resampleOperator.h"
+
+namespace Gadgetron{
+  
+  // Simple holder pairing a fixed and a moving image. Only the raw pointers
+  // are stored; the class never deletes the images.
+  template <class ARRAY_TYPE> class registrationData
+  {
+  public:
+    registrationData( ARRAY_TYPE *fixed_image, ARRAY_TYPE *moving_image )
+      : fixed_image_(fixed_image)
+      , moving_image_(moving_image)
+    {
+    }
+
+    virtual ~registrationData() {}
+  
+    // Accessors returning the pointers handed to the constructor
+    inline ARRAY_TYPE* get_fixed_image () { return fixed_image_; }
+    inline ARRAY_TYPE* get_moving_image () { return moving_image_; }
+  
+  protected:
+    ARRAY_TYPE *fixed_image_;
+    ARRAY_TYPE *moving_image_;
+  };
+
+  /**
+     \brief Abstract base class for image registration solvers.
+
+     Holds the interpolator used to resample images and an optional stencil
+     image. Subclasses implement solve( fixed, moving, ... ).
+  */
+  template <class ARRAY_TYPE> class registrationSolver 
+    : public solver<registrationData<ARRAY_TYPE>, ARRAY_TYPE >
+  {
+  public:
+
+    // Constructor/destructor
+    //
+
+    registrationSolver() : solver<registrationData<ARRAY_TYPE>,ARRAY_TYPE>() {}
+    virtual ~registrationSolver() {}
+
+    // Set interpolator for resampling
+    //
+  
+    inline void set_interpolator( boost::shared_ptr< resampleOperator<ARRAY_TYPE,ARRAY_TYPE> > interpolator )
+    {
+      interpolator_ = interpolator;
+    }
+  
+    // Set zero deformation boundary condition as a stencil image
+    //
+  
+    inline void set_stencil( boost::shared_ptr<ARRAY_TYPE> stencil )
+    {
+      stencil_ = stencil;
+    }
+  
+    //
+    // The solver adds a dimension to ARRAY_TYPE to hold the vector result.
+    // I.e. the vector field dimension is the slowest varying.
+    //
+  
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    solve( ARRAY_TYPE *fixed_image, ARRAY_TYPE *moving_image, bool input_normalization_allowed = false ) = 0;
+  
+    // Convenience overload unpacking the images from a registrationData object.
+    // Throws std::runtime_error if 'rd' is null.
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    solve( registrationData< ARRAY_TYPE> *rd )
+    {
+      if( !rd ){
+        throw std::runtime_error("registrationSolver::solve() : invalid registration data pointer");
+      }
+
+      return solve( rd->get_fixed_image(), rd->get_moving_image() );
+    }
+  
+    // Deform image based on displacement field.
+    // The output takes the dimensions of the displacement field with the
+    // last (vector) dimension removed.
+    // Throws std::runtime_error if the interpolator is not set or if either
+    // argument is null.
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    deform( ARRAY_TYPE *image, boost::shared_ptr<ARRAY_TYPE> displacements )
+    {
+      if( !interpolator_.get() ){
+        throw std::runtime_error("registrationSolver::deform() : interpolator not set");
+      }
+
+      // Fail with a clear message instead of dereferencing a null pointer below
+      if( !image || !displacements.get() ){
+        throw std::runtime_error("registrationSolver::deform() : invalid input pointer");
+      }
+    
+      boost::shared_ptr<ARRAY_TYPE> out(new ARRAY_TYPE);
+      std::vector<size_t> out_dims = *displacements->get_dimensions().get(); out_dims.pop_back();    
+      out->create(&out_dims);
+    
+      interpolator_->set_displacement_field( displacements );
+      interpolator_->mult_M( image, out.get() );
+      interpolator_->reset();
+    
+      return out;
+    }
+  
+    // Deform image based on an invocation of the registration solver
+    //
+  
+    virtual boost::shared_ptr<ARRAY_TYPE> 
+    deform( ARRAY_TYPE *fixed_image, ARRAY_TYPE *moving_image )
+    {
+      boost::shared_ptr<ARRAY_TYPE> displacements = solve( fixed_image, moving_image );
+      return deform( moving_image, displacements );
+    }
+  
+  protected:
+    boost::shared_ptr< resampleOperator<ARRAY_TYPE,ARRAY_TYPE> > interpolator_;
+    boost::shared_ptr<ARRAY_TYPE> stencil_;
+  };
+}
diff --git a/toolboxes/registration/optical_flow/resampleOperator.h b/toolboxes/registration/optical_flow/resampleOperator.h
new file mode 100644
index 0000000..4d0139a
--- /dev/null
+++ b/toolboxes/registration/optical_flow/resampleOperator.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "linearOperator.h"
+
+namespace Gadgetron{
+
+  // Base class for resampling operators driven by a displacement field.
+  // It stores the field and a 'preprocessed' flag; the flag is cleared by
+  // reset() and is available to subclasses through the protected member.
+  template <class ARRAY_TYPE_REAL, class ARRAY_TYPE_ELEMENT> 
+  class resampleOperator : public linearOperator<ARRAY_TYPE_ELEMENT>
+  {
+  public:
+  
+    resampleOperator() : linearOperator<ARRAY_TYPE_ELEMENT>(), preprocessed_(false) {}
+    virtual ~resampleOperator() {}
+
+    // Mark the operator as not preprocessed
+    virtual void reset()
+    {
+      preprocessed_ = false;
+    }
+  
+    // Expected format: the vector field dimension should be the slowest varying
+    //
+
+    virtual void set_displacement_field( boost::shared_ptr<ARRAY_TYPE_REAL> offsets )
+    {
+      offsets_ = offsets;
+    }
+  
+    virtual boost::shared_ptr<ARRAY_TYPE_REAL> get_displacement_field()
+    {
+      return offsets_;
+    }
+
+    // Number of elements in the field divided by the size of its last
+    // dimension; zero when no field has been set
+    virtual size_t get_number_of_displacement_vectors() 
+    {
+      if( !offsets_ ){
+        return 0;
+      }
+
+      const size_t last_dim = offsets_->get_number_of_dimensions()-1;
+      return offsets_->get_number_of_elements()/offsets_->get_size(last_dim);
+    }
+
+    virtual bool is_preprocessed()
+    {
+      return preprocessed_;
+    }
+
+  protected:
+    bool preprocessed_;
+    boost::shared_ptr<ARRAY_TYPE_REAL> offsets_;
+  };
+}
diff --git a/toolboxes/solvers/CMakeLists.txt b/toolboxes/solvers/CMakeLists.txt
new file mode 100644
index 0000000..92a0a0e
--- /dev/null
+++ b/toolboxes/solvers/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Header search paths needed by the solver headers
+include_directories(
+  ${Boost_INCLUDE_DIR}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/cpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators
+  )
+
+# Install the solver headers listed below into <prefix>/include
+install(FILES 	
+  solver.h
+  linearOperatorSolver.h
+  cgSolver.h
+  sbSolver.h
+  sbcSolver.h
+  cgCallback.h	
+  cgPreconditioner.h
+  lwSolver.h
+  gpSolver.h
+  gpBbSolver.h
+  DESTINATION include)
+
+# The cpu subdirectory is only added when Armadillo was found
+IF(ARMADILLO_FOUND)
+  add_subdirectory(cpu)
+ENDIF(ARMADILLO_FOUND)
+
+# The gpu subdirectory is only added when CUDA was found
+IF( CUDA_FOUND)
+  add_subdirectory(gpu)
+ENDIF (CUDA_FOUND)
diff --git a/toolboxes/solvers/cgCallback.h b/toolboxes/solvers/cgCallback.h
new file mode 100644
index 0000000..f4b3e3f
--- /dev/null
+++ b/toolboxes/solvers/cgCallback.h
@@ -0,0 +1,198 @@
+/** \file cgCallback.h
+    \brief Class to specify the termination criteria for the conjugate gradient solver through a callback mechanism.
+*/
+
+#pragma once
+
+#include "real_utilities.h"
+#include "cgSolver.h"
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class cgSolver;
+
+  /**
+     \brief Abstract termination callback for the conjugate gradient solver.
+
+     initialize() stores the solver pointer. iterate() is pure virtual and
+     receives output parameters for a metric and a terminate flag. The
+     protected getters give subclasses access to members of the solver.
+  */
+  template <class ARRAY_TYPE> class cgTerminationCallback
+  {
+
+  public:
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    // The solver pointer starts out null (it was previously left
+    // uninitialized) until initialize() is called.
+    cgTerminationCallback() : cg_(0) {}
+    virtual ~cgTerminationCallback() {}
+  
+    // Store the solver pointer; always returns true.
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg ){cg_ = cg; return true;}
+
+    // To be implemented by subclasses: write the current metric to
+    // 'tc_metric' and the termination decision to 'tc_terminate'.
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate ) = 0;
+
+  protected:
+
+    cgSolver<ARRAY_TYPE> *cg_;
+
+    // Accessors forwarding to the solver members of the same name.
+    // initialize() must have been called with a valid solver first.
+
+    REAL get_rq(){
+      return cg_->rq_;
+    }
+
+    REAL get_rq0(){
+      return cg_->rq0_;
+    }
+
+    REAL get_alpha(){
+      return cg_->alpha_;
+    }
+
+    boost::shared_ptr<ARRAY_TYPE> get_x(){
+      return cg_->x_;
+    }
+
+    boost::shared_ptr<ARRAY_TYPE> get_p(){
+      return cg_->p_;
+    }
+
+    boost::shared_ptr<ARRAY_TYPE> get_r(){
+      return cg_->r_;
+    }
+  };
+
+  // Termination callback using the ratio rq/rq0 as its metric.
+  // Iteration stops once the ratio falls below the solver's tolerance.
+  template <class ARRAY_TYPE> class relativeResidualTCB
+    : public cgTerminationCallback<ARRAY_TYPE>
+  {
+
+  protected:
+    typedef cgTerminationCallback<ARRAY_TYPE> cgTC;
+
+  public:
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    relativeResidualTCB()
+      : cgTerminationCallback<ARRAY_TYPE>()
+      , rq_0_(REAL(0))
+      , tc_last_(get_max<REAL>())
+    {
+    }
+  
+    virtual ~relativeResidualTCB() {}
+  
+    // Bind to the solver, reset the previous metric and fetch rq0 from the solver
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg )
+    {
+      cgTC::initialize(cg);
+      tc_last_ = get_max<REAL>();
+      rq_0_ = cgTC::get_rq0();
+      return true;
+    }
+  
+    // Report the metric, optionally log it, and decide on termination
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      *tc_metric = cgTC::get_rq()/rq_0_;
+
+      const bool warnings_enabled =
+        cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_WARNINGS;
+
+      if( warnings_enabled ) {
+
+        const bool verbose_enabled =
+          cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE;
+
+        if( verbose_enabled ) {
+          std::cout << "Iteration " << iteration << ". rq/rq_0 = " << *tc_metric << std::endl;
+        }
+
+        // The metric grew compared to the previous iteration
+        if( (tc_last_-(*tc_metric)) < REAL(0) ){
+          std::cout << "Warning: conjugate gradient residual increase." << std::endl;
+        }
+      }
+    
+      *tc_terminate = ( *tc_metric < cgTC::cg_->get_tc_tolerance() );
+      tc_last_ = *tc_metric;
+      return true;
+    }
+  
+  protected:
+    REAL rq_0_;
+    REAL tc_last_;
+  };
+
+  // Termination callback using the solver's rq value itself as its metric.
+  // Iteration stops once rq falls below the solver's tolerance.
+  template <class ARRAY_TYPE> class residualTCB
+    : public cgTerminationCallback<ARRAY_TYPE>
+  {
+
+  protected:
+
+    typedef cgTerminationCallback<ARRAY_TYPE> cgTC;
+
+  public:
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    residualTCB() : cgTerminationCallback<ARRAY_TYPE>() {
+      tc_last_ = get_max<REAL>();
+    }
+
+    virtual ~residualTCB() {}
+
+    // Bind to the solver and reset the previous metric
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg )
+    {
+      cgTC::initialize(cg);
+      tc_last_ = get_max<REAL>();
+      return true;
+    }
+
+    // Report the metric, optionally log it, and decide on termination
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      *tc_metric = cgTC::get_rq();
+      if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_WARNINGS ) {
+        if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+          // The metric here is rq, not the ratio rq/rq_0; label it accordingly
+          std::cout << "Iteration " << iteration << ". rq = " << *tc_metric << std::endl;
+        }
+        // The metric grew compared to the previous iteration
+        if( (tc_last_-(*tc_metric)) < REAL(0) ){
+          std::cout << "----- Warning: CG residual increase. Stability problem! -----" << std::endl;
+        }
+      }
+      *tc_terminate = ( *tc_metric < cgTC::cg_->get_tc_tolerance() );
+      tc_last_ = *tc_metric;
+      return true;
+    }
+
+  protected:
+
+    REAL tc_last_;
+  };
+
+  // Termination callback using solver_dot(p,p) as its metric, where p is the
+  // array returned by get_p(). Iteration stops once the metric falls below
+  // the solver's tolerance.
+  template <class ARRAY_TYPE> class updateTCB
+    : public cgTerminationCallback<ARRAY_TYPE>
+  {
+
+  protected:
+    typedef cgTerminationCallback<ARRAY_TYPE> cgTC;
+
+  public:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+    updateTCB() : cgTerminationCallback<ARRAY_TYPE>() {
+
+      tc_last_ = get_max<REAL>();
+    }
+
+    virtual ~updateTCB() {}
+
+    // Bind to the solver and reset the previous metric
+    virtual bool initialize( cgSolver<ARRAY_TYPE> *cg )
+    {
+      cgTC::initialize(cg);
+      tc_last_ = get_max<REAL>();
+      return true;
+    }
+
+    // Report the metric, optionally log it, and decide on termination
+    virtual bool iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      // Fetch the shared pointer once instead of copying it for each argument
+      boost::shared_ptr<ARRAY_TYPE> p = cgTC::get_p();
+      *tc_metric = cgTC::cg_->solver_dot(p.get(),p.get());
+      if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_WARNINGS ) {
+        if( cgTC::cg_->get_output_mode() >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+          // The metric here is dot(p,p), not the residual ratio; label it accordingly
+          std::cout << "Iteration " << iteration << ". dot(p,p) = " << *tc_metric << std::endl;
+        }
+        // The metric grew compared to the previous iteration
+        if( (tc_last_-(*tc_metric)) < REAL(0) ){
+          std::cout << "----- Warning: CG dot(p,p) increase. Stability problem! -----" << std::endl;
+        }
+      }
+      *tc_terminate = ( *tc_metric < cgTC::cg_->get_tc_tolerance() );
+      tc_last_ = *tc_metric;
+      return true;
+    }
+
+  protected:
+
+    REAL tc_last_;
+  };
+}
diff --git a/toolboxes/solvers/cgPreconditioner.h b/toolboxes/solvers/cgPreconditioner.h
new file mode 100644
index 0000000..afff8c6
--- /dev/null
+++ b/toolboxes/solvers/cgPreconditioner.h
@@ -0,0 +1,50 @@
+/** \file cgPreconditioner.h
+    \brief Base class for preconditioners for the cgSolver class.
+*/
+
+#ifndef CGPRECONDITIONER_H
+#define CGPRECONDITIONER_H
+#pragma once
+
+#include <boost/shared_ptr.hpp>
+
+namespace Gadgetron{
+
+  // Preconditioner that copies the input to the output and multiplies the
+  // output by a stored weights array.
+  template <class ARRAY_TYPE> class cgPreconditioner
+  {
+  public:
+    
+    cgPreconditioner() {}
+    virtual ~cgPreconditioner() {}
+    
+    // Store the weights applied by apply()
+    virtual void set_weights( boost::shared_ptr<ARRAY_TYPE> w )
+    {
+      weights_ = w;
+    }
+
+    // Write in * weights to 'out'.
+    // Throws std::runtime_error when the weights are unset, when either
+    // pointer is null or the element counts of 'in' and 'out' differ, or
+    // when the number of input elements is not a multiple of the number of
+    // weights.
+    virtual void apply( ARRAY_TYPE *in, ARRAY_TYPE *out )
+    {
+      if( !weights_ ){
+        throw std::runtime_error( "cgPreconditioner::apply(): weights not set");
+      }
+
+      const bool io_matches =
+        in && out && in->get_number_of_elements() == out->get_number_of_elements();
+
+      if( !io_matches ){
+        throw std::runtime_error("cgPreconditioner::apply(): input and output dimensions mismatch");
+      }
+
+      if( (in->get_number_of_elements() % weights_->get_number_of_elements()) != 0 ){
+        throw std::runtime_error( "cgPreconditioner::apply(): unexpected dimensionality of computed weights" );
+      }
+
+      *out = *in;
+      *out *= *weights_;
+    }
+    
+    // Class-specific allocation: storage comes from a global char array
+    // allocation and is released as such; placement new returns the given
+    // address.
+    void* operator new (size_t bytes) { return ::new char[bytes]; }
+    void operator delete (void *ptr) { delete [] static_cast <char *> (ptr); } 
+    void * operator new(size_t s, void * p) { return p; }    
+    
+  protected:
+    boost::shared_ptr<ARRAY_TYPE> weights_;    
+  };
+}
+
+#endif //CGPRECONDITIONER_H
diff --git a/toolboxes/solvers/cgSolver.h b/toolboxes/solvers/cgSolver.h
new file mode 100644
index 0000000..c8b5278
--- /dev/null
+++ b/toolboxes/solvers/cgSolver.h
@@ -0,0 +1,412 @@
+/** \file cgSolver.h
+    \brief Base class for the conjugate gradient solvers.
+
+    The file cgSolver.h is a device independent implementation of the conjugate gradient solver.
+    To simplify the actual instantiation we refer to 
+    - the class(/file) hoCgSolver(/.h) for a cpu instantiated solver using the hoNDArray class
+    - the class(/file) cuCgSolver(/.h) for a gpu instantiated solver using the cuNDArray class
+    - the class(/file) hoCuCgSolver(/.h) for a gpu based solver using a host memory interface. 
+
+    The latter version is intended for large reconstructions in which device memory cannot hold 
+    the entire data from the image and encoded image domains. 
+    In the "hoCu" scenario, suitable encoding and regularization operators
+    capable of batching their mult_M and mult_MHM functions should be chosen.
+
+    In all cases, the encoding and regularization operators added to the solver 
+    must adhere to the underlying instantiation of the NDArray data type.
+*/
+
+#pragma once
+
+#include "linearOperatorSolver.h"
+#include "cgCallback.h"
+#include "cgPreconditioner.h"
+#include "real_utilities.h"
+#include "complext.h"
+
+#include <vector>
+#include <iostream>
+#include <limits>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class cgSolver : public linearOperatorSolver<ARRAY_TYPE>
+  {
+  
+  public:
+
+    // Convenient typedefs
+    //
+
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+    friend class cgTerminationCallback<ARRAY_TYPE>;
+
+
+    // Constructor
+    //
+
+    cgSolver() : linearOperatorSolver<ARRAY_TYPE>() {
+      alpha_ = std::numeric_limits<ELEMENT_TYPE>::quiet_NaN();
+      iterations_ = 10;
+      tc_tolerance_ = (REAL)1e-3;
+      cb_ = boost::shared_ptr< relativeResidualTCB<ARRAY_TYPE> >( new relativeResidualTCB<ARRAY_TYPE>() );
+    }
+  
+
+    // Destructor
+    //
+
+    virtual ~cgSolver() {}
+
+
+    // Set preconditioner
+    //
+
+    virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+    }
+  
+
+    // Set termination callback
+    //
+
+    virtual void set_termination_callback( boost::shared_ptr< cgTerminationCallback<ARRAY_TYPE> > cb ){
+      cb_ = cb;
+    }
+  
+
+    // Set/get maximally allowed number of iterations
+    //
+
+    virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+    virtual unsigned int get_max_iterations() { return iterations_; }  
+
+
+    // Set/get tolerance threshold for termination criterion
+    //
+
+    virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+    virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+  
+
+    // Virtual function that is provided with the intermediate solution at each solver iteration.
+    // The default behaviour is to do nothing with this array,
+    // but customization is possible by specialization of the virtual function in a derived class.
+    //
+
+    virtual void solver_dump( ARRAY_TYPE* ) {}
+
+
+    //
+    // Main solver interface
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> solve( ARRAY_TYPE *d )
+    {
+    
+      // Compute right hand side...
+      //
+      
+      boost::shared_ptr<ARRAY_TYPE> rhs = compute_rhs( d );
+
+      // ... and the result
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> result =  solve_from_rhs( rhs.get() );
+      return result;
+    }
+
+
+    // Alternative solver interface when given the right hand side
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> solve_from_rhs( ARRAY_TYPE *rhs ) 
+    {
+      // For zero iterations we have computed / return the right hand side.
+      // initialize() (which validates rhs) is skipped on this path, so check for NULL before copying.
+      if( iterations_ == 0 ){
+        if( !rhs ) throw std::runtime_error( "Error: cgSolver::solve_from_rhs : NULL rhs provided" );
+        return boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(*rhs) );
+      }
+
+      // Initialize (allocates x_, r_, p_ and throws std::runtime_error for a NULL or empty rhs)
+      //
+
+      initialize(rhs);
+
+      // Iterate at most iterations_ times, or until the termination callback asks to stop
+      //
+
+      if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+        std::cout << "Iterating..." << std::endl;
+      }
+    
+      for( unsigned int it=0; it<iterations_; it++ ){
+
+        REAL tc_metric = REAL(0);   // defined values even if an iterate() override does not write them
+        bool tc_terminate = false;
+      
+        this->iterate( it, &tc_metric, &tc_terminate );
+
+        solver_dump( x_.get());     // hand the intermediate solution to the (by default empty) dump hook
+      
+        if( tc_terminate )
+          break;
+      }
+    
+      // Clean up and we are done: keep a reference to the solution before deinitialize() resets x_
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> tmpx = x_;
+      deinitialize();
+      return tmpx;
+    }
+
+
+    // Compute right hand side
+    //
+
+    virtual boost::shared_ptr<ARRAY_TYPE> compute_rhs( ARRAY_TYPE *d )
+    {
+    
+      if( this->encoding_operator_.get() == 0 ){
+      	throw std::runtime_error( "Error: cgSolver::compute_rhs : no encoding operator is set" );
+      } 
+        
+      // Get image space dimensions from the encoding operator
+      //
+
+      boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+      if( image_dims->size() == 0 ){
+      	throw std::runtime_error( "Error: cgSolver::compute_rhs : encoding operator has not set domain dimension" );
+      }
+
+      // Create result array and clear
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> result = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(image_dims.get()));
+      clear(result.get());
+    
+      // Create temporary array
+      //
+
+      ARRAY_TYPE tmp(image_dims.get() );
+
+      // Compute operator adjoint
+      //
+
+      this->encoding_operator_->mult_MH( d, &tmp );
+    
+      // Apply weight
+      //
+
+      axpy(ELEMENT_TYPE(this->encoding_operator_->get_weight()), &tmp, result.get() );
+    
+      return result;
+    }
+
+  protected:
+  
+    //
+    // Everything beyond this point is internal to the implementation
+    // and not intended to be exposed as a public interface
+    //
+
+    // Initialize solver
+    //
+
+    virtual void initialize( ARRAY_TYPE *rhs )
+    {
+      // Input validity test
+      // (a NULL or zero-element rhs raises std::runtime_error)
+
+      if( !rhs || rhs->get_number_of_elements() == 0 ){
+      	throw std::runtime_error( "Error: cgSolver::initialize : empty or NULL rhs provided" );
+      }
+    
+      // Result, x
+      // (allocated with the dimensions of rhs; its contents are set further down)
+
+      x_ = boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(rhs->get_dimensions()) );
+    
+    
+      // Initialize r,p,x
+      // (r starts as a copy of rhs and p as a copy of r; x is cleared only when no initial guess x0 is set)
+
+      r_ = boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(*rhs) );
+      p_ = boost::shared_ptr<ARRAY_TYPE>( new ARRAY_TYPE(*r_) );
+    
+      if( !this->get_x0().get() ){ // no starting image provided      
+	clear(x_.get());
+      }
+
+      // Apply preconditioning, twice (should change preconditioners to do this)
+      // (in-place on p, only when a preconditioner has been set)
+      
+      if( precond_.get() ) {	
+        precond_->apply( p_.get(), p_.get() );
+        precond_->apply( p_.get(), p_.get() );
+      }
+
+      rq0_ = real(dot( r_.get(), p_.get() )); // computed from rhs alone, before any x0 correction of r and p below
+
+      if (this->get_x0().get()){
+	// An initial guess was provided: it must match rhs in dimensions
+        if( !this->get_x0()->dimensions_equal( rhs )){
+          throw std::runtime_error( "Error: cgSolver::initialize : RHS and initial guess must have same dimensions" );
+        }
+	
+        *x_ = *(this->get_x0());
+        
+        ARRAY_TYPE mhmX( rhs->get_dimensions());
+
+        if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ) {
+          std::cout << "Preparing guess..." << std::endl;
+        }
+        
+        mult_MH_M( this->get_x0().get(), &mhmX );
+        // Subtract the mult_MH_M result for x0 from r, then restart p from the corrected r
+        *r_ -= mhmX;
+        *p_ = *r_;
+        
+        // Apply preconditioning, twice (should change preconditioners to do this)
+        // (repeated here because p was just overwritten with r)
+        
+        if( precond_.get() ){
+          precond_->apply( p_.get(), p_.get() );
+          precond_->apply( p_.get(), p_.get() );
+        }
+      }
+      
+      rq_ = real( dot( r_.get(), p_.get() )); // running value that iterate() reads and updates
+      
+      // Invoke termination callback initialization
+      // (cb_ is dereferenced unchecked; the constructor installs a default callback)
+    
+      cb_->initialize(this);
+    }
+  
+    // Clean up
+    //
+
+    virtual void deinitialize()
+    {
+      p_.reset();
+      r_.reset();
+      x_.reset();
+    }
+
+    // Perform full cg iteration
+    //
+
+    virtual void iterate( unsigned int iteration, REAL *tc_metric, bool *tc_terminate )
+    {
+      ARRAY_TYPE q = ARRAY_TYPE(x_->get_dimensions());
+
+      // Perform one iteration of the solver
+      // (q receives the result of mult_MH_M applied to the search direction p)
+
+      mult_MH_M( p_.get(), &q );
+    
+      // Update solution
+      // (step size alpha = rq / dot(p,q); axpy is defined elsewhere, presumably y += a*x, giving x += alpha*p)
+
+      alpha_ = rq_/dot( p_.get(), &q );
+      axpy( alpha_, p_.get(), x_.get());
+
+      // Update residual
+      // (r is updated with -alpha and q, using the same axpy helper)
+
+      axpy( -alpha_, &q, r_.get());
+
+      // Apply preconditioning
+      // (both branches rescale p by tmp_rq/rq, add a new direction term, then store tmp_rq in rq)
+
+      if( precond_.get() ){
+        // q is reused as scratch: it becomes the preconditioner applied twice to r
+        precond_->apply( r_.get(), &q );
+        precond_->apply( &q, &q );
+        
+        REAL tmp_rq = real(dot( r_.get(), &q ));      
+        *p_ *= ELEMENT_TYPE((tmp_rq/rq_));
+        axpy( ELEMENT_TYPE(1), &q, p_.get() );
+        rq_ = tmp_rq;
+      } 
+      else{
+        // No preconditioner: the residual itself is the new direction term
+        REAL tmp_rq = real(dot( r_.get(), r_.get()) );
+        *p_ *= ELEMENT_TYPE((tmp_rq/rq_));           
+        axpy( ELEMENT_TYPE(1), r_.get(), p_.get() );
+        rq_ = tmp_rq;      
+      }
+      
+      // Invoke termination callback iteration
+      // (the callback fills *tc_metric and *tc_terminate; a false return value is turned into an exception)
+
+      if( !cb_->iterate( iteration, tc_metric, tc_terminate ) ){
+        throw std::runtime_error( "Error: cgSolver::iterate : termination callback iteration failed" );
+      }    
+    }
+    
+    // Perform mult_MH_M of the encoding and regularization matrices
+    //
+
+    void mult_MH_M( ARRAY_TYPE *in, ARRAY_TYPE *out )
+    {
+      // Basic validity checks (this path is reachable without compute_rhs(), so the encoding operator is checked here too)
+      if( this->encoding_operator_.get() == 0 ){
+        throw std::runtime_error( "Error: cgSolver::mult_MH_M : no encoding operator is set" );
+      }
+      if( !in || !out ){
+        throw std::runtime_error( "Error: cgSolver::mult_MH_M : invalid input pointer(s)" );
+      }
+      if( in->get_number_of_elements() != out->get_number_of_elements() ){
+        throw std::runtime_error( "Error: cgSolver::mult_MH_M : array dimensionality mismatch" );
+      }
+    
+      // Intermediate storage
+      // (receives each operator's result before it is accumulated into out)
+
+      ARRAY_TYPE q = ARRAY_TYPE(in->get_dimensions());
+
+      // Start by clearing the output
+      //
+      clear(out);
+
+      // Apply encoding operator
+      // (its result is accumulated into out together with the operator's weight)
+
+      this->encoding_operator_->mult_MH_M( in, &q, false );
+      axpy( this->encoding_operator_->get_weight(), &q, out );
+
+      // Iterate over regularization operators
+      // (each one is accumulated into out the same way, with its own weight)
+
+      for( unsigned int i=0; i<this->regularization_operators_.size(); i++ ){      
+        this->regularization_operators_[i]->mult_MH_M( in, &q, false );
+        axpy( this->regularization_operators_[i]->get_weight(), &q, out );
+      }      
+    }
+    
+  protected:
+
+    // Preconditioner
+    boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+
+    // Termination criterion callback
+    boost::shared_ptr< cgTerminationCallback<ARRAY_TYPE> > cb_;
+
+    // Termination criterion threshold
+    REAL tc_tolerance_;
+
+    // Maximum number of iterations
+    unsigned int iterations_;
+
+    // Internal variables. 
+    REAL rq_;
+    REAL rq0_;
+    ELEMENT_TYPE alpha_;
+    boost::shared_ptr<ARRAY_TYPE> x_, p_, r_;
+  };
+}
diff --git a/toolboxes/solvers/cpu/CMakeLists.txt b/toolboxes/solvers/cpu/CMakeLists.txt
new file mode 100644
index 0000000..390abee
--- /dev/null
+++ b/toolboxes/solvers/cpu/CMakeLists.txt
@@ -0,0 +1,15 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_CPUSOLVERS__)
+endif (WIN32)
+
+include_directories(
+  ${CMAKE_SOURCE_DIR}/toolboxes/cpucore/
+  ${CMAKE_SOURCE_DIR}/toolboxes/cpucore/arma_math
+  )
+
+install(FILES 	
+  hoCgSolver.h
+  hoSbCgSolver.h
+  hoGpBbSolver.h
+  hoCgPreconditioner.h
+  DESTINATION include)
diff --git a/toolboxes/solvers/cpu/hoCgPreconditioner.h b/toolboxes/solvers/cpu/hoCgPreconditioner.h
new file mode 100644
index 0000000..fea5e38
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoCgPreconditioner.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "hoNDArray_operators.h"
+#include "cgPreconditioner.h"
+
+namespace Gadgetron{
+
+  template<class T> class hoCgPreconditioner : public cgPreconditioner< hoNDArray<T> >
+  {
+  public:    
+    hoCgPreconditioner() : cgPreconditioner< hoNDArray<T> >() {}
+    virtual ~hoCgPreconditioner() {}
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoCgSolver.h b/toolboxes/solvers/cpu/hoCgSolver.h
new file mode 100644
index 0000000..c1ad815
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoCgSolver.h
@@ -0,0 +1,30 @@
+/** \file hoCgSolver.h
+    \brief Instantiation of the conjugate gradient solver on the cpu.
+
+    The file hoCgSolver.h is a convenience wrapper for the device independent cgSolver class.
+    The class hoCgSolver instantiates the cgSolver for the hoNDArray
+    and the header otherwise includes other necessary header files.
+*/
+
+#pragma once
+
+#include "cgSolver.h"
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_blas.h"
+
+namespace Gadgetron{
+
+  /** \class hoCgSolver
+      \brief Instantiation of the conjugate gradient solver on the cpu.
+      
+      The class hoCgSolver is a convenience wrapper for the device independent cgSolver class.
+      hoCgSolver instantiates the cgSolver for type hoNDArray<T>.
+  */
+  template <class T> class hoCgSolver : public cgSolver< hoNDArray<T> >
+  {
+  public:
+    hoCgSolver() : cgSolver<hoNDArray<T> >() {}
+    virtual ~hoCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoGpBbSolver.h b/toolboxes/solvers/cpu/hoGpBbSolver.h
new file mode 100644
index 0000000..c664b64
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoGpBbSolver.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "gpBbSolver.h"
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_blas.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+  template <class T> class hoGpBbSolver : public gpBbSolver< hoNDArray<T> >
+  {
+  public:
+    hoGpBbSolver() : gpBbSolver< hoNDArray<T> >() {}
+    virtual ~hoGpBbSolver() {}
+
+    // Zero every entry of gdata whose real part is > 0 at positions where the real part of xdata is <= 0.
+    virtual void solver_non_negativity_filter(hoNDArray<T> *xdata, hoNDArray<T> *gdata)
+    {
+      typedef typename realType<T>::Type REAL;
+      T* x = xdata->get_data_ptr();
+      T* g = gdata->get_data_ptr();
+      // Element count read once into a signed 64-bit value: an int index overflows past INT_MAX elements, and older OpenMP needs a signed loop variable.
+      const long long num_elements = static_cast<long long>(xdata->get_number_of_elements());
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( long long i=0; i < num_elements; i++ )
+	if( (real(x[i]) <= REAL(0)) && (real(g[i]) > REAL(0)) )
+	  g[i]=T(0);
+    }
+  };
+}
diff --git a/toolboxes/solvers/cpu/hoSbCgSolver.h b/toolboxes/solvers/cpu/hoSbCgSolver.h
new file mode 100644
index 0000000..8d84929
--- /dev/null
+++ b/toolboxes/solvers/cpu/hoSbCgSolver.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "hoCgSolver.h"
+#include "sbSolver.h"
+
+#include "complext.h"
+
+namespace Gadgetron{
+
+  template <class T> class hoSbCgSolver : public sbSolver< hoNDArray<typename realType<T>::Type >, hoNDArray<T>, hoCgSolver<T> >
+  {
+  public:    
+    hoSbCgSolver() : sbSolver<hoNDArray<typename realType<T>::Type >, hoNDArray<T>, hoCgSolver<T> >() {}    
+    virtual ~hoSbCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpBbSolver.h b/toolboxes/solvers/gpBbSolver.h
new file mode 100644
index 0000000..c801c1d
--- /dev/null
+++ b/toolboxes/solvers/gpBbSolver.h
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "gpSolver.h"
+#include "real_utilities.h"
+#include "complext.h"
+#include "cgPreconditioner.h"
+#include <vector>
+#include <iostream>
+
+namespace Gadgetron{
+
+/* Using adaptive step size from Zhou et al, 2006, Computational Optimization and Applications,
+ * DOI: 10.1007/s10589-006-6446-0
+ */
+
+template <class ARRAY_TYPE> class gpBbSolver : public gpSolver<ARRAY_TYPE>
+{
+protected:
+	typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+	typedef typename realType<ELEMENT_TYPE>::Type REAL;
+	typedef ARRAY_TYPE ARRAY_CLASS;
+
+public:
+
+	gpBbSolver(): gpSolver<ARRAY_TYPE>() {
+		iterations_ = 10;
+		tc_tolerance_ = (REAL)1e-6;
+		non_negativity_constraint_=false;
+		dump_residual = false;
+		threshold= REAL(1e-8);
+	}
+
+	virtual ~gpBbSolver(){}
+
+	/** Iterative gradient solve; returns the final estimate x. Throws std::runtime_error if the encoding operator, its domain dimensions or the input is missing. */
+	virtual boost::shared_ptr<ARRAY_TYPE> solve(ARRAY_TYPE* in)
+    		{
+		if( this->encoding_operator_.get() == 0 ){
+			throw std::runtime_error("Error: gpBbSolver::solve : no encoding operator is set" );
+		}
+		if( !in ) throw std::runtime_error("Error: gpBbSolver::solve : NULL input array provided" );
+		// Get image space dimensions from the encoding operator
+		//
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error("Error: gpBbSolver::solve : encoding operator has not set domain dimension" );
+		}
+		// The solution is owned by a shared_ptr from the start, so it is released if anything below throws
+		boost::shared_ptr<ARRAY_TYPE> result(new ARRAY_TYPE);
+		ARRAY_TYPE * x = result.get();
+		x->create(image_dims.get());
+		ARRAY_TYPE x_old(image_dims.get());
+		// Gradient buffers are automatic objects; g and g_old only point at them and are swapped every iteration
+		ARRAY_TYPE g_buffer;
+		g_buffer.create(image_dims.get());
+		ARRAY_TYPE g_old_buffer;
+		g_old_buffer.create(image_dims.get());
+		ARRAY_TYPE *g = &g_buffer, *g_old = &g_old_buffer;
+		if (this->x0_.get()){
+			*x = *(this->x0_.get());
+		} else  {
+			clear(x);
+		}
+
+		ARRAY_TYPE encoding_space;
+		REAL reg_res = REAL(0), data_res = REAL(0); // both start defined: reg_res is otherwise only assigned on the first iteration without x0
+		encoding_space.create(in->get_dimensions().get());
+		if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+			std::cout << "Iterating..." << std::endl;
+		}
+		for (int i = 0; i < iterations_; i++){
+			if ((i==0) && (!this->x0_.get())){
+				clear(g);
+
+				this->encoding_operator_->mult_MH(in,g);
+				if (precond_.get()) {
+					precond_->apply(g,g);
+					precond_->apply(g,g);
+				}
+
+				*g *=  -this->encoding_operator_->get_weight();
+				data_res = real(dot(in,in));
+				reg_res=REAL(0);
+			} else {
+				this->encoding_operator_->mult_M(x,&encoding_space);
+				axpy(REAL(-1),in,&encoding_space);
+				data_res = real(dot(&encoding_space,&encoding_space));
+				this->encoding_operator_->mult_MH(&encoding_space,g);
+				if (precond_.get()) {
+					precond_->apply(g,g);
+					precond_->apply(g,g);
+				}
+				*g *=  this->encoding_operator_->get_weight();
+			}
+
+			this->add_gradient(x,g); // Adds the gradient from all the regularization operators
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+				std::cout << "Data residual: " << data_res << std::endl;
+			}
+
+			if (non_negativity_constraint_) solver_non_negativity_filter(x,g);
+			ELEMENT_TYPE nabla;
+			if (i==0){
+				ARRAY_TYPE tmp_encoding = *in;
+				this->encoding_operator_->mult_M(g,&tmp_encoding);
+				if (this->x0_.get()){
+					nabla = dot(&encoding_space,&tmp_encoding)/dot(&tmp_encoding,&tmp_encoding);
+				} else {
+					nabla = -dot(in,&tmp_encoding)/dot(&tmp_encoding,&tmp_encoding);
+				}
+			} else {
+				x_old -= *x;
+				*g_old -= *g;
+				ELEMENT_TYPE xx = dot(&x_old,&x_old);
+				ELEMENT_TYPE gx = dot(g_old,&x_old);
+
+				ELEMENT_TYPE nabla1 = xx/gx;
+
+				/* This is the code that enables the adaptive step size.
+	     REAL gg = dot(g_old,&x_old);
+	     REAL nabla2 = gx/gg;
+	     if ((nabla2/nabla1) < 0.5) nabla = nabla2;
+	     else nabla = nabla1;*/
+				nabla = nabla1;
+			}
+
+			ARRAY_TYPE * tmp;
+			tmp=g_old;
+			g_old=g;
+			g=tmp;
+
+			x_old = *x;
+			REAL grad_norm = nrm2(g_old);
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE,ARRAY_TYPE>::OUTPUT_VERBOSE ){
+				std::cout << "Iteration " <<i << ". Gradient norm: " <<  grad_norm << std::endl;
+			}
+			iteration_callback(x,i,data_res,reg_res);
+			axpy(-nabla,g_old,x);
+			if (non_negativity_constraint_) clamp_min(x,REAL(0));
+			if (grad_norm < tc_tolerance_)  break;
+		}
+		// Nothing to delete: g/g_old point at automatic buffers and x is owned by result
+
+		return result;
+    		}
+
+	// Set preconditioner
+	//
+	/*virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+      precond_ = precond;
+      }*/
+
+	// Set/get maximally allowed number of iterations
+	//
+	virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+	virtual unsigned int get_max_iterations() { return iterations_; }
+
+	// Set/get tolerance threshold for termination criterion
+	//
+	virtual void set_tc_tolerance( REAL tolerance ) { tc_tolerance_ = tolerance; }
+	virtual REAL get_tc_tolerance() { return tc_tolerance_; }
+
+	virtual void set_non_negativity_constraint(bool non_negativity_constraint){
+		non_negativity_constraint_=non_negativity_constraint;
+	}
+
+	virtual void set_dump_residual(bool dump_res){
+		dump_residual = dump_res;
+	}
+	// Set preconditioner
+	//
+
+	virtual void set_preconditioner( boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond ) {
+		precond_ = precond;
+	}
+
+protected:
+	typedef typename std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >::iterator  csIterator;
+	typedef typename std::vector< std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > > >::iterator csGroupIterator;
+
+	virtual void solver_non_negativity_filter(ARRAY_TYPE*,ARRAY_TYPE*)=0;
+	virtual void iteration_callback(ARRAY_TYPE*,int i,REAL,REAL){};
+
+protected:
+
+	// Preconditioner
+	//boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+	// Maximum number of iterations
+	unsigned int iterations_;
+	bool non_negativity_constraint_;
+	REAL tc_tolerance_;
+	REAL threshold;
+	bool dump_residual;
+	// Preconditioner
+	boost::shared_ptr< cgPreconditioner<ARRAY_TYPE> > precond_;
+};
+}
diff --git a/toolboxes/solvers/gpSolver.h b/toolboxes/solvers/gpSolver.h
new file mode 100644
index 0000000..5dee7d5
--- /dev/null
+++ b/toolboxes/solvers/gpSolver.h
@@ -0,0 +1,318 @@
+#pragma once
+
+#include "linearOperatorSolver.h"
+#include "real_utilities.h"
+#include "complext.h"
+
+#include <vector>
+#include <iostream>
+
+namespace Gadgetron{
+
+  /* Using adaptive step size from Zhou et al, 2006, Computational Optimization and Applications,
+   * DOI: 10.1007/s10589-006-6446-0
+   */
+
+  template <class ARRAY_TYPE> class gpSolver : public linearOperatorSolver<ARRAY_TYPE>
+  {
+  protected:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+
+  public:
+
+    virtual void set_domain_dimensions(std::vector<size_t> *dims ){
+      for (int i = 0;  i < operators.size(); i++) operators[i]->set_domain_dimensions(dims);
+    }
+    virtual ~gpSolver(){}
+
+    virtual void add_nonlinear_operator(boost::shared_ptr< generalOperator<ARRAY_TYPE> > op ){
+      operators.push_back(op);
+    }
+
+    virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op ){
+      add_regularization_operator(op,2);
+    }
+    virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, int L_norm ){
+      if (L_norm==1){
+        operators.push_back(boost::shared_ptr<gpRegularizationOperator>(new l1GPRegularizationOperator(op)));
+      }else{
+        operators.push_back(op);
+      }
+    }
+
+    virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE> > op, boost::shared_ptr<ARRAY_TYPE> prior, int L_norm=2 ){
+      if (L_norm==1){
+        operators.push_back(boost::shared_ptr<gpRegularizationOperator>(new l1GPRegularizationOperator(op,prior)));
+      }else{
+        operators.push_back(boost::shared_ptr<gpRegularizationOperator>(new l2GPRegularizationOperator(op,prior)));
+      }
+    }
+
+    virtual void add_regularization_group_operator ( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op )
+    {
+      current_group.push_back(op);
+    }
+
+    virtual void add_group(int L_norm=1)
+    {
+      if(current_group.size()==0){
+        throw std::runtime_error( "Error: gpBBSolver::add_group : no regularization group operators added" );
+      }
+      if (L_norm==2){
+        for (int i =0; i < current_group.size(); i++){
+          add_regularization_operator(current_group[i]);
+        }
+
+      } else {
+
+        boost::shared_ptr<gpRegularizationOperator> new_group(new l1GroupGPRegularizationOperator(current_group));
+        operators.push_back(new_group);
+      }
+      current_group = std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >();
+    }
+
+    virtual void add_group(boost::shared_ptr<ARRAY_TYPE> prior, int L_norm=1)
+    {
+      if(current_group.size()==0){
+        throw std::runtime_error( "Error: gpBBSolver::add_group : no regularization group operators added" );
+
+      }
+      if (L_norm==2){
+        for (int i =0; i < current_group.size(); i++){
+          add_regularization_operator(current_group[i],prior);
+        }
+
+      } else {
+
+        boost::shared_ptr<gpRegularizationOperator> new_group(new l1GroupGPRegularizationOperator(current_group,prior));
+        operators.push_back(new_group);
+      }
+      current_group = std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >();
+    }
+
+  protected:
+
+
+    virtual void add_gradient(ARRAY_TYPE* x, ARRAY_TYPE* g){
+      for (int i = 0; i < operators.size(); i++){
+        boost::shared_ptr<generalOperator<ARRAY_TYPE> > op = operators[i];
+        op->gradient(x,g,true);
+      }
+
+    }
+
+
+
+    class gpRegularizationOperator : public generalOperator<ARRAY_TYPE> {
+    public:
+      gpRegularizationOperator() : generalOperator<ARRAY_TYPE>(){
+      }
+
+      gpRegularizationOperator(std::vector<size_t> *dims): generalOperator<ARRAY_TYPE>(){this->set_domain_dimensions(dims);}
+      gpRegularizationOperator(
+                               boost::shared_ptr<ARRAY_TYPE> _prior): generalOperator<ARRAY_TYPE>(){
+        prior = _prior;
+      }
+
+      gpRegularizationOperator(boost::shared_ptr<ARRAY_TYPE> _prior,std::vector<size_t> *dims): generalOperator<ARRAY_TYPE>(){
+        prior = _prior;
+        this->set_domain_dimensions(dims);
+      }
+      // Both dims-taking constructors qualify the call with this->: the base class depends on
+      // ARRAY_TYPE, so an unqualified name is not looked up in it and would instead resolve to the
+      // enclosing solver's set_domain_dimensions(), which cannot be called without a solver object.
+    protected:
+
+      boost::shared_ptr<ARRAY_TYPE> prior; // optional prior array; null when constructed without one
+
+    };
+
+
+    class l2GPRegularizationOperator : public gpRegularizationOperator {
+    public:
+      l2GPRegularizationOperator(boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op){
+        op = _op;
+      }
+      l2GPRegularizationOperator(
+                                 boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op,
+                                 boost::shared_ptr<ARRAY_TYPE> _prior): gpRegularizationOperator(_prior){op = _op;}
+      virtual void gradient(ARRAY_TYPE* x, ARRAY_TYPE* g,bool accumulate=false){
+        if (this->prior.get()){
+          ARRAY_TYPE diff(*x); // automatic copy of x, released on every exit path
+          axpy(REAL(-1),this->prior.get(),&diff); // same axpy call as before, now on the automatic copy (intended: diff = x - prior)
+          op->gradient(&diff,g,accumulate); // the wrapped operator must see x - prior, not x
+        }
+        else op->gradient(x,g,accumulate); // no prior: forward x unchanged
+      }
+      // Applies op->mult_M to x (minus the prior when one is set) and returns sqrt(weight) * real(dot(tmp,tmp)).
+      virtual REAL magnitude(ARRAY_TYPE* x){
+        ARRAY_TYPE tmp(op->get_codomain_dimensions());
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+        else clear(&tmp);
+        op->mult_M(x2,&tmp);
+        if (this->prior.get()) delete x2; // frees the temporary copy made above
+        return std::sqrt(op->get_weight())*real(dot(&tmp,&tmp)); // NOTE(review): sqrt is applied to the weight, not the dot product - confirm intended
+      }
+    protected:
+      boost::shared_ptr<linearOperator<ARRAY_TYPE> > op;
+    };
+
+    class l1GPRegularizationOperator : public gpRegularizationOperator {
+    public:
+      l1GPRegularizationOperator(boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op){
+
+        op = _op;
+      }
+      l1GPRegularizationOperator(
+                                 boost::shared_ptr<linearOperator<ARRAY_TYPE> > _op,
+                                 boost::shared_ptr<ARRAY_TYPE> _prior): gpRegularizationOperator(_prior){op = _op;}
+
+
+      virtual void gradient(ARRAY_TYPE* x, ARRAY_TYPE* g, bool accumulate=false){
+        ARRAY_TYPE tmp(op->get_codomain_dimensions());
+        ARRAY_TYPE q(op->get_domain_dimensions());
+        ARRAY_TYPE* x2 = x;
+
+        if (!accumulate) clear(g);
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE;
+          *x2 = *x;
+          axpy(REAL(-1),this->prior.get(),x2);
+
+        }
+        op->mult_M(x2,&tmp);
+        sgn_inplace(&tmp);
+        op->mult_MH(&tmp,&q,false);
+        axpy(op->get_weight(),&q,g);
+        if (this->prior.get()) delete x2;
+      }
+
+      virtual REAL magnitude(ARRAY_TYPE* x){
+        ARRAY_TYPE tmp(op->get_codomain_dimensions());
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+        op->mult_M(x2,&tmp);
+        if (this->prior.get()) delete x2;
+        return op->get_weight()*asum(&tmp);
+      }
+
+
+      virtual void set_domain_dimensions(std::vector<size_t> *dims){
+        generalOperator<ARRAY_TYPE>::set_domain_dimensions(dims);
+        op->set_domain_dimensions(dims);
+        if (op->get_codomain_dimensions()->size() == 0){
+          std::cout << "WARNING: Codomain dimension not set. Setting to domain_dimension" << std::endl;
+          op->set_codomain_dimensions(dims);
+        }
+      }
+      boost::shared_ptr<linearOperator<ARRAY_TYPE> > op;
+    };
+
+    class l1GroupGPRegularizationOperator : public gpRegularizationOperator {
+    public:
+      l1GroupGPRegularizationOperator(std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >_ops){
+        group = _ops;
+        threshold = REAL(1e-8);
+
+      }
+      l1GroupGPRegularizationOperator(std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > >_ops,
+                                      boost::shared_ptr<ARRAY_TYPE> _prior): gpRegularizationOperator(_prior){
+        group = _ops;
+        threshold = REAL(1e-8);
+
+      }
+      virtual void gradient(ARRAY_TYPE* x, ARRAY_TYPE* g,bool accumulate=false){
+        std::vector<boost::shared_ptr<ARRAY_TYPE> > data;
+        ARRAY_TYPE gData(group.front()->get_codomain_dimensions());
+        clear(&gData);
+
+        if (!accumulate) clear(g);
+        ARRAY_TYPE* x2 = x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          boost::shared_ptr<ARRAY_TYPE> tmp(new ARRAY_TYPE(op->get_codomain_dimensions().get()));
+          op->mult_M(x2,tmp.get());
+          data.push_back(tmp);
+          ARRAY_TYPE tmp2 = *tmp;
+          tmp2 *= *tmp;
+          gData += tmp2;
+        }
+        if (this->prior.get()){
+          delete x2;
+        }
+        sqrt_inplace(&gData);
+        //REAL cost = group.front()->get_weight()*asum(&gData);
+        clamp_min(&gData,threshold);
+        reciprocal_inplace(&gData);
+
+        ARRAY_TYPE q(group.front()->get_domain_dimensions());
+
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          boost::shared_ptr<ARRAY_TYPE> tmp = data[i];
+          *tmp *= gData;
+          op->mult_MH(tmp.get(),&q,false);
+          axpy(op->get_weight(),&q,g);
+        }
+      }
+
+
+      virtual REAL magnitude(ARRAY_TYPE* x){
+        ARRAY_TYPE gData(group.front()->get_codomain_dimensions());
+        clear(&gData);
+        ARRAY_TYPE* x2 =x;
+        if (this->prior.get()){
+          x2 = new ARRAY_TYPE(*x);
+          *x2 -= *this->prior;
+        }
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          ARRAY_TYPE tmp(op->get_codomain_dimensions().get());
+          op->mult_M(x2,&tmp);
+          tmp *= tmp;
+          gData += tmp;
+        }
+        if (this->prior.get()) delete x2;
+        sqrt_inplace(&gData);
+        return group.front()->get_weight()*asum(&gData);
+      }
+
+      void set_threshold(REAL _threshold){
+        threshold = _threshold;
+      }
+
+    protected:
+
+      std::vector<boost::shared_ptr<linearOperator<ARRAY_TYPE> > > group;
+      REAL threshold;
+
+      virtual void set_domain_dimensions(std::vector<size_t> *dims){
+        generalOperator<ARRAY_TYPE>::set_domain_dimensions(dims);
+        for (int i = 0; i < group.size(); i++ ){
+          boost::shared_ptr<linearOperator<ARRAY_TYPE> > op = group[i];
+          op->set_domain_dimensions(dims);
+          if (op->get_codomain_dimensions()->size() == 0){
+            std::cout << "WARNING: Codomain dimension not set. Setting to domain_dimension" << std::endl;
+            op->set_codomain_dimensions(dims);
+          }
+        }
+      }
+    };
+
+    std::vector< boost::shared_ptr< generalOperator<ARRAY_TYPE> > > operators;
+    std::vector< boost::shared_ptr<linearOperator<ARRAY_TYPE> > >  current_group;
+  };
+}
diff --git a/toolboxes/solvers/gpu/CMakeLists.txt b/toolboxes/solvers/gpu/CMakeLists.txt
new file mode 100644
index 0000000..0a66de5
--- /dev/null
+++ b/toolboxes/solvers/gpu/CMakeLists.txt
@@ -0,0 +1,49 @@
+if (WIN32)
+  ADD_DEFINITIONS(-D__BUILD_GADGETRON_GPUSOLVERS__)
+endif (WIN32)
+
+if(WIN32)
+link_directories(${Boost_LIBRARY_DIRS})
+endif(WIN32)
+
+include_directories(
+  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_SOURCE_DIR}/toolboxes/core/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/operators/gpu
+  ${CMAKE_SOURCE_DIR}/toolboxes/solvers
+  )
+
+cuda_add_library(gpusolvers SHARED 
+    cuCgPreconditioner.h
+    cuCgSolver.h
+    cuGpBbSolver.h
+    cuLwSolver.h
+    cuSbcCgSolver.h
+    cuSbCgSolver.h
+    cuSbcLwSolver.h
+    cuSbLwSolver.h
+    gpusolvers_export.h
+    cuGpBbSolver.cu
+  )
+
+target_link_libraries(gpusolvers 
+  gpucore 
+  ${Boost_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  ${CUDA_CUBLAS_LIBRARIES} 
+  )
+
+install(TARGETS gpusolvers DESTINATION lib)
+
+install(FILES 	
+  cuSbCgSolver.h
+  cuSbcCgSolver.h
+  cuCgPreconditioner.h
+  cuLwSolver.h
+  cuSbLwSolver.h
+  cuSbcLwSolver.h
+  cuCgSolver.h
+  cuGpBbSolver.h
+  hoCuGpBbSolver.h
+  gpusolvers_export.h
+  DESTINATION include)
diff --git a/toolboxes/solvers/gpu/cuCgPreconditioner.h b/toolboxes/solvers/gpu/cuCgPreconditioner.h
new file mode 100644
index 0000000..808d9e2
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuCgPreconditioner.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cgPreconditioner.h"
+
+namespace Gadgetron{
+
+  template<class T> class cuCgPreconditioner : public cgPreconditioner< cuNDArray<T> >
+  {
+  public:    
+    cuCgPreconditioner() : cgPreconditioner< cuNDArray<T> >() {}
+    virtual ~cuCgPreconditioner() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuCgSolver.h b/toolboxes/solvers/gpu/cuCgSolver.h
new file mode 100644
index 0000000..071a8ec
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuCgSolver.h
@@ -0,0 +1,30 @@
+/** \file cuCgSolver.h
+    \brief Instantiation of the conjugate gradient solver on the gpu.
+
+    The file cuCgSolver.h is a convenience wrapper for the device independent cgSolver class.
+    The class cuCgSolver instantiates the cgSolver for the cuNDArray
+    and the header otherwise includes other necessary header files.
+*/
+
+#pragma once
+
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "cgSolver.h"
+
+namespace Gadgetron{
+  
+  /** \class cuCgSolver
+      \brief Instantiation of the conjugate gradient solver on the gpu.
+      
+      The class cuCgSolver is a convenience wrapper for the device independent cgSolver class.
+      cuCgSolver instantiates the cgSolver for type cuNDArray<T>.
+  */
+  template <class T> class cuCgSolver : public cgSolver< cuNDArray<T> >
+  {
+  public:    
+    cuCgSolver() : cgSolver<cuNDArray<T> >() {}
+    virtual ~cuCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuGpBbSolver.cu b/toolboxes/solvers/gpu/cuGpBbSolver.cu
new file mode 100644
index 0000000..6ab3765
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuGpBbSolver.cu
@@ -0,0 +1,40 @@
+#include "cuGpBbSolver.h"
+#include "complext.h"
+
+#define MAX_THREADS_PER_BLOCK 512
+
+using namespace Gadgetron;
+template <class T> __global__ void filter_kernel(T* x, T* g, int elements){ // generic overload: elements of x and g are compared directly
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x; // flat element index handled by this thread
+  if (idx < elements){ // threads whose index lies past the end do nothing
+    if ( x[idx] <= T(0) && g[idx] > 0) g[idx]=T(0); // zero g[idx] where x[idx] is non-positive and g[idx] is positive
+  }
+}
+
+template <class REAL> __global__ void filter_kernel(complext<REAL>* x, complext<REAL>* g, int elements){ // overload for complext elements
+  const int idx = blockIdx.y*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x; // flat element index handled by this thread
+  if (idx < elements){ // threads whose index lies past the end do nothing
+    if ( real(x[idx]) <= REAL(0) && real(g[idx]) > 0) g[idx].vec[0] = REAL(0); // same test as the generic overload, applied to real(); clears vec[0]
+    g[idx].vec[1]=REAL(0); // not guarded by the test above: vec[1] is cleared for every in-range element of g
+  }
+}
+
+// Runs filter_kernel over all elements of g (x and g are indexed identically), one thread per element, at most MAX_THREADS_PER_BLOCK threads per block.
+template <class T> void Gadgetron::cuGpBbSolver<T>::
+solver_non_negativity_filter(Gadgetron::cuNDArray<T>* x , Gadgetron::cuNDArray<T>* g)
+{
+  int elements = g->get_number_of_elements();
+  if (elements <= 0) return; // nothing to filter; a launch with zero threads per block would be an invalid configuration
+  int threadsPerBlock = std::min(elements,MAX_THREADS_PER_BLOCK);
+  dim3 dimBlock( threadsPerBlock);
+  int totalBlocksPerGrid = (elements+MAX_THREADS_PER_BLOCK-1)/MAX_THREADS_PER_BLOCK; // round up: truncating here skipped the last elements%MAX_THREADS_PER_BLOCK elements
+  dim3 dimGrid(totalBlocksPerGrid); // the kernel checks idx < elements, so the partially filled last block is safe
+  filter_kernel<typename realType<T>::Type><<<dimGrid,dimBlock>>>(x->get_data_ptr(),g->get_data_ptr(),elements);
+}
+
+
+template class EXPORTGPUSOLVERS Gadgetron::cuGpBbSolver<float>;
+template class EXPORTGPUSOLVERS Gadgetron::cuGpBbSolver<double>;
+
+template class EXPORTGPUSOLVERS Gadgetron::cuGpBbSolver< complext<float> >;
+template class EXPORTGPUSOLVERS Gadgetron::cuGpBbSolver< complext<double> >;
diff --git a/toolboxes/solvers/gpu/cuGpBbSolver.h b/toolboxes/solvers/gpu/cuGpBbSolver.h
new file mode 100644
index 0000000..27c57ec
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuGpBbSolver.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "gpBbSolver.h"
+#include "cuNDArray_operators.h"
+#include "cuNDArray_elemwise.h"
+#include "cuNDArray_blas.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "gpusolvers_export.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/transform.h>
+#include <thrust/functional.h>
+
+namespace Gadgetron{
+  
+  template <class T> class EXPORTGPUSOLVERS cuGpBbSolver : public gpBbSolver<cuNDArray<T> >
+  {
+  public:
+    
+    cuGpBbSolver() : gpBbSolver<cuNDArray<T> >() {}
+    virtual ~cuGpBbSolver() {}
+    
+    virtual void solver_non_negativity_filter(cuNDArray<T> *x,cuNDArray<T> *g);    
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuLwSolver.h b/toolboxes/solvers/gpu/cuLwSolver.h
new file mode 100644
index 0000000..5ad7d5a
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuLwSolver.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "lwSolver.h"
+#include "cuNDArray.h"
+#include "cuNDArray_blas.h"
+
+#include <iostream>
+
+namespace Gadgetron{
+
+  template <class T> class cuLwSolver
+    : public lwSolver<cuNDArray<T> >
+  {
+  public:
+  
+    cuLwSolver() : lwSolver< cuNDArray<T> >() { set_device(-1); }
+    virtual ~cuLwSolver() {}
+  
+    virtual bool set_device( int device )
+    { 
+      device_ = device;
+    
+      if( device<0 ){
+      
+	int old_device;  
+      
+	if( cudaGetDevice( &old_device ) != cudaSuccess ){
+	  std::cerr << "cuLwSolver::set_device: unable to get current device." << std::endl ;
+	  return false;
+	}
+      
+	device_ = old_device;
+      }
+    
+      return true;
+    }
+    
+  protected:
+    int device_;
+    int old_device_;
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuSbCgSolver.h b/toolboxes/solvers/gpu/cuSbCgSolver.h
new file mode 100644
index 0000000..7fa12c8
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbCgSolver.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "cuCgSolver.h"
+#include "sbSolver.h"
+
+#include "complext.h"
+
+namespace Gadgetron{
+
+  template <class T> class cuSbCgSolver : public sbSolver< cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >
+  {
+  public:    
+    cuSbCgSolver() : sbSolver<cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >() {}    
+    virtual ~cuSbCgSolver() {}
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuSbLwSolver.h b/toolboxes/solvers/gpu/cuSbLwSolver.h
new file mode 100644
index 0000000..c54c5e3
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbLwSolver.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "sbSolver.h"
+#include "cuLwSolver.h"
+#include "cuNDArray.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "ndarray_vector_td_utilities.h"
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron{ // the base sbSolver and the inner cuLwSolver live in this namespace, as does the sibling cuSbcLwSolver
+template <class T> class cuSbLwSolver
+  : public sbSolver<cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >
+{
+public:
+  cuSbLwSolver() : sbSolver< cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >() {
+    set_device(-1); 
+  }
+
+  virtual ~cuSbLwSolver() {}
+#include "cuSbSolver_macros.h"
+  
+protected:
+  int device_;
+  int old_device_;
+};
+}
diff --git a/toolboxes/solvers/gpu/cuSbcCgSolver.h b/toolboxes/solvers/gpu/cuSbcCgSolver.h
new file mode 100644
index 0000000..78ece19
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbcCgSolver.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "cuCgSolver.h"
+#include "sbcSolver.h"
+
+namespace Gadgetron{
+  
+  template <class T> class cuSbcCgSolver : public sbcSolver< cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >
+  {
+  public:    
+    cuSbcCgSolver() : sbcSolver<cuNDArray<typename realType<T>::Type >, cuNDArray<T>, cuCgSolver<T> >() {}
+    virtual ~cuSbcCgSolver() {}    
+  };
+}
diff --git a/toolboxes/solvers/gpu/cuSbcLwSolver.h b/toolboxes/solvers/gpu/cuSbcLwSolver.h
new file mode 100644
index 0000000..6dcc218
--- /dev/null
+++ b/toolboxes/solvers/gpu/cuSbcLwSolver.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "sbcSolver.h"
+#include "cuLwSolver.h"
+#include "cuNDArray.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+#include "ndarray_vector_td_utilities.h"
+#include "encodingOperatorContainer.h"
+
+namespace Gadgetron{
+template <class T> class cuSbcLwSolver // sbcSolver instantiated with cuNDArray storage and cuLwSolver as its third template argument
+  : public sbcSolver< cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >
+{
+public:
+  
+  cuSbcLwSolver() : sbcSolver<cuNDArray<typename realType<T>::Type>, cuNDArray<T>, cuLwSolver<T> >() {
+    set_device(-1); 
+  }
+
+  virtual ~cuSbcLwSolver() {}
+
+#include "cuSbSolver_macros.h"
+
+protected:
+  int device_;
+  int old_device_;
+};
+}
diff --git a/toolboxes/solvers/gpu/gpusolvers_export.h b/toolboxes/solvers/gpu/gpusolvers_export.h
new file mode 100644
index 0000000..1dc1c3e
--- /dev/null
+++ b/toolboxes/solvers/gpu/gpusolvers_export.h
@@ -0,0 +1,18 @@
+/** \file gpusolvers_export.h
+    \brief Required definitions for Windows, importing/exporting dll symbols 
+*/
+
+#ifndef GPUSOLVERS_EXPORT_H_
+#define GPUSOLVERS_EXPORT_H_
+
+#if defined (WIN32)
+#if defined (__BUILD_GADGETRON_GPUSOLVERS__) || defined (gpusolvers_EXPORTS)
+#define EXPORTGPUSOLVERS __declspec(dllexport)
+#else
+#define EXPORTGPUSOLVERS __declspec(dllimport)
+#endif
+#else
+#define EXPORTGPUSOLVERS
+#endif
+
+#endif /* GPUSOLVERS_EXPORT_H_ */
diff --git a/toolboxes/solvers/gpu/hoCuGpBbSolver.h b/toolboxes/solvers/gpu/hoCuGpBbSolver.h
new file mode 100644
index 0000000..75a7db9
--- /dev/null
+++ b/toolboxes/solvers/gpu/hoCuGpBbSolver.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "gpBbSolver.h"
+#include "hoNDArray_operators.h"
+#include "hoNDArray_elemwise.h"
+#include "hoNDArray_blas.h"
+#include "real_utilities.h"
+#include "vector_td_utilities.h"
+
+#ifdef USE_OMP
+#include <omp.h>
+#endif
+
+namespace Gadgetron{
+
+  template <class T> class hoCuGpBbSolver : public gpBbSolver< hoCuNDArray<T> >
+  {  
+  public:
+
+    hoCuGpBbSolver() : gpBbSolver< hoCuNDArray<T> >() {};
+    virtual ~hoCuGpBbSolver() {};
+        
+    virtual void solver_non_negativity_filter(hoCuNDArray<T> *xdata, hoCuNDArray<T> *gdata)
+    {
+      typedef typename realType<T>::Type REAL;
+
+      T* x = xdata->get_data_ptr();
+      T* g = gdata->get_data_ptr();
+
+#ifdef USE_OMP
+#pragma omp parallel for
+#endif
+      for( int i=0; i < xdata->get_number_of_elements(); i++ )
+        if( (real(x[i]) <= REAL(0)) && (real(g[i]) > 0) ) 
+          g[i]=T(0);
+    }
+  };
+}
diff --git a/toolboxes/solvers/linearOperatorSolver.h b/toolboxes/solvers/linearOperatorSolver.h
new file mode 100644
index 0000000..bbfbd7f
--- /dev/null
+++ b/toolboxes/solvers/linearOperatorSolver.h
@@ -0,0 +1,75 @@
+/** \file linearOperatorSolver.h
+    \brief Base class for all of Gadgetron's solvers operating on linear operators.
+*/
+
+#pragma once
+
+#include "solver.h"
+#include "linearOperator.h"
+
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class linearOperatorSolver : public solver<ARRAY_TYPE, ARRAY_TYPE>
+  {
+    
+  public:
+
+    // Constructor
+    linearOperatorSolver() : solver<ARRAY_TYPE,ARRAY_TYPE>() {}
+  
+    // Destructor
+    virtual ~linearOperatorSolver() {}
+
+    // Add encoding operator to solver (only one allowed)
+    virtual void set_encoding_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE> > op)
+    {
+      if( !op.get() ){
+        throw std::runtime_error( "Error: linearOperatorSolver::set_encoding_operator : NULL operator provided" );
+      }     
+      encoding_operator_ = op;    
+    }
+  
+    virtual boost::shared_ptr< linearOperator<ARRAY_TYPE> >
+    get_encoding_operator()
+    {
+      return encoding_operator_;
+    }  
+  
+    // Add linear operator to solver (in addition to the encoding operator)
+    virtual void add_regularization_operator( boost::shared_ptr< linearOperator< ARRAY_TYPE> > op)
+    {
+      if( !op.get() ){
+        throw std::runtime_error( "Error: linearOperatorSolver::add_regularization_operator : NULL operator provided" );
+      }    
+      regularization_operators_.push_back(op);
+    }
+  
+    virtual boost::shared_ptr< linearOperator< ARRAY_TYPE> >
+    get_regularization_operator( unsigned int i )
+    {
+      if( i >= get_number_of_regularization_operators() ){
+        throw std::runtime_error( "Error: linearOperatorSolver::get_regularization_operator : index out of range" );
+      }    
+      return regularization_operators_[i];
+    }  
+  
+    virtual unsigned int get_number_of_regularization_operators()
+    {
+      return regularization_operators_.size();
+    }
+    
+  protected:
+  
+    // Single encoding operator
+    boost::shared_ptr< linearOperator<ARRAY_TYPE> > encoding_operator_;
+  
+    // Vector of linear regularization operators
+    std::vector< boost::shared_ptr< linearOperator<ARRAY_TYPE> > > regularization_operators_;
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL;
+  };
+}
diff --git a/toolboxes/solvers/lwSolver.h b/toolboxes/solvers/lwSolver.h
new file mode 100644
index 0000000..e0cc9df
--- /dev/null
+++ b/toolboxes/solvers/lwSolver.h
@@ -0,0 +1,214 @@
+/*
+  An implementation of the "Generalized Landweber Solver" based on the paper
+  "Theory and methods related to the singular-function expansion and Landweber's iteration..."
+  by O.N. Strand, Siam J Numer, Anal. 1974;11(4):798-825.
+*/
+
+#pragma once
+
+#include "linearOperatorSolver.h"
+
+#include <vector>
+#include <iostream>
+
+namespace Gadgetron{
+
+  template <class ARRAY_TYPE> class lwSolver
+    : public linearOperatorSolver< ARRAY_TYPE>
+  {
+
+  protected:
+    typedef typename ARRAY_TYPE::element_type ELEMENT_TYPE;
+    typedef typename realType<ELEMENT_TYPE>::Type REAL; // realType's member typedef is spelled Type, as in the base linearOperatorSolver
+
+  public:
+
+    // Constructor
+    lwSolver() : linearOperatorSolver<ARRAY_TYPE>() {
+      iterations_ = 3;
+      alpha_ = REAL(1);
+    }
+  
+    // Destructor
+    virtual ~lwSolver() {}
+  
+    // Set/get maximally allowed number of iterations
+    virtual void set_max_iterations( unsigned int iterations ) { iterations_ = iterations; }
+    virtual unsigned int get_max_iterations() { return iterations_; }  
+
+    // Set/get alpha.
+    // Optimally set alpha to 1/(sigma^2), sigma being the largest singular value of the "sum of operators"
+    virtual void set_alpha( REAL alpha ) { alpha_ = alpha; }
+    virtual REAL get_alpha() { return alpha_; }  
+
+    // Inherited solver interface
+    virtual boost::shared_ptr<ARRAY_TYPE> solve( ARRAY_TYPE *b )
+    {   
+      // Initial validity checks
+      //
+
+      std::vector<unsigned int> image_dims = *this->encoding_operator_->get_domain_dimensions();
+
+      if( image_dims.size() == 0 ){
+	throw std::runtime_error("Error: lwSolver::solve : domain dimensions not set on encoding operator" );
+      }
+        
+      // Allocate solution array.
+      // Clear or set to x0 if provided
+      //
+
+      boost::shared_ptr<ARRAY_TYPE> x( new ARRAY_TYPE() );
+      if( this->get_x0().get() ){
+	*x = *(this->get_x0());
+      }
+      else{
+    	x->create( &image_dims );
+    	x->clear();
+      }    
+
+      ARRAY_TYPE x_prev;
+
+      // Main solver iteration loop
+      //
+    
+      for( unsigned int iteration=0; iteration<iterations_; iteration++ ){
+      
+	// Keep previous x for convergence reporting
+	// 
+
+	if( this->output_mode_ >= solver<ARRAY_TYPE, ARRAY_TYPE>::OUTPUT_VERBOSE ){
+	  x_prev = *x;
+
+	}
+      
+	// Compute residual image, i.e. A^T(b-Ax_k)
+	//
+      
+	boost::shared_ptr<ARRAY_TYPE> r = compute_residual_image( x.get(), b );
+
+	// Multiply residual with shaping matrix
+	//
+
+	boost::shared_ptr<ARRAY_TYPE> rr = apply_shaping_matrix( r.get() );
+
+	// Update x
+	//
+	axpy( get_alpha(), rr.get(), x.get() );
+      
+	if( this->output_mode_ >= solver<ARRAY_TYPE, ARRAY_TYPE>::OUTPUT_VERBOSE ){
+	  axpy( ELEMENT_TYPE(-1), x.get(), &x_prev );
+	  std::cout << " iteration: " << iteration << ", delta x: " << solver_asum(&x_prev) << std::endl;
+	}      
+      }
+    
+      return x;
+    }
+
+  protected:
+    virtual boost::shared_ptr<ARRAY_TYPE> compute_residual_image( ARRAY_TYPE *x, ARRAY_TYPE *b )
+    {    
+      // Allocate some temporary storage and the result array
+      //
+    
+
+      boost::shared_ptr<ARRAY_TYPE> res = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(x->get_dimensions()));
+
+      ARRAY_TYPE tmp_M(b->get_dimensions());
+      ARRAY_TYPE tmp_acc(b->get_dimensions());
+        
+      // Initialize accumulation buffer to b
+      tmp_acc = *b;
+    
+      // Apply encoding operator to current solution
+      this->encoding_operator_->mult_M( x, &tmp_M );
+    
+      // Find residual
+      axpy(REAL(-1), &tmp_M, &tmp_acc );
+    
+      // Adjoint residual    
+      this->encoding_operator_->mult_MH( &tmp_acc, res.get());
+      // Apply encoding operator weight
+      *res *= this->encoding_operator_->get_weight();
+    
+      return res;
+    }
+  
+    boost::shared_ptr<ARRAY_TYPE> apply_shaping_matrix( ARRAY_TYPE *r )
+    {
+      //
+      // Apply 6th order polynomial F(lambda) -- see paper referenced at top
+      //
+    
+      // The input residual r is modified (it is an internal implementation variable anyway)
+      //
+
+      // Memory allocation
+      std::vector<unsigned int> image_dims = *this->encoding_operator_->get_domain_dimensions();
+      boost::shared_ptr<ARRAY_TYPE> res = boost::shared_ptr<ARRAY_TYPE>(new ARRAY_TYPE(&image_dims ));
+    
+      // Handle 0th order   
+      *res = *r;
+      *res *=  REAL(31.5);
+
+      // Handle 1st order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(-315) );
+    
+      // Handle 2nd order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(1443.75) );
+
+      // Handle 3rd order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(-3465) );
+    
+      // Handle 4th order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(4504.5) );
+    
+      // Handle 5th order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(-3003) );
+    
+      // Handle 6th order
+      apply_shape_matrix_mult_MH_M( r, res.get(), REAL(804.375) );
+    
+      // Return result
+      return res;
+    }
+
+    // One term of the shaping polynomial. With S(r) = alpha * sum_k weight_k * MH_M_k(r), k running over the encoding
+    // operator and every regularization operator: adds w*S(r) to acc, then overwrites r with S(r) for the next order.
+    void apply_shape_matrix_mult_MH_M( ARRAY_TYPE *r, ARRAY_TYPE *acc, REAL w )
+    {
+      // Temporary storage: tmp_MH_M holds one operator's result, tmp_acc builds S(r)
+      std::vector<unsigned int> image_dims = *this->encoding_operator_->get_domain_dimensions();
+      ARRAY_TYPE tmp_MH_M(&image_dims), tmp_acc(&image_dims);
+      // Apply encoding operator
+      this->encoding_operator_->mult_MH_M( r, &tmp_MH_M );
+      // Accumulate for overall result
+      axpy(get_alpha()*w*this->encoding_operator_->get_weight(), &tmp_MH_M, acc );
+
+      // Start the intermediate S(r) with the encoding operator's contribution
+      tmp_acc = tmp_MH_M;
+      tmp_acc *= get_alpha()*this->encoding_operator_->get_weight();
+    
+      // Loop over regularization operators
+      for( unsigned int i=0; i<this->regularization_operators_.size(); i++){
+      
+	// Compute operator mult_MH_M
+	this->regularization_operators_[i]->mult_MH_M( r, &tmp_MH_M );
+      
+	// Accumulate for overall result
+	axpy(get_alpha()*w*this->regularization_operators_[i]->get_weight(), &tmp_MH_M, acc );
+
+	// Accumulate for intermediate S(r), scaled by this operator's own weight (was the encoding operator's weight)
+	axpy(get_alpha()*this->regularization_operators_[i]->get_weight(), &tmp_MH_M, &tmp_acc );
+      }
+    
+      // Update r (the caller's residual is deliberately modified)
+      *r = tmp_acc;
+    }
+  
+  protected:
+  
+    // Maximum number of iterations
+    unsigned int iterations_;
+    REAL alpha_;
+  };
+}
diff --git a/toolboxes/solvers/sbSolver.h b/toolboxes/solvers/sbSolver.h
new file mode 100644
index 0000000..a502f38
--- /dev/null
+++ b/toolboxes/solvers/sbSolver.h
@@ -0,0 +1,838 @@
+/*
+  An implementation of the "Generalized Split Bregman Algorithm" - sec. 3.2. of the paper
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+ */
+
+#pragma once
+
+#include "linearOperatorSolver.h"
+#include "vector_td_utilities.h"
+#include "encodingOperatorContainer.h"
+#include "identityOperator.h"
+
+#include <vector>
+#include <iostream>
+#include <set>
+
+namespace Gadgetron{
+
+template< class ARRAY_TYPE_REAL,
+class ARRAY_TYPE_ELEMENT,
+class INNER_SOLVER >
+
+class sbSolver : public linearOperatorSolver<ARRAY_TYPE_ELEMENT>
+{
+
+protected:
+
+	typedef typename ARRAY_TYPE_REAL::element_type REAL;
+	typedef typename ARRAY_TYPE_ELEMENT::element_type ELEMENT_TYPE;
+
+	class sbRegularizationOperator{
+
+	public:
+
+		sbRegularizationOperator() {}
+		sbRegularizationOperator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op) { reg_op=op; }
+		virtual ~sbRegularizationOperator(){}
+
+		virtual void initialize(REAL normalization_factor = REAL(1))
+		{
+			d_k = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(reg_op->get_codomain_dimensions()));
+			b_k = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(reg_op->get_codomain_dimensions()));
+			clear(d_k.get());
+			clear(b_k.get());
+			if(prior.get()){
+				p_M = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(reg_op->get_codomain_dimensions()));
+				reg_op->mult_M(prior.get(),p_M.get());
+				*p_M *= normalization_factor;
+			}
+		}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			*encoding_space = *d_k;
+			*encoding_space -= *b_k;
+			if(prior.get())
+				*encoding_space += *p_M;
+		}
+
+		virtual void deinitialize()
+		{
+			d_k.reset();
+			b_k.reset();
+			p_M.reset();
+		}
+
+		REAL get_weight(){ return reg_op->get_weight(); }
+		void set_weight(REAL weight){ reg_op->set_weight(weight); }
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT*) = 0;
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT*) = 0;
+
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return reg_op->get_codomain_dimensions();
+		}
+
+		virtual void set_prior(boost::shared_ptr<ARRAY_TYPE_ELEMENT> image){ prior=image; }
+
+		boost::shared_ptr< linearOperator< ARRAY_TYPE_ELEMENT> > reg_op;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> d_k;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> b_k;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> p_M;
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior;
+	};
+
+	class sbL1RegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL1RegularizationOperator(boost::shared_ptr< linearOperator< ARRAY_TYPE_ELEMENT> > op) : sbRegularizationOperator(op) {}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_ELEMENT tmp(*this->b_k);
+			this->reg_op->mult_M(u_k,&tmp,true);
+			if (this->prior.get())
+				tmp -= *(this->p_M);
+			shrink1(&tmp,REAL(1)/this->reg_op->get_weight(),this->d_k.get());
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			this->reg_op->mult_M(u_k,this->b_k.get(),true);
+			if (this->prior.get())
+				*(this->b_k) -= *(this->p_M);
+			shrink1(this->b_k.get(),REAL(1)/this->reg_op->get_weight(),this->d_k.get());
+			*this->b_k -= *this->d_k;
+		}
+	};
+
+
+	class sbL0RegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL0RegularizationOperator(boost::shared_ptr< linearOperator< ARRAY_TYPE_ELEMENT> > op,REAL _p = REAL(0.5)) : sbRegularizationOperator(op), p(_p) {}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_ELEMENT tmp(*this->b_k);
+			this->reg_op->mult_M(u_k,&tmp,true);
+			if (this->prior.get())
+				tmp -= *(this->p_M);
+			pshrink(&tmp,REAL(1)/this->reg_op->get_weight(),p,this->d_k.get());
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			this->reg_op->mult_M(u_k,this->b_k.get(),true);
+			if (this->prior.get())
+				*(this->b_k) -= *(this->p_M);
+			pshrink(this->b_k.get(),REAL(1)/this->reg_op->get_weight(),p,this->d_k.get());
+			*this->b_k -= *this->d_k;
+		}
+	protected:
+		REAL p;
+	};
+
+
+	class sbL1GroupRegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL1GroupRegularizationOperator(std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > group)
+	: sbRegularizationOperator()
+	{
+			op_cont = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >
+			(new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>);
+			for (int i = 0; i < group.size(); i++)
+				op_cont->add_operator(group[i]);
+			reg_ops = group;
+			this->reg_op = op_cont;
+	}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			for (int i=0; i < reg_ops.size(); i++){
+				ARRAY_TYPE_ELEMENT tmp(codom_dims,encoding_space->get_data_ptr()+op_cont->get_offset(i));
+				tmp = *d_ks[i];
+				tmp -= *b_ks[i];
+				if (this->prior.get())
+					tmp += *p_Ms[i];
+			}
+		}
+
+		virtual void initialize(REAL normalization_factor = REAL(1))
+		{
+			codom_dims = reg_ops.front()->get_codomain_dimensions();
+			d_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			b_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			if (this->prior.get())
+				p_Ms = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			for (int i=0; i<reg_ops.size(); i++){
+				d_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(d_ks[i].get());
+				b_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(b_ks[i].get());
+				if (this->prior.get()){
+					p_Ms[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+					reg_ops[i]->mult_M(this->prior.get(),p_Ms[i].get());
+					*p_Ms[i] *= normalization_factor;
+				}
+			}
+		}
+
+		virtual void deinitialize()
+		{
+			d_ks.clear();
+			b_ks.clear();
+			p_Ms.clear();
+		}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k) // group variant: b_ks are only read here, results go through shrinkd with d_ks[i] as last argument
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			ARRAY_TYPE_ELEMENT *tmp = new ARRAY_TYPE_ELEMENT[reg_ops.size()]; // NOTE(review): leaked if anything below throws before delete[]
+			for (int i=0; i<reg_ops.size(); i++) {
+				tmp[i] = *b_ks[i];
+				this->reg_ops[i]->mult_M(u_k,&tmp[i],true); // presumably 'true' makes mult_M add onto tmp[i] - confirm against linearOperator
+				if (this->prior.get())
+					tmp[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square<ELEMENT_TYPE>(&tmp[i]) : s_k += *abs_square<ELEMENT_TYPE>(&tmp[i]); // s_k sums abs_square over the whole group
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				shrinkd(&tmp[i],&s_k,REAL(1)/reg_ops[i]->get_weight(),d_ks[i].get()); // every operator is shrunk against the shared s_k
+			}
+			delete[] tmp;
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			for (int i=0; i<reg_ops.size(); i++) {
+				this->reg_ops[i]->mult_M(u_k,b_ks[i].get(),true);
+				if (this->prior.get())
+					*b_ks[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square(b_ks[i].get()) : s_k += *abs_square(b_ks[i].get());
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				shrinkd(b_ks[i].get(),&s_k,REAL(1)/reg_ops[i]->get_weight(),d_ks[i].get());
+				*b_ks[i] -= *d_ks[i];
+			}
+		}
+
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return reg_ops.front()->get_codomain_dimensions();
+		}
+
+	protected:
+		std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > reg_ops;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > d_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > b_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > p_Ms;
+		boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > op_cont;
+		boost::shared_ptr< std::vector<size_t> > codom_dims;
+	};
+
+
+	class sbL0GroupRegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+		sbL0GroupRegularizationOperator(std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > group, REAL _p = REAL(0.5))
+	: sbRegularizationOperator(), p(_p)
+	{
+			op_cont = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >
+			(new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>);
+			for (int i = 0; i < group.size(); i++)
+				op_cont->add_operator(group[i]);
+			reg_ops = group;
+			this->reg_op = op_cont;
+	}
+
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			for (int i=0; i < reg_ops.size(); i++){
+				ARRAY_TYPE_ELEMENT tmp(codom_dims,encoding_space->get_data_ptr()+op_cont->get_offset(i));
+				tmp = *d_ks[i];
+				tmp -= *b_ks[i];
+				if (this->prior.get())
+					tmp += *p_Ms[i];
+			}
+		}
+
+		virtual void initialize(REAL normalization_factor = REAL(1))
+		{
+			codom_dims = reg_ops.front()->get_codomain_dimensions();
+			d_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			b_ks = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			if (this->prior.get())
+				p_Ms = std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> >(reg_ops.size());
+			for (int i=0; i<reg_ops.size(); i++){
+				d_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(d_ks[i].get());
+				b_ks[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+				clear(b_ks[i].get());
+				if (this->prior.get()){
+					p_Ms[i] = boost::shared_ptr<ARRAY_TYPE_ELEMENT>(new ARRAY_TYPE_ELEMENT(codom_dims));
+					reg_ops[i]->mult_M(this->prior.get(),p_Ms[i].get());
+					*p_Ms[i] *= normalization_factor;
+				}
+			}
+		}
+
+		virtual void deinitialize()
+		{
+			d_ks.clear();
+			b_ks.clear();
+			p_Ms.clear();
+		}
+
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k) // group variant: b_ks are only read here, results go through pshrinkd with d_ks[i] as last argument
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			ARRAY_TYPE_ELEMENT *tmp = new ARRAY_TYPE_ELEMENT[reg_ops.size()]; // NOTE(review): leaked if anything below throws before delete[]
+			for (int i=0; i<reg_ops.size(); i++) {
+				tmp[i] = *b_ks[i];
+				this->reg_ops[i]->mult_M(u_k,&tmp[i],true); // presumably 'true' makes mult_M add onto tmp[i] - confirm against linearOperator
+				if (this->prior.get())
+					tmp[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square<ELEMENT_TYPE>(&tmp[i]) : s_k += *abs_square<ELEMENT_TYPE>(&tmp[i]); // s_k sums abs_square over the whole group
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				pshrinkd(&tmp[i],&s_k,REAL(1)/reg_ops[i]->get_weight(),p,d_ks[i].get()); // every operator is shrunk against the shared s_k, with exponent p
+			}
+			delete[] tmp;
+		}
+
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_REAL s_k(codom_dims);
+			for (int i=0; i<reg_ops.size(); i++) {
+				this->reg_ops[i]->mult_M(u_k,b_ks[i].get(),true);
+				if (this->prior.get())
+					*b_ks[i] -= *p_Ms[i];
+				(i==0) ? s_k = *abs_square(b_ks[i].get()) : s_k += *abs_square(b_ks[i].get());
+			}
+			sqrt_inplace(&s_k);
+			for (int i=0; i<reg_ops.size(); i++) {
+				pshrinkd(b_ks[i].get(),&s_k,REAL(1)/reg_ops[i]->get_weight(),p,d_ks[i].get());
+				*b_ks[i] -= *d_ks[i];
+			}
+		}
+
+		virtual boost::shared_ptr< std::vector<size_t> > get_codomain_dimensions(){
+			return reg_ops.front()->get_codomain_dimensions();
+		}
+
+	protected:
+		std::vector<boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > > reg_ops;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > d_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > b_ks;
+		std::vector< boost::shared_ptr<ARRAY_TYPE_ELEMENT> > p_Ms;
+		boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > op_cont;
+		boost::shared_ptr< std::vector<size_t> > codom_dims;
+		REAL p;
+	};
+
+	// Split Bregman state updates for a single L2-regularized operator.
+	class sbL2RegularizationOperator : public sbRegularizationOperator
+	{
+	public:
+
+		sbL2RegularizationOperator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op)
+			: sbRegularizationOperator(op)
+		{
+		}
+
+		// d_k is seeded with b_k, the operator applied to u_k is folded in (the
+		// `true` flag of mult_M is presumably "accumulate" -- TODO confirm), the
+		// prior term is removed when present and the result is scaled by 1/(1+weight).
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			ARRAY_TYPE_ELEMENT *dk = this->d_k.get();
+
+			*dk = *this->b_k;
+			this->reg_op->mult_M(u_k,dk,true);
+			if (this->prior.get())
+				*dk -= *this->p_M;
+			*dk *= REAL(1)/(1+this->reg_op->get_weight());
+		}
+
+		// Refreshes d_k and then sets b_k to weight * d_k.
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			update_dk(u_k);
+			*this->b_k = *this->d_k;
+			*this->b_k *= this->reg_op->get_weight();
+		}
+	};
+
+	// Non-negativity constraint expressed as a regularization term built on an identity operator.
+	class sbNonNegativityOperator : public sbRegularizationOperator
+	{
+	public:
+		sbNonNegativityOperator()
+			: sbRegularizationOperator()
+		{
+			boost::shared_ptr<identityOperator<ARRAY_TYPE_ELEMENT> > identity(new identityOperator<ARRAY_TYPE_ELEMENT>);
+			this->reg_op = identity;
+		}
+
+		// Runs the base-class initialization and then sizes the identity operator to the image.
+		// NOTE(review): sbSolver::initialize invokes initialize(normalization_factor) with a
+		// single argument through a base-class pointer, which does not appear to reach this
+		// two-argument overload -- confirm that the dimensions are set by some other path.
+		virtual void initialize(boost::shared_ptr< std::vector<size_t> > image_dims,
+				REAL normalization_factor = REAL(1))
+		{
+			sbRegularizationOperator::initialize( normalization_factor);
+
+			std::vector<size_t> *dims = image_dims.get();
+			this->reg_op->set_domain_dimensions(dims);
+			this->reg_op->set_codomain_dimensions(dims);
+		}
+
+		// Writes max(d_k,0) + b_k into the operator's slice of the encoding space.
+		virtual void update_encoding_space(ARRAY_TYPE_ELEMENT* encoding_space)
+		{
+			ARRAY_TYPE_ELEMENT &slice = *encoding_space;
+			slice = *this->d_k;
+			clamp_min(encoding_space,REAL(0));
+			slice += *this->b_k;
+		}
+
+		// d_k <- max(u_k - b_k, 0)
+		virtual void update_dk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			*this->d_k = *u_k;
+			*this->d_k -= *this->b_k;
+			clamp_min(this->d_k.get(),REAL(0));
+		}
+
+		// Refreshes d_k and then applies b_k <- b_k + d_k - u_k.
+		virtual void update_dk_bk(ARRAY_TYPE_ELEMENT* u_k)
+		{
+			update_dk(u_k);
+			*this->b_k += *this->d_k;
+			*this->b_k -= *u_k;
+		}
+	};
+
+	public:
+
+	// Constructor
+	//
+
+	// Defaults: image-space normalization, no termination tolerance, 10 outer and
+	// 1 inner iteration, a default-constructed inner solver, no non-negativity
+	// filter, and no warm start of the inner solver.
+	sbSolver()
+		: linearOperatorSolver<ARRAY_TYPE_ELEMENT>()
+		, normalization_mode_(SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY)
+		, tolerance_(REAL(0))
+		, outer_iterations_(10)
+		, inner_iterations_(1)
+		, num_reg_operators_(0)
+		, inner_solver_(new INNER_SOLVER())
+		, non_negativity_filter_weight_(REAL(0))
+		, use_x0_(false)
+	{
+	}
+
+	// Destructor
+	//
+
+	virtual ~sbSolver()
+	{
+	}
+
+	// Add regularization operator to group (for isotropic regularization)
+	//
+
+	// Queues `op` in the pending group; the group is turned into a regularization
+	// term by a later call to add_group(). Throws std::runtime_error for a NULL operator.
+	virtual void add_regularization_group_operator( boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op )
+	{
+		const bool have_operator = (op.get() != 0);
+		if( have_operator ){
+			current_group_.push_back(op);
+			return;
+		}
+		throw std::runtime_error( "Error: sbSolver::add_regularization_group_operator : NULL operator provided" );
+	}
+
+	// Add isotropic regularization group (multiple groups allowed)
+	//
+
+	// Turns the operators queued via add_regularization_group_operator() into
+	// regularization terms and empties the queue.
+	//
+	// L_norm == 2 : every queued operator becomes its own L2 term.
+	// L_norm == 1 : all queued operators form one L1 group term.
+	// L_norm == 0 : all queued operators form one L0 group term.
+	//
+	// Throws std::runtime_error when no operator has been queued or when L_norm
+	// is none of 0, 1, 2; in both cases the queue is left untouched.
+	virtual void add_group(int L_norm=1)
+	{
+		if(current_group_.empty()){
+			throw std::runtime_error( "Error: sbSolver::add_group : no regularization group operators added" );
+		}
+		if (L_norm==2){
+			for (size_t i=0; i<current_group_.size(); i++){
+				regularization_operators_.push_back(boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(current_group_[i])));
+			}
+		} else if (L_norm==0){
+			boost::shared_ptr<sbL0GroupRegularizationOperator> group(new sbL0GroupRegularizationOperator(current_group_));
+			regularization_operators_.push_back(group);
+		} else if (L_norm==1){
+			boost::shared_ptr<sbL1GroupRegularizationOperator> group(new sbL1GroupRegularizationOperator(current_group_));
+			regularization_operators_.push_back(group);
+		} else {
+			throw std::runtime_error("Illegal L-norm used in add_group");
+		}
+		current_group_.clear();
+	}
+
+	// Same as add_group(int), but additionally attaches `prior` to every
+	// regularization term that is created.
+	//
+	// L_norm == 2 : every queued operator becomes its own L2 term.
+	// L_norm == 1 : all queued operators form one L1 group term.
+	// L_norm == 0 : all queued operators form one L0 group term.
+	//
+	// Throws std::runtime_error when no operator has been queued or when L_norm
+	// is none of 0, 1, 2; in both cases the queue is left untouched.
+	virtual void add_group( boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior, int L_norm=1 )
+	{
+		if(current_group_.empty()){
+			throw std::runtime_error( "Error: sbSolver::add_group : no regularization group operators added" );
+		}
+		if (L_norm==2){
+			for (size_t i=0; i<current_group_.size(); i++){
+				regularization_operators_.push_back(boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(current_group_[i])));
+				regularization_operators_.back()->set_prior(prior);
+			}
+		} else if (L_norm==0){
+			boost::shared_ptr<sbL0GroupRegularizationOperator> group(new sbL0GroupRegularizationOperator(current_group_));
+			group->set_prior(prior);
+			regularization_operators_.push_back(group);
+		} else if (L_norm==1){
+			boost::shared_ptr<sbL1GroupRegularizationOperator> group(new sbL1GroupRegularizationOperator(current_group_));
+			group->set_prior(prior);
+			regularization_operators_.push_back(group);
+		} else {
+			throw std::runtime_error("Illegal L-norm used in add_group");
+		}
+		current_group_.clear();
+	}
+
+	// Registers a stand-alone regularization term for `op`.
+	// L_norm 1 selects the L1 wrapper, 0 the L0 wrapper; any other value selects L2.
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op, int L_norm=1 )
+	{
+		boost::shared_ptr<sbRegularizationOperator> wrapped;
+
+		switch (L_norm) {
+		case 1:
+			wrapped = boost::shared_ptr<sbL1RegularizationOperator>(new sbL1RegularizationOperator(op));
+			break;
+		case 0:
+			wrapped = boost::shared_ptr<sbL0RegularizationOperator>(new sbL0RegularizationOperator(op));
+			break;
+		default:
+			wrapped = boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(op));
+			break;
+		}
+
+		regularization_operators_.push_back(wrapped);
+	}
+
+	// Registers a stand-alone regularization term for `op` and attaches `prior` to it.
+	// L_norm 1 selects the L1 wrapper, 0 the L0 wrapper; any other value selects L2.
+	virtual void add_regularization_operator(boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op, boost::shared_ptr<ARRAY_TYPE_ELEMENT> prior, int L_norm=1 )
+	{
+		boost::shared_ptr<sbRegularizationOperator> wrapped;
+
+		switch (L_norm) {
+		case 1:
+			wrapped = boost::shared_ptr<sbL1RegularizationOperator>(new sbL1RegularizationOperator(op));
+			break;
+		case 0:
+			wrapped = boost::shared_ptr<sbL0RegularizationOperator>(new sbL0RegularizationOperator(op));
+			break;
+		default:
+			wrapped = boost::shared_ptr<sbL2RegularizationOperator>(new sbL2RegularizationOperator(op));
+			break;
+		}
+
+		// Register first, then attach the prior to the registered entry.
+		regularization_operators_.push_back(wrapped);
+		regularization_operators_.back()->set_prior(prior);
+	}
+
+	// Specify normalization mode.
+	// The default mode is to use image space normalization.
+
+	// Selects how normalize_data() treats the input data before solving.
+	enum SB_normalization_mode{
+		// Leave the data untouched (normalization factor 1).
+		SB_NO_NORMALIZATION,
+		// Scale the data based on the encoding operator's adjoint applied to it; constructor default.
+		SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY
+	};
+
+	// Chooses the normalization applied to the data at the start of solve().
+	virtual void set_normalization_mode( SB_normalization_mode mode )
+	{
+		this->normalization_mode_ = mode;
+	}
+
+	// Set termination criterion tolerance
+	//
+
+	// Sets the threshold on the outer-loop change of u_k (l2-norm) below which
+	// core() stops iterating. The default of 0 never triggers early termination.
+	// A negative value is rejected with a warning and the current setting is kept.
+	virtual void set_tc_tolerance( REAL tolerance )
+	{
+		if( tolerance < REAL(0) )
+			this->solver_warning( "Warning: sbSolver::set_tc_tolerance : tolerance cannot be negative. Ignored." );
+		else tolerance_ = tolerance;
+	}
+
+	// Weight of the non-negativity term; a value greater than zero makes
+	// initialize() add an sbNonNegativityOperator for the duration of a solve.
+	virtual void set_non_negativity_filter(REAL nnf)
+	{
+		this->non_negativity_filter_weight_ = nnf;
+	}
+
+	// Set/get maximum number of outer Split-Bregman iterations
+	//
+
+	virtual void set_max_outer_iterations( unsigned int iterations )
+	{
+		this->outer_iterations_ = iterations;
+	}
+
+	virtual unsigned int get_max_outer_iterations()
+	{
+		return this->outer_iterations_;
+	}
+
+	// Set/get maximum number of inner Split-Bregman iterations
+	//
+
+	virtual void set_max_inner_iterations( unsigned int iterations )
+	{
+		this->inner_iterations_ = iterations;
+	}
+
+	virtual unsigned int get_max_inner_iterations()
+	{
+		return this->inner_iterations_;
+	}
+
+	// When enabled, core() passes the current u_k to the inner solver as its starting estimate.
+	virtual void set_use_inner_x0(bool use)
+	{
+		this->use_x0_ = use;
+	}
+
+	// Get the inner solver
+	//
+
+	virtual boost::shared_ptr<INNER_SOLVER> get_inner_solver()
+	{
+		return this->inner_solver_;
+	}
+
+	// Provide the user an option to access u_k right after its update.
+	//
+
+	// Hook called by core() with the inner solver's result before it is copied
+	// into u_k. The default does nothing; core() does not inspect the return value.
+	virtual bool post_linear_solver_callback( ARRAY_TYPE_ELEMENT* )
+	{
+		return true;
+	}
+
+	//
+	// Main solver interface
+	//
+
+	// Runs the Split Bregman iterations on the data `_f` and returns the solution
+	// in the encoding operator's domain.
+	//
+	// `_f` is not modified; the solver works on a normalized copy. The starting
+	// estimate is x0 when one has been set, otherwise zero.
+	//
+	// Throws std::runtime_error when the solver is not set up consistently (see
+	// validate_solver()) or when `_f` is NULL. When the iterations themselves
+	// throw, the solver state set up by initialize() is torn down before the
+	// exception is passed on, so that a later solve() starts from a clean state.
+	virtual boost::shared_ptr<ARRAY_TYPE_ELEMENT> solve( ARRAY_TYPE_ELEMENT *_f )
+	{
+		// Check that operators etc. have been provided and consistent in dimensionality
+		//
+		validate_solver();
+
+		if( !_f ){
+			throw std::runtime_error( "Error: sbSolver::solve : NULL data array provided" );
+		}
+
+		// Define u_k
+		//
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> u_k( new ARRAY_TYPE_ELEMENT(this->encoding_operator_->get_domain_dimensions()) );
+
+		// Use x0 (if provided) as starting solution estimate
+		//
+		if( this->get_x0().get() )
+			*u_k = *(this->get_x0());
+		else
+			clear(u_k.get());
+
+		// Normalize and _then_ initialize (the order matters)
+		boost::shared_ptr<ARRAY_TYPE_ELEMENT> f(new ARRAY_TYPE_ELEMENT(*_f));
+		REAL normalization_factor = normalize_data( f.get() );
+		initialize( normalization_factor );
+
+		// Invoke the core solver. initialize() may have registered a temporary
+		// non-negativity operator; make sure it is removed again even on failure.
+		//
+		try {
+			core( tolerance_, outer_iterations_, inner_iterations_, f, u_k);
+		}
+		catch (...) {
+			deinitialize();
+			throw;
+		}
+
+		// Clean up memory occupied by the operator container and inner solver
+		deinitialize();
+
+		// Undo normalization
+		*u_k /= normalization_factor;
+
+		// ... and return the result
+		//
+		return u_k;
+	}
+
+	protected:
+
+	//
+	// Everything beyond this point is internal to the implementation
+	// and not intended to be exposed as a public interface
+	//
+
+	// Validate operator
+	//
+
+	// Throws std::runtime_error unless an encoding operator is set and reports
+	// non-empty domain and codomain dimensions.
+	virtual void validate_encoding_operator()
+	{
+		boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > enc_op = this->get_encoding_operator();
+		if( enc_op.get() == 0 ){
+			throw std::runtime_error( "Error: sbSolver::validate_encoding_operator : operator not set" );
+		}
+
+		boost::shared_ptr< std::vector<size_t> > domain_dims = enc_op->get_domain_dimensions();
+		if( domain_dims->empty() ){
+			throw std::runtime_error( "Error: sbSolver::validate_encoding_operator : encoding operator must have specified domain dimensions" );
+		}
+
+		boost::shared_ptr< std::vector<size_t> > codomain_dims = enc_op->get_codomain_dimensions();
+		if( codomain_dims->empty() ){
+			throw std::runtime_error( "Error: sbSolver::validate_encoding_operator : encoding operator must have specified codomain dimensions" );
+		}
+	}
+
+	// Validate regularization operator
+	//
+
+	// Checks every registered regularization term against the image dimensions.
+	//
+	// image_dims : domain dimensions of the encoding operator.
+	//
+	// Throws std::runtime_error when image_dims is empty, when a term carries no
+	// operator, or when an operator's domain dimensions differ from image_dims.
+	virtual void validate_regularization_operators( std::vector<size_t> *image_dims )
+	{
+		if( image_dims->size() == 0 ){
+			throw std::runtime_error( "Error: sbSolver::validate_regularization_operators : empty dimensions vector provided" );
+		}
+
+		for( unsigned int i=0; i<this->regularization_operators_.size(); i++ ){
+
+			boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op = regularization_operators_[i]->reg_op;
+
+			// The operator must be checked before it is queried for its dimensions.
+			if( !op.get() ){
+				throw std::runtime_error( "Error: sbSolver::validate_regularization_operators : invalid operator provided" );
+			}
+
+			boost::shared_ptr< std::vector<size_t> > op_dims = op->get_domain_dimensions();
+
+			if( *op_dims != *image_dims ){
+				throw std::runtime_error( "Error: sbSolver::validate_regularization_operators : operator domain dimensions mismatch between encoding and regularization operators" );
+			}
+		}
+	}
+
+	// Check that the solver is set up properly
+	// Pre-flight check run by solve(): first the encoding operator, then every
+	// regularization operator against the encoding operator's domain dimensions.
+	virtual void validate_solver()
+	{
+		validate_encoding_operator();
+
+		boost::shared_ptr< std::vector<size_t> > image_dims = this->encoding_operator_->get_domain_dimensions();
+		std::vector<size_t> *dims = image_dims.get();
+		validate_regularization_operators(dims);
+	}
+
+	// Initialize solver
+	// Prepares the solver for one run of core().
+	//
+	// - When the non-negativity filter weight is positive, a temporary
+	//   sbNonNegativityOperator is appended to the regularization terms
+	//   (deinitialize() removes it again).
+	// - A fresh operator container is handed to the inner solver; the encoding
+	//   operator is added first, followed by one operator per regularization term,
+	//   in registration order.
+	// - Every regularization term is initialized with the normalization factor.
+	virtual void initialize( REAL normalization_factor = REAL(1) )
+	{
+		// Get image dimensions
+		boost::shared_ptr< std::vector<size_t> > image_dims =
+				this->encoding_operator_->get_domain_dimensions();
+
+		if (non_negativity_filter_weight_ > REAL(0)){
+			regularization_operators_.push_back(boost::shared_ptr<sbNonNegativityOperator>(new sbNonNegativityOperator));
+			regularization_operators_.back()->set_weight(non_negativity_filter_weight_);
+
+			// The generic initialize(normalization_factor) call below carries no
+			// image dimensions, so the identity operator behind the non-negativity
+			// term has to be sized here, before the term is initialized.
+			regularization_operators_.back()->reg_op->set_domain_dimensions(image_dims.get());
+			regularization_operators_.back()->reg_op->set_codomain_dimensions(image_dims.get());
+		}
+
+		// Set up inner solver
+		//
+
+		enc_op_container_ = boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> >( new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>() );
+		inner_solver_->set_encoding_operator( enc_op_container_ );
+		enc_op_container_->add_operator( this->encoding_operator_ );
+
+		// Invoke initialization on all regularization operators
+		//
+
+		for (size_t i=0; i < regularization_operators_.size(); i++){
+			regularization_operators_[i]->initialize(normalization_factor);
+			enc_op_container_->add_operator( regularization_operators_[i]->reg_op );
+		}
+	}
+
+	// Clean up operator memory in the inner solver
+	// Also restore the weights we temporarily changed
+
+	// Counterpart of initialize(): hands the inner solver a fresh, empty operator
+	// container, deinitializes every regularization term and drops the temporary
+	// non-negativity term when the filter weight is positive.
+	virtual void deinitialize()
+	{
+		boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > empty_container( new encodingOperatorContainer<ARRAY_TYPE_ELEMENT>);
+		enc_op_container_ = empty_container;
+		inner_solver_->set_encoding_operator( enc_op_container_ );
+
+		const int num_terms = regularization_operators_.size();
+		for (int term=0; term < num_terms; ++term){
+			regularization_operators_[term]->deinitialize();
+		}
+
+		const bool filter_active = non_negativity_filter_weight_ > REAL(0);
+		if (filter_active){
+			regularization_operators_.pop_back();
+		}
+	}
+
+	// The core of the Split Bregman solver.
+	//
+
+	// Runs the nested Split Bregman loops and leaves the result in *u_k.
+	//
+	// tolerance        : when > 0, the outer loop stops once the l2-norm of the
+	//                    change in u_k over one outer iteration drops below it.
+	// outer_iterations : maximum number of outer iterations.
+	// inner_iterations : number of inner iterations per outer iteration.
+	// f                : the (normalized) data.
+	// u_k              : starting estimate on entry, updated in place.
+	//
+	// Each inner iteration stacks f and the regularization terms' contributions
+	// into one data vector, runs the inner solver on it, copies the result into
+	// u_k and then updates d_k (and, in the last inner iteration, also b_k) on
+	// every regularization term.
+	virtual void core( REAL tolerance, unsigned int outer_iterations, unsigned int inner_iterations,
+			boost::shared_ptr<ARRAY_TYPE_ELEMENT> f,
+			boost::shared_ptr<ARRAY_TYPE_ELEMENT> u_k )
+	{
+		// Image space dimensions
+		// NOTE(review): image_dims is not used further down in this function.
+		boost::shared_ptr< std::vector<size_t> > image_dims =
+				this->encoding_operator_->get_domain_dimensions();
+
+		// Keep a copy of the "previous" u_k to compute the outer loop change of u_k
+		// (only needed for the tolerance check and for verbose output)
+		//
+
+		ARRAY_TYPE_ELEMENT u_k_prev;
+		if( tolerance > REAL(0) || this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+			u_k_prev = *u_k;
+		}
+
+		//
+		// Outer loop
+		//
+
+		for( unsigned int outer_iteration=0; outer_iteration<outer_iterations; outer_iteration++ ) {
+
+			if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_MAX )
+				std::cout << std::endl << "SB outer loop iteration " << outer_iteration << std::endl << std::endl;
+
+			//
+			// Inner loop
+			//
+
+			for( unsigned int inner_iteration=0; inner_iteration<inner_iterations; inner_iteration++ ) {
+
+				if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_MAX )
+					std::cout << std::endl << "SB inner loop iteration " << inner_iteration << std::endl << std::endl;
+
+				{ // Brackets used to free 'data' below as soon as it goes out of scope
+
+					// Setup input vector to the encoding operator container (argument to the inner solver's solve)
+					//
+
+					// 'tmp' is constructed on top of data's buffer, so assigning to it
+					// fills the leading segment of 'data' -- assumes this constructor
+					// wraps the given pointer rather than copying; TODO confirm.
+					ARRAY_TYPE_ELEMENT data(enc_op_container_->get_codomain_dimensions());
+					ARRAY_TYPE_ELEMENT tmp(f->get_dimensions().get(), data.get_data_ptr() );
+
+					tmp = *f;
+
+					// Next add the regularization operators' data, d_k - b_k
+					//
+
+					// Operator 0 of the container is the encoding operator (added first
+					// in initialize()), hence regularization term i lives at offset i+1.
+					for( unsigned int i=0; i< regularization_operators_.size(); i++ ){
+						boost::shared_ptr<sbRegularizationOperator > op = regularization_operators_[i];
+						tmp.create( op->get_codomain_dimensions(), data.get_data_ptr()+enc_op_container_->get_offset(i+1) );
+						op->update_encoding_space(&tmp);
+					}
+
+					// Solve for u_k
+					//
+
+					{
+						// Optionally warm-start the inner solver from the current estimate
+						if (use_x0_){
+							get_inner_solver()->set_x0(u_k);
+						}
+
+						boost::shared_ptr<ARRAY_TYPE_ELEMENT> tmp_u_k =
+								get_inner_solver()->solve( &data );
+
+						// Invoke the post inner solver callback (its return value is not inspected)
+						post_linear_solver_callback( tmp_u_k.get() );
+
+						// Compute change in u_k
+						// (u_k temporarily holds the difference; it is overwritten right below)
+						if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+							*u_k -= *tmp_u_k;
+							std::cout << "u_k delta l2-norm (inner loop): " << nrm2(u_k.get()) << std::endl;
+						}
+
+						// Update u_k
+						*u_k = *tmp_u_k;
+					}
+				}
+
+				// Update d_k (and b_k in final inner iteration)
+				//
+
+				for( unsigned int i=0; i< regularization_operators_.size(); i++ ){
+					boost::shared_ptr<sbRegularizationOperator > op = regularization_operators_[i];
+					if( inner_iteration < inner_iterations-1 )
+						op->update_dk(u_k.get());
+					else
+						op->update_dk_bk(u_k.get());
+				}
+			} // end of inner loop
+
+			// Output change in u_k
+			if( tolerance > REAL(0) || this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+				// u_k_prev <- u_k - u_k_prev, computed in place
+				u_k_prev *= ELEMENT_TYPE(-1);
+				u_k_prev += *u_k;
+				REAL delta = nrm2(&u_k_prev);
+
+				if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE )
+					std::cout << "u_k delta l2-norm (outer loop): " << delta << std::endl << std::endl;
+
+				// Converged: the estimate changed by less than the tolerance
+				if( delta < tolerance )
+					break;
+
+				u_k_prev = *u_k;
+			}
+		} // end of outer loop
+	}
+
+	// Scales the data `f` in place according to the normalization mode and
+	// returns the scale factor that was applied (1 when nothing was changed).
+	//
+	// In SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY mode the factor is the number of
+	// image elements divided by asum() of the adjoint encoding operator applied
+	// to f. When that sum is not positive the factor would not be finite, so the
+	// data is left untouched, a warning is issued and 1 is returned.
+	virtual REAL normalize_data( ARRAY_TYPE_ELEMENT *f )
+	{
+		REAL image_scale = REAL(1);
+
+		if( normalization_mode_ == SB_NORMALIZE_TO_IMAGE_SPACE_IDENTITY ){
+
+			//
+			// Normalize to an average energy of "one intensity unit per image element"
+			//
+
+			boost::shared_ptr< linearOperator<ARRAY_TYPE_ELEMENT> > op = this->encoding_operator_;
+			ARRAY_TYPE_ELEMENT tmp( op->get_domain_dimensions() );
+			op->mult_MH( f, &tmp );
+			REAL sum = asum( &tmp );
+
+			if( sum > REAL(0) ){
+				image_scale = REAL(tmp.get_number_of_elements())/sum;
+				*f *= image_scale;
+			}
+			else{
+				this->solver_warning( "Warning: sbSolver::normalize_data : data has no energy in image space. Normalization skipped." );
+			}
+		}
+
+		return image_scale;
+	}
+
+	protected:
+	// How normalize_data() scales the input data.
+	SB_normalization_mode normalization_mode_;
+	// Outer-loop termination threshold on the change of u_k; 0 by default.
+	REAL tolerance_;
+	// Iteration counts handed to core().
+	unsigned int outer_iterations_, inner_iterations_;
+	// NOTE(review): set to 0 in the constructor and not used elsewhere in the visible code.
+	unsigned int num_reg_operators_;
+	// Registered regularization terms, in registration order.
+	std::vector< boost::shared_ptr<sbRegularizationOperator> > regularization_operators_;
+	// Operators queued by add_regularization_group_operator() until add_group() is called.
+	std::vector< boost::shared_ptr<linearOperator<ARRAY_TYPE_ELEMENT> > > current_group_;
+	// Solver invoked on the stacked system in every inner iteration.
+	boost::shared_ptr<INNER_SOLVER> inner_solver_;
+	// Container stacking the encoding operator and the regularization operators; rebuilt per solve.
+	boost::shared_ptr<encodingOperatorContainer<ARRAY_TYPE_ELEMENT> > enc_op_container_;
+	// NOTE(review): not used in the visible code.
+	std::vector<unsigned int> weights_backup_;
+	// Weight of the optional non-negativity term; the term is only added when > 0.
+	REAL non_negativity_filter_weight_;
+	// Whether core() warm-starts the inner solver with the current u_k.
+	bool use_x0_;
+};
+}
diff --git a/toolboxes/solvers/sbcSolver.h b/toolboxes/solvers/sbcSolver.h
new file mode 100644
index 0000000..689dfd9
--- /dev/null
+++ b/toolboxes/solvers/sbcSolver.h
@@ -0,0 +1,96 @@
+/*
+  An implementation of the constrained solver of the paper
+  "The Split Bregman Method for L1-Regularized Problems" by Tom Goldstein and Stanley Osher. 
+  Siam J. Imaging Sciences. Vol. 2, No. 2, pp. 323-343.
+*/
+
+#pragma once
+
+#include "sbSolver.h"
+
+namespace Gadgetron{
+
+  // Constrained Split Bregman solver.
+  //
+  // Reuses the set-up, normalization and core() of sbSolver and wraps core() in
+  // an additional outer loop that feeds the data residual back into the data
+  // term (f_k) after every pass.
+  template<class ARRAY_TYPE_REAL,
+	   class ARRAY_TYPE_ELEMENT, 
+	   class INNER_SOLVER>
+  class sbcSolver : public sbSolver<ARRAY_TYPE_REAL, ARRAY_TYPE_ELEMENT, INNER_SOLVER>
+  {
+  protected:
+    typedef typename ARRAY_TYPE_REAL::element_type REAL;
+    
+  public:
+  
+    sbcSolver() : sbSolver<ARRAY_TYPE_REAL, ARRAY_TYPE_ELEMENT, INNER_SOLVER>() {}
+    virtual ~sbcSolver() {}
+    
+    // Solves for the image given the data `_f` and returns it.
+    // `_f` itself is not modified; the solver works on normalized copies.
+    virtual boost::shared_ptr<ARRAY_TYPE_ELEMENT> solve( ARRAY_TYPE_ELEMENT *_f )
+    {
+      // Check if everything is set up right
+      //
+      this->validate_solver();
+
+      // Define u_k
+      //
+      boost::shared_ptr<ARRAY_TYPE_ELEMENT> u_k( new ARRAY_TYPE_ELEMENT(this->encoding_operator_->get_domain_dimensions()));
+
+      // Use x0 (if provided) as starting estimate
+      if(this->get_x0().get())
+	*u_k = *(this->get_x0());
+      else 
+	clear(u_k.get());
+
+
+      // Normalize and _then_ initialize (the order matters)
+      //
+      
+      // f keeps the normalized data; f_k is the working copy that is updated
+      // after every outer iteration.
+      boost::shared_ptr<ARRAY_TYPE_ELEMENT> f(new ARRAY_TYPE_ELEMENT(*_f));
+      REAL normalization_factor = this->normalize_data( f.get() );
+      boost::shared_ptr<ARRAY_TYPE_ELEMENT> f_k(new ARRAY_TYPE_ELEMENT(*f));
+      this->initialize( normalization_factor );
+        
+      // Outer loop
+      //
+
+      for( unsigned int outer_iteration=0; outer_iteration<this->outer_iterations_; outer_iteration++ ) {
+      
+	if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_MAX )
+	  std::cout << std::endl << "SBC outer loop iteration " << outer_iteration << std::endl << std::endl;
+	
+	// Invoke the core solver
+	// (inner_iterations_ is passed as core()'s outer iteration count and
+	// core()'s own inner iteration count is fixed to 1)
+	//
+	
+	this->core( this->tolerance_, this->inner_iterations_, 1, f_k, u_k );
+
+	// Update f_k
+	// (encoded_image holds the residual E*u_k - f)
+	//
+
+	ARRAY_TYPE_ELEMENT encoded_image(f->get_dimensions());
+	this->encoding_operator_->mult_M( u_k.get(), &encoded_image );
+	encoded_image -= *f;
+
+	if( this->tolerance_ > REAL(0) || this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE ){
+	
+	  REAL delta = nrm2(&encoded_image);
+	
+	  if( this->output_mode_ >= solver<ARRAY_TYPE_ELEMENT, ARRAY_TYPE_ELEMENT>::OUTPUT_VERBOSE )
+	    std::cout << "Squared residual norm (outer loop): " << delta*delta << std::endl << std::endl;
+	  
+	  // Stop once the residual norm is below the tolerance
+	  if( delta < this->tolerance_ )
+	    break;
+	}
+
+	// f_k <- f_k - (E*u_k - f)
+	*f_k -= encoded_image;
+      
+      } // end of outer loop
+        
+      // Clean up memory occupied by the operator container and inner solver
+      this->deinitialize();
+
+      // Undo normalization
+      *u_k /= normalization_factor;
+      
+      // ... and return the result
+      return u_k;
+    }  
+  };
+}
diff --git a/toolboxes/solvers/solver.h b/toolboxes/solvers/solver.h
new file mode 100644
index 0000000..a7df7f5
--- /dev/null
+++ b/toolboxes/solvers/solver.h
@@ -0,0 +1,51 @@
+/** \file solver.h
+    \brief Base class for all Gadgetron solvers.
+*/
+
+#pragma once
+
+#include <boost/smart_ptr.hpp>
+#include <string>
+#include <iostream>
+
+namespace Gadgetron
+{
+
+  /**
+     \brief Abstract base class of all solvers.
+
+     ARRAY_TYPE_IN is the type of the data handed to solve(), ARRAY_TYPE_OUT the
+     type of the solution (and of the optional starting estimate x0).
+  */
+  template <class ARRAY_TYPE_IN, class ARRAY_TYPE_OUT> class solver
+  {
+  public:
+
+    // Constructor/destructor
+    solver() { output_mode_ = OUTPUT_SILENT; }
+    virtual ~solver() {}
+  
+    // Output modes, ordered by increasing verbosity
+    enum solverOutputModes { OUTPUT_SILENT = 0, OUTPUT_WARNINGS = 1, OUTPUT_VERBOSE = 2, OUTPUT_MAX = 3 };
+  
+    // Set/get output mode
+    virtual int get_output_mode() { return output_mode_; }
+
+    // Accepts every mode from OUTPUT_SILENT up to and including OUTPUT_MAX;
+    // any other value is ignored and the current mode is kept.
+    virtual void set_output_mode( int output_mode ) {
+      if( output_mode >= OUTPUT_SILENT && output_mode <= OUTPUT_MAX )
+	output_mode_ = output_mode;
+    }
+  
+    // Set/get starting solution/estimate for solver
+    virtual void set_x0( boost::shared_ptr<ARRAY_TYPE_OUT> x0 ){ x0_ = x0; }
+    virtual boost::shared_ptr<ARRAY_TYPE_OUT> get_x0(){ return x0_; }
+
+    // Prints the warning to standard output, regardless of the output mode
+    virtual void solver_warning(std::string warn){
+      std::cout << warn << std::endl;
+    }
+
+    // Invoke solver
+    virtual boost::shared_ptr<ARRAY_TYPE_OUT> solve( ARRAY_TYPE_IN* ) = 0;
+ 
+    // Class-specific allocation: storage comes from ::new char[] and is released
+    // with the matching delete[]; the two-argument form is placement new.
+    void* operator new(size_t bytes) { return ::new char[bytes]; }
+    void* operator new(size_t s, void * p) { return p; }
+    void operator delete(void *ptr) { delete[] static_cast<char*> (ptr); }
+
+  protected:
+    int output_mode_;                        // one of solverOutputModes
+    boost::shared_ptr<ARRAY_TYPE_OUT> x0_;   // optional starting estimate; empty when unset
+  };
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gadgetron.git



More information about the debian-med-commit mailing list